Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 17608, column 31
Division by zero

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-04-16-131055-16441-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
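A hedged usage sketch for the errorUnsupported helper above (a hypothetical caller, not part of this file; the subtarget check, message text, and return value are illustrative): the lowering code reports the diagnostic and then recovers with a placeholder instead of crashing.

    // Diagnose the unsupported case, then keep lowering with a safe
    // placeholder value rather than aborting (illustrative only).
    if (!Subtarget.hasSSE2()) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      return DAG.getUNDEF(VT); // recover with an undef result
    }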
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
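A hedged sketch of what addBypassSlowDiv(32, 8) asks for (illustrative only; the actual rewrite is performed later by the slow-division bypass transform, not by this constructor): each 32-bit divide gets a cheap width test and uses an 8-bit divide when both operands fit.

    // Conceptually, a 32-bit divide is guarded like this when bypassing is
    // enabled; values that fit in 8 bits take the cheap narrow divide.
    uint32_t bypassedDiv(uint32_t a, uint32_t b) {
      if (((a | b) & 0xFFFFFF00u) == 0)           // both operands fit in 8 bits
        return uint32_t(uint8_t(a) / uint8_t(b)); // cheap 8-bit divide
      return a / b;                               // full-width divide
    }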
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Signed saturation subtraction.
233 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
234 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
235 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
238
239 // Funnel shifts.
240 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
241 // For slow shld targets we only lower for code size.
242 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
243
244 setOperationAction(ShiftOp , MVT::i8 , Custom);
245 setOperationAction(ShiftOp , MVT::i16 , Custom);
246 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
247 if (Subtarget.is64Bit())
248 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
249 }
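A hedged illustration of the funnel-shift operation being configured above (not part of this file): ISD::FSHL concatenates its first operand (high bits) with its second (low bits), shifts left by the amount modulo the bit width, and keeps the high half, which is what x86's SHLD computes.

    // 32-bit funnel shift left; s % 32 == 0 simply returns x.
    uint32_t fshl32(uint32_t x, uint32_t y, uint32_t s) {
      s &= 31;
      return s ? (x << s) | (y >> (32 - s)) : x;
    }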
250
251 if (!Subtarget.useSoftFloat()) {
252 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
253 // operation.
254 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
255 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
256 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
257 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
258 // We have an algorithm for SSE2, and we turn this into a 64-bit
259 // FILD or VCVTUSI2SS/SD for other targets.
260 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
261 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
262 // We have an algorithm for SSE2->double, and we turn this into a
263 // 64-bit FILD followed by conditional FADD for other targets.
264 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
265 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
266
267 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
268 // this operation.
269 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
271 // SSE has no i16 to fp conversion, only i32. We promote in the handler
272 // to allow f80 to use i16 and f64 to use i16 with sse1 only
273 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
275 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
276 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
277 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
278 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
279 // are Legal, f80 is custom lowered.
280 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
281 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
282
283 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
284 // this operation.
285 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
286 // FIXME: This doesn't generate invalid exception when it should. PR44019.
287 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
288 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
289 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
290 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
291 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
292 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
293 // are Legal, f80 is custom lowered.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
295 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
296
297 // Handle FP_TO_UINT by promoting the destination to a larger signed
298 // conversion.
299 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
300 // FIXME: This doesn't generate invalid exception when it should. PR44019.
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
302 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
303 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
305 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
306 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
307 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
308 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
309
310 setOperationAction(ISD::LRINT, MVT::f32, Custom);
311 setOperationAction(ISD::LRINT, MVT::f64, Custom);
312 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
313 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
314
315 if (!Subtarget.is64Bit()) {
316 setOperationAction(ISD::LRINT, MVT::i64, Custom);
317 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
318 }
319 }
320
321 if (Subtarget.hasSSE2()) {
322 // Custom lowering for saturating float to int conversions.
323 // We handle promotion to larger result types manually.
324 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
325 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
326 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
327 }
328 if (Subtarget.is64Bit()) {
329 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
330 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
331 }
332 }
333
334 // Handle address space casts between mixed sized pointers.
335 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
337
338 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
339 if (!Subtarget.hasSSE2()) {
340 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
341 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
342 if (Subtarget.is64Bit()) {
343 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
344 // Without SSE, i64->f64 goes through memory.
345 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
346 }
347 } else if (!Subtarget.is64Bit())
348 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
349
350 // Scalar integer divide and remainder are lowered to use operations that
351 // produce two results, to match the available instructions. This exposes
352 // the two-result form to trivial CSE, which is able to combine x/y and x%y
353 // into a single instruction.
354 //
355 // Scalar integer multiply-high is also lowered to use two-result
356 // operations, to match the available instructions. However, plain multiply
357 // (low) operations are left as Legal, as there are single-result
358 // instructions for this in x86. Using the two-result multiply instructions
359 // when both high and low results are needed must be arranged by dagcombine.
360 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
361 setOperationAction(ISD::MULHS, VT, Expand);
362 setOperationAction(ISD::MULHU, VT, Expand);
363 setOperationAction(ISD::SDIV, VT, Expand);
364 setOperationAction(ISD::UDIV, VT, Expand);
365 setOperationAction(ISD::SREM, VT, Expand);
366 setOperationAction(ISD::UREM, VT, Expand);
367 }
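A hedged illustration of the two-result point in the comment above (not part of this file): when a quotient and a remainder share the same operands, expanding to the two-result divrem form lets CSE fold them into one divide.

    // Both results typically come from a single x86 idiv, which produces the
    // quotient (EAX) and the remainder (EDX) at the same time.
    void quotRem(int x, int y, int &q, int &r) {
      q = x / y;
      r = x % y;
    }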
368
369 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
370 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
371 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
372 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 setOperationAction(ISD::BR_CC, VT, Expand);
374 setOperationAction(ISD::SELECT_CC, VT, Expand);
375 }
376 if (Subtarget.is64Bit())
377 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
378 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
379 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
380 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
381
382 setOperationAction(ISD::FREM , MVT::f32 , Expand);
383 setOperationAction(ISD::FREM , MVT::f64 , Expand);
384 setOperationAction(ISD::FREM , MVT::f80 , Expand);
385 setOperationAction(ISD::FREM , MVT::f128 , Expand);
386
387 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
388 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
389 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
390 }
391
392 // Promote the i8 variants and force them on up to i32 which has a shorter
393 // encoding.
394 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
395 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
396 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
397 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
398 // promote that too.
399 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
400 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
401
402 if (!Subtarget.hasBMI()) {
403 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
405 if (Subtarget.is64Bit()) {
406 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
407 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
408 }
409 }
410
411 if (Subtarget.hasLZCNT()) {
412 // When promoting the i8 variants, force them to i32 for a shorter
413 // encoding.
414 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
415 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
416 } else {
417 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
418 if (VT == MVT::i64 && !Subtarget.is64Bit())
419 continue;
420 setOperationAction(ISD::CTLZ , VT, Custom);
421 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
422 }
423 }
424
425 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
426 ISD::STRICT_FP_TO_FP16}) {
427 // Special handling for half-precision floating point conversions.
428 // If we don't have F16C support, then lower half float conversions
429 // into library calls.
430 setOperationAction(
431 Op, MVT::f32,
432 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
433 // There's never any support for operations beyond MVT::f32.
434 setOperationAction(Op, MVT::f64, Expand);
435 setOperationAction(Op, MVT::f80, Expand);
436 setOperationAction(Op, MVT::f128, Expand);
437 }
438
439 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
440 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
441 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
442 setTruncStoreAction(VT, MVT::f16, Expand);
443 setTruncStoreAction(VT, MVT::bf16, Expand);
444
445 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
446 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
447 }
448
449 setOperationAction(ISD::PARITY, MVT::i8, Custom);
450 setOperationAction(ISD::PARITY, MVT::i16, Custom);
451 setOperationAction(ISD::PARITY, MVT::i32, Custom);
452 if (Subtarget.is64Bit())
453 setOperationAction(ISD::PARITY, MVT::i64, Custom);
454 if (Subtarget.hasPOPCNT()) {
455 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
456 // popcntw is longer to encode than popcntl and also has a false dependency
457 // on the dest that popcntl hasn't had since Cannon Lake.
458 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
459 } else {
460 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
461 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
462 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
463 if (Subtarget.is64Bit())
464 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
465 else
466 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
467 }
468
469 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
470
471 if (!Subtarget.hasMOVBE())
472 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
473
474 // X86 wants to expand cmov itself.
475 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
476 setOperationAction(ISD::SELECT, VT, Custom);
477 setOperationAction(ISD::SETCC, VT, Custom);
478 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
479 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
480 }
481 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
482 if (VT == MVT::i64 && !Subtarget.is64Bit())
483 continue;
484 setOperationAction(ISD::SELECT, VT, Custom);
485 setOperationAction(ISD::SETCC, VT, Custom);
486 }
487
488 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
489 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
490 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
491
492 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
493 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
494 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
495 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
496 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
497 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
498 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
499 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
500
501 // Darwin ABI issue.
502 for (auto VT : { MVT::i32, MVT::i64 }) {
503 if (VT == MVT::i64 && !Subtarget.is64Bit())
504 continue;
505 setOperationAction(ISD::ConstantPool , VT, Custom);
506 setOperationAction(ISD::JumpTable , VT, Custom);
507 setOperationAction(ISD::GlobalAddress , VT, Custom);
508 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
509 setOperationAction(ISD::ExternalSymbol , VT, Custom);
510 setOperationAction(ISD::BlockAddress , VT, Custom);
511 }
512
513 // 64-bit shl, sra, srl (iff 32-bit x86)
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
517 setOperationAction(ISD::SHL_PARTS, VT, Custom);
518 setOperationAction(ISD::SRA_PARTS, VT, Custom);
519 setOperationAction(ISD::SRL_PARTS, VT, Custom);
520 }
521
522 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
523 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
524
525 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
526
527 // Expand certain atomics
528 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
529 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
530 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
531 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
532 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
533 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
534 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
535 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
536 }
537
538 if (!Subtarget.is64Bit())
539 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
540
541 if (Subtarget.canUseCMPXCHG16B())
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
543
544 // FIXME - use subtarget debug flags
545 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
546 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
547 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
548 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
549 }
550
551 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
552 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
553
554 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
555 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
556
557 setOperationAction(ISD::TRAP, MVT::Other, Legal);
558 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
559 if (Subtarget.isTargetPS())
560 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
561 else
562 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
563
564 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
565 setOperationAction(ISD::VASTART , MVT::Other, Custom);
566 setOperationAction(ISD::VAEND , MVT::Other, Expand);
567 bool Is64Bit = Subtarget.is64Bit();
568 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
569 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
570
571 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
572 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
575
576 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
577 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
578 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
579
580 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
581
582 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
583 setOperationAction(ISD::FABS, VT, Action);
584 setOperationAction(ISD::FNEG, VT, Action);
585 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
586 setOperationAction(ISD::FREM, VT, Action);
587 setOperationAction(ISD::FMA, VT, Action);
588 setOperationAction(ISD::FMINNUM, VT, Action);
589 setOperationAction(ISD::FMAXNUM, VT, Action);
590 setOperationAction(ISD::FMINIMUM, VT, Action);
591 setOperationAction(ISD::FMAXIMUM, VT, Action);
592 setOperationAction(ISD::FSIN, VT, Action);
593 setOperationAction(ISD::FCOS, VT, Action);
594 setOperationAction(ISD::FSINCOS, VT, Action);
595 setOperationAction(ISD::FSQRT, VT, Action);
596 setOperationAction(ISD::FPOW, VT, Action);
597 setOperationAction(ISD::FLOG, VT, Action);
598 setOperationAction(ISD::FLOG2, VT, Action);
599 setOperationAction(ISD::FLOG10, VT, Action);
600 setOperationAction(ISD::FEXP, VT, Action);
601 setOperationAction(ISD::FEXP2, VT, Action);
602 setOperationAction(ISD::FCEIL, VT, Action);
603 setOperationAction(ISD::FFLOOR, VT, Action);
604 setOperationAction(ISD::FNEARBYINT, VT, Action);
605 setOperationAction(ISD::FRINT, VT, Action);
606 setOperationAction(ISD::BR_CC, VT, Action);
607 setOperationAction(ISD::SETCC, VT, Action);
608 setOperationAction(ISD::SELECT, VT, Custom);
609 setOperationAction(ISD::SELECT_CC, VT, Action);
610 setOperationAction(ISD::FROUND, VT, Action);
611 setOperationAction(ISD::FROUNDEVEN, VT, Action);
612 setOperationAction(ISD::FTRUNC, VT, Action);
613 };
614
615 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
616 // f16, f32 and f64 use SSE.
617 // Set up the FP register classes.
618 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
619 : &X86::FR16RegClass);
620 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
621 : &X86::FR32RegClass);
622 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
623 : &X86::FR64RegClass);
624
625 // Disable f32->f64 extload as we can only generate this in one instruction
626 // under optsize. So it's easier to pattern match (fpext (load)) for that
627 // case instead of needing to emit 2 instructions for extload in the
628 // non-optsize case.
629 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 // Use ANDPD to simulate FABS.
633 setOperationAction(ISD::FABS, VT, Custom);
634
635 // Use XORP to simulate FNEG.
636 setOperationAction(ISD::FNEG, VT, Custom);
637
638 // Use ANDPD and ORPD to simulate FCOPYSIGN.
639 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
640
641 // These might be better off as horizontal vector ops.
642 setOperationAction(ISD::FADD, VT, Custom);
643 setOperationAction(ISD::FSUB, VT, Custom);
644
645 // We don't support sin/cos/fmod
646 setOperationAction(ISD::FSIN , VT, Expand);
647 setOperationAction(ISD::FCOS , VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 }
650
651 // Half type will be promoted by default.
652 setF16Action(MVT::f16, Promote);
653 setOperationAction(ISD::FADD, MVT::f16, Promote);
654 setOperationAction(ISD::FSUB, MVT::f16, Promote);
655 setOperationAction(ISD::FMUL, MVT::f16, Promote);
656 setOperationAction(ISD::FDIV, MVT::f16, Promote);
657 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
658 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
659 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
660
661 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
687 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
689
690 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
691 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
692
693 // Lower this to MOVMSK plus an AND.
694 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
695 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
696
697 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
698 (UseX87 || Is64Bit)) {
699 // Use SSE for f32, x87 for f64.
700 // Set up the FP register classes.
701 addRegisterClass(MVT::f32, &X86::FR32RegClass);
702 if (UseX87)
703 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
704
705 // Use ANDPS to simulate FABS.
706 setOperationAction(ISD::FABS , MVT::f32, Custom);
707
708 // Use XORP to simulate FNEG.
709 setOperationAction(ISD::FNEG , MVT::f32, Custom);
710
711 if (UseX87)
712 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
713
714 // Use ANDPS and ORPS to simulate FCOPYSIGN.
715 if (UseX87)
716 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
717 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
718
719 // We don't support sin/cos/fmod
720 setOperationAction(ISD::FSIN , MVT::f32, Expand);
721 setOperationAction(ISD::FCOS , MVT::f32, Expand);
722 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
723
724 if (UseX87) {
725 // Always expand sin/cos functions even though x87 has an instruction.
726 setOperationAction(ISD::FSIN, MVT::f64, Expand);
727 setOperationAction(ISD::FCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
729 }
730 } else if (UseX87) {
731 // f32 and f64 in x87.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
735
736 for (auto VT : { MVT::f32, MVT::f64 }) {
737 setOperationAction(ISD::UNDEF, VT, Expand);
738 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
739
740 // Always expand sin/cos functions even though x87 has an instruction.
741 setOperationAction(ISD::FSIN , VT, Expand);
742 setOperationAction(ISD::FCOS , VT, Expand);
743 setOperationAction(ISD::FSINCOS, VT, Expand);
744 }
745 }
746
747 // Expand FP32 immediates into loads from the stack, save special cases.
748 if (isTypeLegal(MVT::f32)) {
749 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
750 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
751 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
752 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
753 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
754 } else // SSE immediates.
755 addLegalFPImmediate(APFloat(+0.0f)); // xorps
756 }
757 // Expand FP64 immediates into loads from the stack, save special cases.
758 if (isTypeLegal(MVT::f64)) {
759 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
760 addLegalFPImmediate(APFloat(+0.0)); // FLD0
761 addLegalFPImmediate(APFloat(+1.0)); // FLD1
762 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
763 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
764 } else // SSE immediates.
765 addLegalFPImmediate(APFloat(+0.0)); // xorpd
766 }
767 // Support fp16 0 immediate.
768 if (isTypeLegal(MVT::f16))
769 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
770
771 // Handle constrained floating-point operations of scalar.
772 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
773 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
774 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
775 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
776 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
777 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
778 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
779 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
780 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
781 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
782 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
783 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
784
785 // We don't support FMA.
786 setOperationAction(ISD::FMA, MVT::f64, Expand);
787 setOperationAction(ISD::FMA, MVT::f32, Expand);
788
789 // f80 always uses X87.
790 if (UseX87) {
791 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
792 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
793 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
794 {
795 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
796 addLegalFPImmediate(TmpFlt); // FLD0
797 TmpFlt.changeSign();
798 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
799
800 bool ignored;
801 APFloat TmpFlt2(+1.0);
802 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
803 &ignored);
804 addLegalFPImmediate(TmpFlt2); // FLD1
805 TmpFlt2.changeSign();
806 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
807 }
808
809 // Always expand sin/cos functions even though x87 has an instruction.
810 setOperationAction(ISD::FSIN , MVT::f80, Expand);
811 setOperationAction(ISD::FCOS , MVT::f80, Expand);
812 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
813
814 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
815 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
816 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
817 setOperationAction(ISD::FRINT, MVT::f80, Expand);
818 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
819 setOperationAction(ISD::FMA, MVT::f80, Expand);
820 setOperationAction(ISD::LROUND, MVT::f80, Expand);
821 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
822 setOperationAction(ISD::LRINT, MVT::f80, Custom);
823 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
824
825 // Handle constrained floating-point operations of scalar.
826 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
827 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
828 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
829 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
830 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
831 if (isTypeLegal(MVT::f16)) {
832 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
833 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
834 } else {
835 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
836 }
837 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
838 // as Custom.
839 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
840 }
841
842 // f128 uses xmm registers, but most operations require libcalls.
843 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
844 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
845 : &X86::VR128RegClass);
846
847 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
848
849 setOperationAction(ISD::FADD, MVT::f128, LibCall);
850 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
851 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
852 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
853 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
854 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
855 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
856 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
857 setOperationAction(ISD::FMA, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
859
860 setOperationAction(ISD::FABS, MVT::f128, Custom);
861 setOperationAction(ISD::FNEG, MVT::f128, Custom);
862 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
863
864 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
866 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
868 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
869 // No STRICT_FSINCOS
870 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
871 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
872
873 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
874 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
875 // We need to custom handle any FP_ROUND with an f128 input, but
876 // LegalizeDAG uses the result type to know when to run a custom handler.
877 // So we have to list all legal floating point result types here.
878 if (isTypeLegal(MVT::f32)) {
879 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
880 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
881 }
882 if (isTypeLegal(MVT::f64)) {
883 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
884 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
885 }
886 if (isTypeLegal(MVT::f80)) {
887 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
888 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
889 }
890
891 setOperationAction(ISD::SETCC, MVT::f128, Custom);
892
893 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
894 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
895 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
896 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
897 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
898 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
899 }
900
901 // Always use a library call for pow.
902 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
903 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
904 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
905 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
906
907 setOperationAction(ISD::FLOG, MVT::f80, Expand);
908 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
909 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
910 setOperationAction(ISD::FEXP, MVT::f80, Expand);
911 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
912 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
913 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
914
915 // Some FP actions are always expanded for vector types.
916 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
917 MVT::v4f32, MVT::v8f32, MVT::v16f32,
918 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
919 setOperationAction(ISD::FSIN, VT, Expand);
920 setOperationAction(ISD::FSINCOS, VT, Expand);
921 setOperationAction(ISD::FCOS, VT, Expand);
922 setOperationAction(ISD::FREM, VT, Expand);
923 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
924 setOperationAction(ISD::FPOW, VT, Expand);
925 setOperationAction(ISD::FLOG, VT, Expand);
926 setOperationAction(ISD::FLOG2, VT, Expand);
927 setOperationAction(ISD::FLOG10, VT, Expand);
928 setOperationAction(ISD::FEXP, VT, Expand);
929 setOperationAction(ISD::FEXP2, VT, Expand);
930 }
931
932 // First set operation action for all vector types to either promote
933 // (for widening) or expand (for scalarization). Then we will selectively
934 // turn on ones that can be effectively codegen'd.
935 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
936 setOperationAction(ISD::SDIV, VT, Expand);
937 setOperationAction(ISD::UDIV, VT, Expand);
938 setOperationAction(ISD::SREM, VT, Expand);
939 setOperationAction(ISD::UREM, VT, Expand);
940 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
941 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
942 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
943 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
944 setOperationAction(ISD::FMA, VT, Expand);
945 setOperationAction(ISD::FFLOOR, VT, Expand);
946 setOperationAction(ISD::FCEIL, VT, Expand);
947 setOperationAction(ISD::FTRUNC, VT, Expand);
948 setOperationAction(ISD::FRINT, VT, Expand);
949 setOperationAction(ISD::FNEARBYINT, VT, Expand);
950 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
951 setOperationAction(ISD::MULHS, VT, Expand);
952 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
953 setOperationAction(ISD::MULHU, VT, Expand);
954 setOperationAction(ISD::SDIVREM, VT, Expand);
955 setOperationAction(ISD::UDIVREM, VT, Expand);
956 setOperationAction(ISD::CTPOP, VT, Expand);
957 setOperationAction(ISD::CTTZ, VT, Expand);
958 setOperationAction(ISD::CTLZ, VT, Expand);
959 setOperationAction(ISD::ROTL, VT, Expand);
960 setOperationAction(ISD::ROTR, VT, Expand);
961 setOperationAction(ISD::BSWAP, VT, Expand);
962 setOperationAction(ISD::SETCC, VT, Expand);
963 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
964 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
965 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
966 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
967 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
968 setOperationAction(ISD::TRUNCATE, VT, Expand);
969 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
970 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
971 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
972 setOperationAction(ISD::SELECT_CC, VT, Expand);
973 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
974 setTruncStoreAction(InnerVT, VT, Expand);
975
976 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
977 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
978
979 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
980 // types, we have to deal with them whether we ask for Expansion or not.
981 // Setting Expand causes its own optimisation problems though, so leave
982 // them legal.
983 if (VT.getVectorElementType() == MVT::i1)
984 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
985
986 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
987 // split/scalarized right now.
988 if (VT.getVectorElementType() == MVT::f16 ||
989 VT.getVectorElementType() == MVT::bf16)
990 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
991 }
992 }
993
994 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
995 // with -msoft-float, disable use of MMX as well.
996 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
997 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
998 // No operations on x86mmx supported, everything uses intrinsics.
999 }
1000
1001 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1002 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1003 : &X86::VR128RegClass);
1004
1005 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1006 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1007 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1008 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1009 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1010 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1013
1014 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1015 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1016
1017 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1018 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1019 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1020 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1021 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1022 }
1023
1024 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1025 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027
1028 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1029 // registers cannot be used even for integer operations.
1030 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1031 : &X86::VR128RegClass);
1032 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1033 : &X86::VR128RegClass);
1034 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1035 : &X86::VR128RegClass);
1036 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1037 : &X86::VR128RegClass);
1038 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1039 : &X86::VR128RegClass);
1040
1041 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1042 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1043 setOperationAction(ISD::SDIV, VT, Custom);
1044 setOperationAction(ISD::SREM, VT, Custom);
1045 setOperationAction(ISD::UDIV, VT, Custom);
1046 setOperationAction(ISD::UREM, VT, Custom);
1047 }
1048
1049 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1050 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1052
1053 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1055 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1056 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1057 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1058 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1060 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1061 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1062 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1063 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1064 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1065
1066 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1067 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1068 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1069
1070 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1071 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1072 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1073
1074 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1075 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1076 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1077 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1078 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1079 }
1080
1081 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1082 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1083
1084 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1085 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1086 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1087 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1088 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1089 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1090 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1091 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1092 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1093 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1094
1095 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1096 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1097 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1098 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1099
1100 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1101 setOperationAction(ISD::SETCC, VT, Custom);
1102 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1103 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1104 setOperationAction(ISD::CTPOP, VT, Custom);
1105 setOperationAction(ISD::ABS, VT, Custom);
1106
1107 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1108 // setcc all the way to isel and prefer SETGT in some isel patterns.
1109 setCondCodeAction(ISD::SETLT, VT, Custom);
1110 setCondCodeAction(ISD::SETLE, VT, Custom);
1111 }
1112
1113 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1114 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1115 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1116 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1117 setOperationAction(ISD::VSELECT, VT, Custom);
1118 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1119 }
1120
1121 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1122 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1123 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1124 setOperationAction(ISD::VSELECT, VT, Custom);
1125
1126 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1127 continue;
1128
1129 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1130 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1131 }
1132 setF16Action(MVT::v8f16, Expand);
1133 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1134 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1135 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1136 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1137
1138 // Custom lower v2i64 and v2f64 selects.
1139 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1140 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1141 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1142 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1143 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1144 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1145
1146 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1147 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1148 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1149 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1150 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1151 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1152
1153 // Custom legalize these to avoid over promotion or custom promotion.
1154 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1155 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1156 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1157 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1158 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1159 }
1160
1161 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1162 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1165
1166 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1167 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1168
1169 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1170 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1171
1172 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1173 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1174 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1176 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1177
1178 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1179 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1180 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1181 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1182
1183 // We want to legalize this to an f64 load rather than an i64 load on
1184 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1185 // store.
1186 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1187 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1188 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1189 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1190 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1191 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1192
1193 // Add 32-bit vector stores to help vectorization opportunities.
1194 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1195 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1196
1197 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1199 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1200 if (!Subtarget.hasAVX512())
1201 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1202
1203 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1204 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1205 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1206
1207 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1208
1209 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1210 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1212 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1213 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1214 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1215
1216 // In the customized shift lowering, the legal v4i32/v2i64 cases
1217 // in AVX2 will be recognized.
1218 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SRL, VT, Custom);
1220 setOperationAction(ISD::SHL, VT, Custom);
1221 setOperationAction(ISD::SRA, VT, Custom);
1222 if (VT == MVT::v2i64) continue;
1223 setOperationAction(ISD::ROTL, VT, Custom);
1224 setOperationAction(ISD::ROTR, VT, Custom);
1225 setOperationAction(ISD::FSHL, VT, Custom);
1226 setOperationAction(ISD::FSHR, VT, Custom);
1227 }
1228
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1230 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1231 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1232 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1233 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1234 }
1235
1236 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1237 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1238 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1239 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1240 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1241 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1242 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1243 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1245
1246 // These might be better off as horizontal vector ops.
1247 setOperationAction(ISD::ADD, MVT::i16, Custom);
1248 setOperationAction(ISD::ADD, MVT::i32, Custom);
1249 setOperationAction(ISD::SUB, MVT::i16, Custom);
1250 setOperationAction(ISD::SUB, MVT::i32, Custom);
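    // (For example, an i32 add whose operands are two extracted elements of the
    // same vector can then be matched to a single PHADDD instead of extracting
    // both scalars first.)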
1251 }
1252
1253 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1254 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1255 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1257 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1258 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1259 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1260 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1261 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1262 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1263 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1264 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1265 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1266 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1267
1268 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1269 }
1270
1271 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1272 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1273 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1274 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1275 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1276 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1277 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1278 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1279
1280 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1281 setOperationAction(ISD::ABDS, VT, Custom);
1282 setOperationAction(ISD::ABDU, VT, Custom);
1283 }
1284
1285 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1286 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1287 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1288
1289 // FIXME: Do we need to handle scalar-to-vector here?
1290 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1292
1293 // We directly match byte blends in the backend as they match the VSELECT
1294 // condition form.
1295 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1296
1297 // SSE41 brings specific instructions for doing vector sign extend even in
1298 // cases where we don't have SRA.
1299 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1300 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1301 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1302 }
1303
1304 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1305 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1306 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1307 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1308 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1309 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1310 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1311 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1312 }
1313
1314 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1315      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1316 // do the pre and post work in the vector domain.
1317 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1318 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1319 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1320 // so that DAG combine doesn't try to turn it into uint_to_fp.
1321 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1322 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1323 }
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1327 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1328 }
1329
1330 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1331 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1332 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1333 setOperationAction(ISD::ROTL, VT, Custom);
1334 setOperationAction(ISD::ROTR, VT, Custom);
1335 }
1336
1337 // XOP can efficiently perform BITREVERSE with VPPERM.
1338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1339 setOperationAction(ISD::BITREVERSE, VT, Custom);
1340
1341 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1342 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1343 setOperationAction(ISD::BITREVERSE, VT, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1347 bool HasInt256 = Subtarget.hasInt256();
1348
1349 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1350 : &X86::VR256RegClass);
1351 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1352 : &X86::VR256RegClass);
1353 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1354 : &X86::VR256RegClass);
1355 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1356 : &X86::VR256RegClass);
1357 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1358 : &X86::VR256RegClass);
1359 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1360 : &X86::VR256RegClass);
1361 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1362 : &X86::VR256RegClass);
1363
1364 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1365 setOperationAction(ISD::FFLOOR, VT, Legal);
1366 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1367 setOperationAction(ISD::FCEIL, VT, Legal);
1368 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1369 setOperationAction(ISD::FTRUNC, VT, Legal);
1370 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1371 setOperationAction(ISD::FRINT, VT, Legal);
1372 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1373 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1374 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1375 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1376 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1377
1378 setOperationAction(ISD::FROUND, VT, Custom);
1379
1380 setOperationAction(ISD::FNEG, VT, Custom);
1381 setOperationAction(ISD::FABS, VT, Custom);
1382 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1383 }
1384
1385 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1386 // even though v8i16 is a legal type.
1387 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1388 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1389 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1390 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1391 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1392 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1394
1395 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1396 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1397 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1398 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1399 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1400 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1401
1402 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1403 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1404 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1405 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1406 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1407 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1408 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1409 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1410 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1411 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1412 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1413
1414 if (!Subtarget.hasAVX512())
1415 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1416
1417 // In the customized shift lowering, the legal v8i32/v4i64 cases
1418 // in AVX2 will be recognized.
1419 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1420 setOperationAction(ISD::SRL, VT, Custom);
1421 setOperationAction(ISD::SHL, VT, Custom);
1422 setOperationAction(ISD::SRA, VT, Custom);
1423 setOperationAction(ISD::ABDS, VT, Custom);
1424 setOperationAction(ISD::ABDU, VT, Custom);
1425 if (VT == MVT::v4i64) continue;
1426 setOperationAction(ISD::ROTL, VT, Custom);
1427 setOperationAction(ISD::ROTR, VT, Custom);
1428 setOperationAction(ISD::FSHL, VT, Custom);
1429 setOperationAction(ISD::FSHR, VT, Custom);
1430 }
1431
1432 // These types need custom splitting if their input is a 128-bit vector.
1433 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1434 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1435 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1437
1438 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1439 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1440 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1441 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1442 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1443 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1445
1446 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1447 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1449 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1450 }
1451
1452 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1453 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1454 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1455 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1456
1457 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1458 setOperationAction(ISD::SETCC, VT, Custom);
1459 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1460 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1461 setOperationAction(ISD::CTPOP, VT, Custom);
1462 setOperationAction(ISD::CTLZ, VT, Custom);
1463
1464      // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1465      // setcc all the way to isel and prefer SETGT in some isel patterns.
1466 setCondCodeAction(ISD::SETLT, VT, Custom);
1467 setCondCodeAction(ISD::SETLE, VT, Custom);
1468 }
1469
1470 if (Subtarget.hasAnyFMA()) {
1471 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1472 MVT::v2f64, MVT::v4f64 }) {
1473 setOperationAction(ISD::FMA, VT, Legal);
1474 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1475 }
1476 }
1477
1478 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1480 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1481 }
1482
1483 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1484 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1485 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1486 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1487
1488 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1489 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1490 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1493 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1494 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1496
1497 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1498 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1499
1500 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1501 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1502 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1503 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1504 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1505
1506 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1509 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1512 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1513 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1515 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1516 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1517 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1518
1519 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1520 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1521 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1522 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1525 }
1526
1527 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1529 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1530 }
1531
1532 if (HasInt256) {
1533 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1534 // when we have a 256bit-wide blend with immediate.
1535 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1536 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1537
1538 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1539 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1540 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1541 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1542 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1543 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1544 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1545 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1546 }
1547 }
1548
1549 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1550 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1551 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1552 setOperationAction(ISD::MSTORE, VT, Legal);
1553 }
1554
1555 // Extract subvector is special because the value type
1556 // (result) is 128-bit but the source is 256-bit wide.
1557 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1558 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1559 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1560 }
1561
1562 // Custom lower several nodes for 256-bit types.
1563 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1564 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1565 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1566 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1567 setOperationAction(ISD::VSELECT, VT, Custom);
1568 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1569 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1570 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1571 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1572 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1573 setOperationAction(ISD::STORE, VT, Custom);
1574 }
1575 setF16Action(MVT::v16f16, Expand);
1576 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1577 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1578 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1579 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1580
1581 if (HasInt256) {
1582 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1583
1584 // Custom legalize 2x32 to get a little better code.
1585 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1586 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1587
1588 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1589 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1590 setOperationAction(ISD::MGATHER, VT, Custom);
1591 }
1592 }
1593
1594 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1595 Subtarget.hasF16C()) {
1596 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1597 setOperationAction(ISD::FP_ROUND, VT, Custom);
1598 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1599 }
1600 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1601 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1602 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1603 }
1604 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1605 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1606 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1607 }
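    // With F16C but without AVX512-FP16 there is no native half arithmetic, so
    // these ops are promoted to the corresponding f32 vector type; the Custom
    // FP_EXTEND/FP_ROUND handling above supplies the conversions (roughly,
    // VCVTPH2PS before the operation and VCVTPS2PH after it).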
1608
1609 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1610 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1611 }
1612
1613 // This block controls legalization of the mask vector sizes that are
1614 // available with AVX512. 512-bit vectors are in a separate block controlled
1615 // by useAVX512Regs.
1616 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1617 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1618 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1619 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1620 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1621 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1622
1623 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1624 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1625 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1626
1627 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1628 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1629 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1630 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1631 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1632 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1633 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1634 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1635 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1636 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1637 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1638 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1639
1640 // There is no byte sized k-register load or store without AVX512DQ.
1641 if (!Subtarget.hasDQI()) {
1642 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1643 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1644 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1645 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1646
1647 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1648 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1649 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1650 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1651 }
1652
1653 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1654 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1655 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1656 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1657 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1658 }
1659
1660 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1661 setOperationAction(ISD::VSELECT, VT, Expand);
1662
1663 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1664 setOperationAction(ISD::SETCC, VT, Custom);
1665 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1666 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1667 setOperationAction(ISD::SELECT, VT, Custom);
1668 setOperationAction(ISD::TRUNCATE, VT, Custom);
1669
1670 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1671 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1672 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1673 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1674 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1675 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1676 }
1677
1678 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1679 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1680 }
1681
1682 // This block controls legalization for 512-bit operations with 32/64 bit
1683  // elements. 512-bit operations can be disabled based on prefer-vector-width and
1684 // required-vector-width function attributes.
1685 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1686 bool HasBWI = Subtarget.hasBWI();
1687
1688 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1689 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1690 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1691 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1692 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1693 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1694 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1695
1696 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1697 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1698 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1699 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1700 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1701 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1702 if (HasBWI)
1703 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1704 }
1705
1706 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1707 setOperationAction(ISD::FNEG, VT, Custom);
1708 setOperationAction(ISD::FABS, VT, Custom);
1709 setOperationAction(ISD::FMA, VT, Legal);
1710 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1711 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1712 }
1713
1714 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1715 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1716 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1717 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1718 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1719 }
1720
1721 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1722 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1723 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1724 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1725 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1726 }
1727
1728 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1729 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1730 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1731 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1732 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1733 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1734
1735 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1736 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1737 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1738 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1739 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1740 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1741 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1742 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1743 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1744 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1745 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1746
1747 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1748 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1749 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1750 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1751 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1752 if (HasBWI)
1753 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1754
1755 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1756 // to 512-bit rather than use the AVX2 instructions so that we can use
1757 // k-masks.
1758 if (!Subtarget.hasVLX()) {
1759 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1760 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1761 setOperationAction(ISD::MLOAD, VT, Custom);
1762 setOperationAction(ISD::MSTORE, VT, Custom);
1763 }
1764 }
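    // Roughly: a masked load/store of, say, v8f32 is widened to the v16f32 form
    // with the extra mask lanes cleared, so the AVX512 k-register masked
    // instructions can be used instead of the AVX/AVX2 VMASKMOV-style forms.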
1765
1766 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1767 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1768 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1769 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1770 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1771 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1772 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1773 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1774 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1775 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1776 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1777 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1778 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1779
1780 if (HasBWI) {
1781 // Extends from v64i1 masks to 512-bit vectors.
1782 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1783 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1784 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1785 }
1786
1787 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1788 setOperationAction(ISD::FFLOOR, VT, Legal);
1789 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1790 setOperationAction(ISD::FCEIL, VT, Legal);
1791 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1792 setOperationAction(ISD::FTRUNC, VT, Legal);
1793 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1794 setOperationAction(ISD::FRINT, VT, Legal);
1795 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1796 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1797 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1798 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1799 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1800
1801 setOperationAction(ISD::FROUND, VT, Custom);
1802 }
1803
1804 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1805 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1806 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1807 }
1808
1809 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1810 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1811 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1812 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1813
1814 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1815 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1816 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1817 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1818
1819 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1820 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1821 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1822 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1823 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1824 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1825 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1827
1828 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1829 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1830
1831 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1832
1833 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1834 setOperationAction(ISD::SRL, VT, Custom);
1835 setOperationAction(ISD::SHL, VT, Custom);
1836 setOperationAction(ISD::SRA, VT, Custom);
1837 setOperationAction(ISD::ROTL, VT, Custom);
1838 setOperationAction(ISD::ROTR, VT, Custom);
1839 setOperationAction(ISD::SETCC, VT, Custom);
1840 setOperationAction(ISD::ABDS, VT, Custom);
1841 setOperationAction(ISD::ABDU, VT, Custom);
1842
1843      // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1844      // setcc all the way to isel and prefer SETGT in some isel patterns.
1845 setCondCodeAction(ISD::SETLT, VT, Custom);
1846 setCondCodeAction(ISD::SETLE, VT, Custom);
1847 }
1848 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SMAX, VT, Legal);
1850 setOperationAction(ISD::UMAX, VT, Legal);
1851 setOperationAction(ISD::SMIN, VT, Legal);
1852 setOperationAction(ISD::UMIN, VT, Legal);
1853 setOperationAction(ISD::ABS, VT, Legal);
1854 setOperationAction(ISD::CTPOP, VT, Custom);
1855 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1856 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1857 }
1858
1859 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1860 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1861 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1862 setOperationAction(ISD::CTLZ, VT, Custom);
1863 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1864 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1865 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1866 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1867 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1868 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1869 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1870 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1871 }
1872
1873 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1874 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1875 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1876 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1877 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1878 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1879
1880 if (Subtarget.hasDQI()) {
1881 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1882 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1883 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1884 setOperationAction(Opc, MVT::v8i64, Custom);
1885 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1886 }
1887
1888 if (Subtarget.hasCDI()) {
1889 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1890 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1891 setOperationAction(ISD::CTLZ, VT, Legal);
1892 }
1893 } // Subtarget.hasCDI()
1894
1895 if (Subtarget.hasVPOPCNTDQ()) {
1896 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1897 setOperationAction(ISD::CTPOP, VT, Legal);
1898 }
1899
1900 // Extract subvector is special because the value type
1901 // (result) is 256-bit but the source is 512-bit wide.
1902 // 128-bit was made Legal under AVX1.
1903 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1904 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1905 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1906
1907 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1908 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1909 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1910 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1911 setOperationAction(ISD::SELECT, VT, Custom);
1912 setOperationAction(ISD::VSELECT, VT, Custom);
1913 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1915 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1916 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1917 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1918 }
1919 setF16Action(MVT::v32f16, Expand);
1920 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1921 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1922 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1923 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1924 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1925 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1926 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1927 }
1928
1929 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::MLOAD, VT, Legal);
1931 setOperationAction(ISD::MSTORE, VT, Legal);
1932 setOperationAction(ISD::MGATHER, VT, Custom);
1933 setOperationAction(ISD::MSCATTER, VT, Custom);
1934 }
1935 if (HasBWI) {
1936 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1937 setOperationAction(ISD::MLOAD, VT, Legal);
1938 setOperationAction(ISD::MSTORE, VT, Legal);
1939 }
1940 } else {
1941 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1942 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1943 }
1944
1945 if (Subtarget.hasVBMI2()) {
1946 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1947 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1948 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1949 setOperationAction(ISD::FSHL, VT, Custom);
1950 setOperationAction(ISD::FSHR, VT, Custom);
1951 }
1952
1953 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1954 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1955 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1956 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1957 }
1958 }// useAVX512Regs
1959
1960 // This block controls legalization for operations that don't have
1961 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1962 // narrower widths.
1963 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1964 // These operations are handled on non-VLX by artificially widening in
1965 // isel patterns.
1966
1967 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1968 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1969 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1970
1971 if (Subtarget.hasDQI()) {
1972 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1973 // v2f32 UINT_TO_FP is already custom under SSE2.
1974      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1975             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1976             "Unexpected operation action!");
1977 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1978 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1979 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1980 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1981 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1982 }
1983
1984 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1985 setOperationAction(ISD::SMAX, VT, Legal);
1986 setOperationAction(ISD::UMAX, VT, Legal);
1987 setOperationAction(ISD::SMIN, VT, Legal);
1988 setOperationAction(ISD::UMIN, VT, Legal);
1989 setOperationAction(ISD::ABS, VT, Legal);
1990 }
1991
1992 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1993 setOperationAction(ISD::ROTL, VT, Custom);
1994 setOperationAction(ISD::ROTR, VT, Custom);
1995 }
1996
1997 // Custom legalize 2x32 to get a little better code.
1998 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1999 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2000
2001 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2002 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2003 setOperationAction(ISD::MSCATTER, VT, Custom);
2004
2005 if (Subtarget.hasDQI()) {
2006 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2007 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2008 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2009 setOperationAction(Opc, MVT::v2i64, Custom);
2010 setOperationAction(Opc, MVT::v4i64, Custom);
2011 }
2012 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2013 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2014 }
2015
2016 if (Subtarget.hasCDI()) {
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2018 setOperationAction(ISD::CTLZ, VT, Legal);
2019 }
2020 } // Subtarget.hasCDI()
2021
2022 if (Subtarget.hasVPOPCNTDQ()) {
2023 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2024 setOperationAction(ISD::CTPOP, VT, Legal);
2025 }
2026 }
2027
2028  // This block controls legalization of v32i1/v64i1, which are available with
2029 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2030 // useBWIRegs.
2031 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2032 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2033 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2034
2035 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2036 setOperationAction(ISD::VSELECT, VT, Expand);
2037 setOperationAction(ISD::TRUNCATE, VT, Custom);
2038 setOperationAction(ISD::SETCC, VT, Custom);
2039 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2040 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2041 setOperationAction(ISD::SELECT, VT, Custom);
2042 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2043 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2044 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2045 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2046 }
2047
2048 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2049 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2050
2051 // Extends from v32i1 masks to 256-bit vectors.
2052 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2053 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2054 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2055
2056 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2057 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2058 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2059 }
2060
2061 // These operations are handled on non-VLX by artificially widening in
2062 // isel patterns.
2063 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2064
2065 if (Subtarget.hasBITALG()) {
2066 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2067 setOperationAction(ISD::CTPOP, VT, Legal);
2068 }
2069 }
2070
2071 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2072 auto setGroup = [&] (MVT VT) {
2073 setOperationAction(ISD::FADD, VT, Legal);
2074 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2075 setOperationAction(ISD::FSUB, VT, Legal);
2076 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2077 setOperationAction(ISD::FMUL, VT, Legal);
2078 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2079 setOperationAction(ISD::FDIV, VT, Legal);
2080 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2081 setOperationAction(ISD::FSQRT, VT, Legal);
2082 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2083
2084 setOperationAction(ISD::FFLOOR, VT, Legal);
2085 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2086 setOperationAction(ISD::FCEIL, VT, Legal);
2087 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2088 setOperationAction(ISD::FTRUNC, VT, Legal);
2089 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2090 setOperationAction(ISD::FRINT, VT, Legal);
2091 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2092 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2093 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2094
2095 setOperationAction(ISD::FROUND, VT, Custom);
2096
2097 setOperationAction(ISD::LOAD, VT, Legal);
2098 setOperationAction(ISD::STORE, VT, Legal);
2099
2100 setOperationAction(ISD::FMA, VT, Legal);
2101 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2102 setOperationAction(ISD::VSELECT, VT, Legal);
2103 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2104 setOperationAction(ISD::SELECT, VT, Custom);
2105
2106 setOperationAction(ISD::FNEG, VT, Custom);
2107 setOperationAction(ISD::FABS, VT, Custom);
2108 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2110 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2111 };
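    // setGroup is applied to MVT::f16 just below, and to v8f16/v16f16/v32f16
    // further down depending on hasVLX()/useAVX512Regs(), so all FP16 types
    // share this common set of Legal/Custom actions.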
2112
2113 // AVX512_FP16 scalar operations
2114 setGroup(MVT::f16);
2115 setOperationAction(ISD::FREM, MVT::f16, Promote);
2116 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2117 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2118 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2119 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2120 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2121 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2122 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2123 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2124 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2125 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2126 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2127 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2128 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2129
2130 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2131 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2132
2133 if (Subtarget.useAVX512Regs()) {
2134 setGroup(MVT::v32f16);
2135 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2136 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2137 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2138 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2139 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2142 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2143 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2144 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2146 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2147
2148 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2149 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2150 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2151 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2152 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2153 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2154 MVT::v32i16);
2155 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2156 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2157 MVT::v32i16);
2158 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2159 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2160 MVT::v32i16);
2161 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2162 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2163 MVT::v32i16);
2164
2165 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2166 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2167 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2168
2169 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2170 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2171
2172 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2173 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2174 }
2175
2176 if (Subtarget.hasVLX()) {
2177 setGroup(MVT::v8f16);
2178 setGroup(MVT::v16f16);
2179
2180 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2181 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2183 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2184 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2185 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2187 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2188 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2189 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2190
2191 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2192 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2193 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2194 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2195 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2196 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2197 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2198 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2199 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2200 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2201
2202 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2203 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2204 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2205
2206 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2207 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2209
2210 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2211 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2212 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2214
2215 // Need to custom widen these to prevent scalarization.
2216 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2217 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2218 }
2219 }
2220
2221 if (!Subtarget.useSoftFloat() &&
2222 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2223 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2224 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2225 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2226 // provide the method to promote BUILD_VECTOR. Set the operation action
2227    // to Custom so the customization can be done later.
2228 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2229 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2230 setF16Action(VT, Expand);
2231 setOperationAction(ISD::FADD, VT, Expand);
2232 setOperationAction(ISD::FSUB, VT, Expand);
2233 setOperationAction(ISD::FMUL, VT, Expand);
2234 setOperationAction(ISD::FDIV, VT, Expand);
2235 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2236 }
2237 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2238 }
2239
2240 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2241 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2242 setF16Action(MVT::v32bf16, Expand);
2243 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2244 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2245 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2246 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2247 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2251 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2252 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2253 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2254 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2255 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2256
2257 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2258 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2259 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2261 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2262
2263 if (Subtarget.hasBWI()) {
2264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2265 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2266 }
2267
2268 if (Subtarget.hasFP16()) {
2269 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2270 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2271 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2272 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2274 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2275 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2276 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2277 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2278 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2279 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2280 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2281 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2282 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2283 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2284 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2285 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2286 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2287 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2288 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2289 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2290 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2291 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2292 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2293 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2295 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2296 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2297 }
2298
2299 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2300 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2301 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2302 }
2303
2304 if (Subtarget.hasAMXTILE()) {
2305 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2306 }
2307
2308 // We want to custom lower some of our intrinsics.
2309 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2310 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2311 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2312 if (!Subtarget.is64Bit()) {
2313 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2314 }
2315
2316 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2317 // handle type legalization for these operations here.
2318 //
2319 // FIXME: We really should do custom legalization for addition and
2320 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2321 // than generic legalization for 64-bit multiplication-with-overflow, though.
2322 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2323 if (VT == MVT::i64 && !Subtarget.is64Bit())
2324 continue;
2325 // Add/Sub/Mul with overflow operations are custom lowered.
2326 setOperationAction(ISD::SADDO, VT, Custom);
2327 setOperationAction(ISD::UADDO, VT, Custom);
2328 setOperationAction(ISD::SSUBO, VT, Custom);
2329 setOperationAction(ISD::USUBO, VT, Custom);
2330 setOperationAction(ISD::SMULO, VT, Custom);
2331 setOperationAction(ISD::UMULO, VT, Custom);
2332
2333 // Support carry in as value rather than glue.
2334 setOperationAction(ISD::ADDCARRY, VT, Custom);
2335 setOperationAction(ISD::SUBCARRY, VT, Custom);
2336 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2337 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2338 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2339 }
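  // ADDCARRY/SUBCARRY model the carry as an ordinary boolean value operand and
  // result (ultimately selected to ADC/SBB) rather than threading it through a
  // glue edge, which is what "carry in as value rather than glue" refers to.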
2340
2341 if (!Subtarget.is64Bit()) {
2342 // These libcalls are not available in 32-bit.
2343 setLibcallName(RTLIB::SHL_I128, nullptr);
2344 setLibcallName(RTLIB::SRL_I128, nullptr);
2345 setLibcallName(RTLIB::SRA_I128, nullptr);
2346 setLibcallName(RTLIB::MUL_I128, nullptr);
2347 // The MULO libcall is not part of libgcc, only compiler-rt.
2348 setLibcallName(RTLIB::MULO_I64, nullptr);
2349 }
2350 // The MULO libcall is not part of libgcc, only compiler-rt.
2351 setLibcallName(RTLIB::MULO_I128, nullptr);
2352
2353 // Combine sin / cos into _sincos_stret if it is available.
2354 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2355 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2356 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2357 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2358 }
2359
2360 if (Subtarget.isTargetWin64()) {
2361 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2362 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2363 setOperationAction(ISD::SREM, MVT::i128, Custom);
2364 setOperationAction(ISD::UREM, MVT::i128, Custom);
2365 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2366 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2367 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2368 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2369 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2370 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2371 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2372 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2373 }
2374
2375 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2376 // is. We should promote the value to 64-bits to solve this.
2377 // This is what the CRT headers do - `fmodf` is an inline header
2378 // function casting to f64 and calling `fmod`.
2379 if (Subtarget.is32Bit() &&
2380 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2381 for (ISD::NodeType Op :
2382 {ISD::FCEIL, ISD::STRICT_FCEIL,
2383 ISD::FCOS, ISD::STRICT_FCOS,
2384 ISD::FEXP, ISD::STRICT_FEXP,
2385 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2386 ISD::FREM, ISD::STRICT_FREM,
2387 ISD::FLOG, ISD::STRICT_FLOG,
2388 ISD::FLOG10, ISD::STRICT_FLOG10,
2389 ISD::FPOW, ISD::STRICT_FPOW,
2390 ISD::FSIN, ISD::STRICT_FSIN})
2391 if (isOperationExpand(Op, MVT::f32))
2392 setOperationAction(Op, MVT::f32, Promote);
2393
2394 // We have target-specific dag combine patterns for the following nodes:
2395 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2396 ISD::SCALAR_TO_VECTOR,
2397 ISD::INSERT_VECTOR_ELT,
2398 ISD::EXTRACT_VECTOR_ELT,
2399 ISD::CONCAT_VECTORS,
2400 ISD::INSERT_SUBVECTOR,
2401 ISD::EXTRACT_SUBVECTOR,
2402 ISD::BITCAST,
2403 ISD::VSELECT,
2404 ISD::SELECT,
2405 ISD::SHL,
2406 ISD::SRA,
2407 ISD::SRL,
2408 ISD::OR,
2409 ISD::AND,
2410 ISD::ADD,
2411 ISD::FADD,
2412 ISD::FSUB,
2413 ISD::FNEG,
2414 ISD::FMA,
2415 ISD::STRICT_FMA,
2416 ISD::FMINNUM,
2417 ISD::FMAXNUM,
2418 ISD::SUB,
2419 ISD::LOAD,
2420 ISD::MLOAD,
2421 ISD::STORE,
2422 ISD::MSTORE,
2423 ISD::TRUNCATE,
2424 ISD::ZERO_EXTEND,
2425 ISD::ANY_EXTEND,
2426 ISD::SIGN_EXTEND,
2427 ISD::SIGN_EXTEND_INREG,
2428 ISD::ANY_EXTEND_VECTOR_INREG,
2429 ISD::SIGN_EXTEND_VECTOR_INREG,
2430 ISD::ZERO_EXTEND_VECTOR_INREG,
2431 ISD::SINT_TO_FP,
2432 ISD::UINT_TO_FP,
2433 ISD::STRICT_SINT_TO_FP,
2434 ISD::STRICT_UINT_TO_FP,
2435 ISD::SETCC,
2436 ISD::MUL,
2437 ISD::XOR,
2438 ISD::MSCATTER,
2439 ISD::MGATHER,
2440 ISD::FP16_TO_FP,
2441 ISD::FP_EXTEND,
2442 ISD::STRICT_FP_EXTEND,
2443 ISD::FP_ROUND,
2444 ISD::STRICT_FP_ROUND});
2445
2446 computeRegisterProperties(Subtarget.getRegisterInfo());
2447
2448 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2449 MaxStoresPerMemsetOptSize = 8;
2450 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2451 MaxStoresPerMemcpyOptSize = 4;
2452 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2453 MaxStoresPerMemmoveOptSize = 4;
2454
2455 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2456  // that needs to be benchmarked and balanced with the potential use of vector
2457 // load/store types (PR33329, PR33914).
2458 MaxLoadsPerMemcmp = 2;
2459 MaxLoadsPerMemcmpOptSize = 2;
2460
2461 // Default loop alignment, which can be overridden by -align-loops.
2462 setPrefLoopAlignment(Align(16));
2463
2464 // An out-of-order CPU can speculatively execute past a predictable branch,
2465 // but a conditional move could be stalled by an expensive earlier operation.
2466 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2467 EnableExtLdPromotion = true;
2468 setPrefFunctionAlignment(Align(16));
2469
2470 verifyIntrinsicTables();
2471
2472 // Default to having -disable-strictnode-mutation on
2473 IsStrictFPEnabled = true;
2474}
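
The MaxStoresPer* limits above cap how many stores the backend will emit when it expands @llvm.memset/@llvm.memcpy/@llvm.memmove inline instead of calling the library routine. A rough standalone sketch of that budgeting decision follows (not the actual SelectionDAG code; the helper name and the fixed 16-byte store width are assumptions for illustration):

#include <cstdint>
#include <cstdio>

// Inline the copy only if it fits in the configured store budget.
static bool shouldInlineMemcpy(uint64_t Size, unsigned StoreBytes,
                               unsigned MaxStores) {
  uint64_t NumStores = (Size + StoreBytes - 1) / StoreBytes;
  return NumStores <= MaxStores;
}

int main() {
  // With MaxStoresPerMemcpy == 8 and 16-byte vector stores, copies up to
  // 128 bytes are expanded inline; larger ones fall back to a libcall.
  printf("%d %d\n", shouldInlineMemcpy(128, 16, 8),   // 1
                    shouldInlineMemcpy(256, 16, 8));  // 0
}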
2475
2476// This has so far only been implemented for 64-bit MachO.
2477bool X86TargetLowering::useLoadStackGuardNode() const {
2478 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2479}
2480
2481bool X86TargetLowering::useStackGuardXorFP() const {
2482 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2483 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2484}
2485
2486SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2487 const SDLoc &DL) const {
2488 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2489 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2490 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2491 return SDValue(Node, 0);
2492}
2493
2494TargetLoweringBase::LegalizeTypeAction
2495X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2496 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2497 !Subtarget.hasBWI())
2498 return TypeSplitVector;
2499
2500 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2501 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2502 return TypeSplitVector;
2503
2504 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2505 VT.getVectorElementType() != MVT::i1)
2506 return TypeWidenVector;
2507
2508 return TargetLoweringBase::getPreferredVectorAction(VT);
2509}
2510
2511static std::pair<MVT, unsigned>
2512handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2513 const X86Subtarget &Subtarget) {
2514 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2515 // convention is one that uses k registers.
2516 if (NumElts == 2)
2517 return {MVT::v2i64, 1};
2518 if (NumElts == 4)
2519 return {MVT::v4i32, 1};
2520 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2521 CC != CallingConv::Intel_OCL_BI)
2522 return {MVT::v8i16, 1};
2523 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2524 CC != CallingConv::Intel_OCL_BI)
2525 return {MVT::v16i8, 1};
2526 // v32i1 passes in ymm unless we have BWI and the calling convention is
2527 // regcall.
2528 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2529 return {MVT::v32i8, 1};
2530 // Split v64i1 vectors if we don't have v64i8 available.
2531 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2532 if (Subtarget.useAVX512Regs())
2533 return {MVT::v64i8, 1};
2534 return {MVT::v32i8, 2};
2535 }
2536
2537 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2538 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2539 NumElts > 64)
2540 return {MVT::i8, NumElts};
2541
2542 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2543}
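
To make the NumElts -> (register type, count) mapping above easier to scan, here is a simplified standalone model that uses plain strings in place of MVTs. It only covers the non-RegCall path and is illustrative, not a drop-in replacement for the helper:

#include <cstdio>
#include <string>
#include <utility>

static std::pair<std::string, unsigned>
maskRegisterFor(unsigned NumElts, bool HasBWI, bool UseAVX512Regs) {
  if (NumElts == 2)  return {"v2i64", 1};
  if (NumElts == 4)  return {"v4i32", 1};
  if (NumElts == 8)  return {"v8i16", 1};
  if (NumElts == 16) return {"v16i8", 1};
  if (NumElts == 32) return {"v32i8", 1};
  if (NumElts == 64 && HasBWI)
    return UseAVX512Regs ? std::make_pair(std::string("v64i8"), 1u)
                         : std::make_pair(std::string("v32i8"), 2u);
  // Wide or odd mask vectors are scalarized to i8 elements.
  return {"i8", NumElts};
}

int main() {
  for (unsigned N : {2u, 8u, 32u, 64u, 3u}) {
    auto R = maskRegisterFor(N, /*HasBWI=*/true, /*UseAVX512Regs=*/false);
    printf("v%ui1 -> %u x %s\n", N, R.second, R.first.c_str());
  }
}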
2544
2545MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2546 CallingConv::ID CC,
2547 EVT VT) const {
2548 if (VT.isVector()) {
2549 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2550 unsigned NumElts = VT.getVectorNumElements();
2551
2552 MVT RegisterVT;
2553 unsigned NumRegisters;
2554 std::tie(RegisterVT, NumRegisters) =
2555 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2556 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2557 return RegisterVT;
2558 }
2559
2560 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2561 return MVT::v8f16;
2562 }
2563
2564 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2565 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2566 !Subtarget.hasX87())
2567 return MVT::i32;
2568
2569 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2570 return getRegisterTypeForCallingConv(Context, CC,
2571 VT.changeVectorElementTypeToInteger());
2572
2573 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2574}
2575
2576unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2577 CallingConv::ID CC,
2578 EVT VT) const {
2579 if (VT.isVector()) {
2580 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2581 unsigned NumElts = VT.getVectorNumElements();
2582
2583 MVT RegisterVT;
2584 unsigned NumRegisters;
2585 std::tie(RegisterVT, NumRegisters) =
2586 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2587 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2588 return NumRegisters;
2589 }
2590
2591 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2592 return 1;
2593 }
2594
2595 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2596 // x87 is disabled.
2597 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2598 if (VT == MVT::f64)
2599 return 2;
2600 if (VT == MVT::f80)
2601 return 3;
2602 }
2603
2604 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2605 return getNumRegistersForCallingConv(Context, CC,
2606 VT.changeVectorElementTypeToInteger());
2607
2608 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2609}
2610
2611unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2612 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2613 unsigned &NumIntermediates, MVT &RegisterVT) const {
2614 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2615 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2616 Subtarget.hasAVX512() &&
2617 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2618 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2619 VT.getVectorNumElements() > 64)) {
2620 RegisterVT = MVT::i8;
2621 IntermediateVT = MVT::i1;
2622 NumIntermediates = VT.getVectorNumElements();
2623 return NumIntermediates;
2624 }
2625
2626 // Split v64i1 vectors if we don't have v64i8 available.
2627 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2628 CC != CallingConv::X86_RegCall) {
2629 RegisterVT = MVT::v32i8;
2630 IntermediateVT = MVT::v32i1;
2631 NumIntermediates = 2;
2632 return 2;
2633 }
2634
2635 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2636 NumIntermediates, RegisterVT);
2637}
2638
2639EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2640 LLVMContext& Context,
2641 EVT VT) const {
2642 if (!VT.isVector())
2643 return MVT::i8;
2644
2645 if (Subtarget.hasAVX512()) {
2646 // Figure out what this type will be legalized to.
2647 EVT LegalVT = VT;
2648 while (getTypeAction(Context, LegalVT) != TypeLegal)
2649 LegalVT = getTypeToTransformTo(Context, LegalVT);
2650
2651 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2652 if (LegalVT.getSimpleVT().is512BitVector())
2653 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2654
2655 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2656 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2657 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2658 // vXi16/vXi8.
2659 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2660 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2661 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2662 }
2663 }
2664
2665 return VT.changeVectorElementTypeToInteger();
2666}
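
A simplified standalone model of the decision above, with booleans standing in for the subtarget and legalization queries; it is only meant to show when the compare result becomes a vXi1 mask versus an integer vector of the compared width:

#include <cstdio>

static const char *setCCResultElt(bool HasAVX512, bool Is512BitLegal,
                                  bool HasVLX, bool HasBWI, unsigned EltBits) {
  if (HasAVX512) {
    if (Is512BitLegal)
      return "vXi1 mask";                        // 512-bit vectors always compare into mask regs
    if (HasVLX && (HasBWI || EltBits >= 32))
      return "vXi1 mask";                        // narrower vectors need VLX (and BWI for i8/i16)
  }
  return "integer vector of the compared width"; // pre-AVX512 style result
}

int main() {
  printf("%s\n", setCCResultElt(true, false, true, false, 32)); // vXi1 mask
  printf("%s\n", setCCResultElt(true, false, true, false, 16)); // integer vector of the compared width
}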
2667
2668/// Helper for getByValTypeAlignment to determine
2669/// the desired ByVal argument alignment.
2670static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2671 if (MaxAlign == 16)
2672 return;
2673 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2674 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2675 MaxAlign = Align(16);
2676 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2677 Align EltAlign;
2678 getMaxByValAlign(ATy->getElementType(), EltAlign);
2679 if (EltAlign > MaxAlign)
2680 MaxAlign = EltAlign;
2681 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2682 for (auto *EltTy : STy->elements()) {
2683 Align EltAlign;
2684 getMaxByValAlign(EltTy, EltAlign);
2685 if (EltAlign > MaxAlign)
2686 MaxAlign = EltAlign;
2687 if (MaxAlign == 16)
2688 break;
2689 }
2690 }
2691}
2692
2693/// Return the desired alignment for ByVal aggregate
2694/// function arguments in the caller parameter area. For X86, aggregates
2695/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2696/// are at 4-byte boundaries.
2697uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2698 const DataLayout &DL) const {
2699 if (Subtarget.is64Bit()) {
2700 // Max of 8 and alignment of type.
2701 Align TyAlign = DL.getABITypeAlign(Ty);
2702 if (TyAlign > 8)
2703 return TyAlign.value();
2704 return 8;
2705 }
2706
2707 Align Alignment(4);
2708 if (Subtarget.hasSSE1())
2709 getMaxByValAlign(Ty, Alignment);
2710 return Alignment.value();
2711}
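
A small standalone model of the rule stated above: on 64-bit targets the byval alignment is max(8, the type's natural alignment); on 32-bit targets it is 16 bytes when SSE is available and the aggregate contains a 128-bit vector, otherwise 4. The boolean parameters replace the subtarget and type queries and are assumptions for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint64_t byValAlignment(bool Is64Bit, uint64_t NaturalAlign,
                               bool HasSSE1, bool ContainsSSEVector) {
  if (Is64Bit)
    return std::max<uint64_t>(8, NaturalAlign);
  return (HasSSE1 && ContainsSSEVector) ? 16 : 4;
}

int main() {
  printf("%llu %llu %llu\n",
         (unsigned long long)byValAlignment(true, 4, true, false),   // 8
         (unsigned long long)byValAlignment(false, 4, true, true),   // 16
         (unsigned long long)byValAlignment(false, 4, false, true)); // 4
}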
2712
2713/// It returns EVT::Other if the type should be determined using generic
2714/// target-independent logic.
2715/// For vector ops we check that the overall size isn't larger than our
2716/// preferred vector width.
2717EVT X86TargetLowering::getOptimalMemOpType(
2718 const MemOp &Op, const AttributeList &FuncAttributes) const {
2719 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2720 if (Op.size() >= 16 &&
2721 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2722 // FIXME: Check if unaligned 64-byte accesses are slow.
2723 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2724 (Subtarget.getPreferVectorWidth() >= 512)) {
2725 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2726 }
2727 // FIXME: Check if unaligned 32-byte accesses are slow.
2728 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2729 Subtarget.useLight256BitInstructions()) {
2730 // Although this isn't a well-supported type for AVX1, we'll let
2731 // legalization and shuffle lowering produce the optimal codegen. If we
2732 // choose an optimal type with a vector element larger than a byte,
2733 // getMemsetStores() may create an intermediate splat (using an integer
2734 // multiply) before we splat as a vector.
2735 return MVT::v32i8;
2736 }
2737 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2738 return MVT::v16i8;
2739 // TODO: Can SSE1 handle a byte vector?
2740 // If we have SSE1 registers we should be able to use them.
2741 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2742 (Subtarget.getPreferVectorWidth() >= 128))
2743 return MVT::v4f32;
2744 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2745 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2746 // Do not use f64 to lower memcpy if source is string constant. It's
2747 // better to use i32 to avoid the loads.
2748 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2749 // The gymnastics of splatting a byte value into an XMM register and then
2750 // only using 8-byte stores (because this is a CPU with slow unaligned
2751 // 16-byte accesses) makes that a loser.
2752 return MVT::f64;
2753 }
2754 }
2755 // This is a compromise. If we reach here, unaligned accesses may be slow on
2756 // this target. However, creating smaller, aligned accesses could be even
2757 // slower and would certainly be a lot more code.
2758 if (Subtarget.is64Bit() && Op.size() >= 8)
2759 return MVT::i64;
2760 return MVT::i32;
2761}
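
A simplified standalone decision table mirroring the preference order above: the widest vector the subtarget is happy with, otherwise i64 or i32. The feature booleans stand in for the subtarget queries, and the NoImplicitFloat, string-source and zero-memset special cases are deliberately left out:

#include <cstdint>
#include <cstdio>

static const char *optimalMemOpWidth(uint64_t Size, bool FastUnaligned16,
                                     bool HasAVX512, bool HasAVX,
                                     bool HasSSE2, bool Is64Bit) {
  if (Size >= 16 && FastUnaligned16) {
    if (Size >= 64 && HasAVX512) return "64-byte vector";
    if (Size >= 32 && HasAVX)    return "32-byte vector";
    if (HasSSE2)                 return "16-byte vector";
  }
  if (Is64Bit && Size >= 8) return "i64";
  return "i32";
}

int main() {
  printf("%s\n", optimalMemOpWidth(256, true, false, true, true, true));  // 32-byte vector
  printf("%s\n", optimalMemOpWidth(12, true, false, false, false, true)); // i64
}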
2762
2763bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2764 if (VT == MVT::f32)
2765 return Subtarget.hasSSE1();
2766 if (VT == MVT::f64)
2767 return Subtarget.hasSSE2();
2768 return true;
2769}
2770
2771static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2772 return (8 * Alignment.value()) % SizeInBits == 0;
2773}
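
A tiny standalone check of the arithmetic above, with a plain byte count in place of llvm::Align: an access of SizeInBits is considered bit-aligned when 8 * Alignment is a multiple of it.

#include <cstdint>
#include <cstdio>

static bool bitAligned(uint64_t AlignBytes, uint64_t SizeInBits) {
  return (8 * AlignBytes) % SizeInBits == 0;
}

int main() {
  printf("%d\n", bitAligned(16, 128)); // 1: a 128-bit access on a 16-byte boundary
  printf("%d\n", bitAligned(8, 128));  // 0: only 64-bit aligned
}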
2774
2775bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2776 if (isBitAligned(Alignment, VT.getSizeInBits()))
2777 return true;
2778 switch (VT.getSizeInBits()) {
2779 default:
2780 // 8-byte and under are always assumed to be fast.
2781 return true;
2782 case 128:
2783 return !Subtarget.isUnalignedMem16Slow();
2784 case 256:
2785 return !Subtarget.isUnalignedMem32Slow();
2786 // TODO: What about AVX-512 (512-bit) accesses?
2787 }
2788}
2789
2790bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2791 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793 if (Fast)
2794 *Fast = isMemoryAccessFast(VT, Alignment);
2795 // NonTemporal vector memory ops must be aligned.
2796 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2797    // NT loads can only be vector aligned, so if it's less aligned than the
2798 // minimum vector size (which we can split the vector down to), we might as
2799 // well use a regular unaligned vector load.
2800 // We don't have any NT loads pre-SSE41.
2801 if (!!(Flags & MachineMemOperand::MOLoad))
2802 return (Alignment < 16 || !Subtarget.hasSSE41());
2803 return false;
2804 }
2805 // Misaligned accesses of any size are always allowed.
2806 return true;
2807}
2808
2809bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2810 const DataLayout &DL, EVT VT,
2811 unsigned AddrSpace, Align Alignment,
2812 MachineMemOperand::Flags Flags,
2813 unsigned *Fast) const {
2814 if (Fast)
2815 *Fast = isMemoryAccessFast(VT, Alignment);
2816 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2817 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2818 /*Fast=*/nullptr))
2819 return true;
2820 // NonTemporal vector memory ops are special, and must be aligned.
2821 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2822 return false;
2823 switch (VT.getSizeInBits()) {
2824 case 128:
2825 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2826 return true;
2827 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2828 return true;
2829 return false;
2830 case 256:
2831 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2832 return true;
2833 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2834 return true;
2835 return false;
2836 case 512:
2837 if (Subtarget.hasAVX512())
2838 return true;
2839 return false;
2840 default:
2841 return false; // Don't have NonTemporal vector memory ops of this size.
2842 }
2843 }
2844 return true;
2845}
2846
2847/// Return the entry encoding for a jump table in the
2848/// current function. The returned value is a member of the
2849/// MachineJumpTableInfo::JTEntryKind enum.
2850unsigned X86TargetLowering::getJumpTableEncoding() const {
2851 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2852 // symbol.
2853 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2854 return MachineJumpTableInfo::EK_Custom32;
2855
2856 // Otherwise, use the normal jump table encoding heuristics.
2857 return TargetLowering::getJumpTableEncoding();
2858}
2859
2860bool X86TargetLowering::splitValueIntoRegisterParts(
2861 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2862 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2863 bool IsABIRegCopy = CC.has_value();
2864 EVT ValueVT = Val.getValueType();
2865 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2866 unsigned ValueBits = ValueVT.getSizeInBits();
2867 unsigned PartBits = PartVT.getSizeInBits();
2868 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2869 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2870 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2871 Parts[0] = Val;
2872 return true;
2873 }
2874 return false;
2875}
2876
2877SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2878 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2879 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2882 unsigned ValueBits = ValueVT.getSizeInBits();
2883 unsigned PartBits = PartVT.getSizeInBits();
2884 SDValue Val = Parts[0];
2885
2886 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2888 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2889 return Val;
2890 }
2891 return SDValue();
2892}
2893
2894bool X86TargetLowering::useSoftFloat() const {
2895 return Subtarget.useSoftFloat();
2896}
2897
2898void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2899 ArgListTy &Args) const {
2900
2901 // Only relabel X86-32 for C / Stdcall CCs.
2902 if (Subtarget.is64Bit())
2903 return;
2904 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2905 return;
2906 unsigned ParamRegs = 0;
2907 if (auto *M = MF->getFunction().getParent())
2908 ParamRegs = M->getNumberRegisterParameters();
2909
2910  // Mark the first N integer arguments as being passed in registers.
2911 for (auto &Arg : Args) {
2912 Type *T = Arg.Ty;
2913 if (T->isIntOrPtrTy())
2914 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2915 unsigned numRegs = 1;
2916 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2917 numRegs = 2;
2918 if (ParamRegs < numRegs)
2919 return;
2920 ParamRegs -= numRegs;
2921 Arg.IsInReg = true;
2922 }
2923 }
2924}
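
The loop above hands out the module's regparm budget: each integer or pointer argument of 4 bytes or less costs one register, 5-8 bytes cost two, and marking stops once the budget is exhausted. A standalone model of that bookkeeping, with argument sizes passed directly instead of querying a DataLayout (names are illustrative):

#include <cstdio>
#include <vector>

static std::vector<bool> markInRegArgs(unsigned ParamRegs,
                                       const std::vector<unsigned> &ArgSizes) {
  std::vector<bool> InReg(ArgSizes.size(), false);
  for (size_t I = 0; I < ArgSizes.size(); ++I) {
    if (ArgSizes[I] > 8)
      continue;                                  // not int/pointer sized in this model
    unsigned NumRegs = ArgSizes[I] > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      break;                                     // budget exhausted; the rest stay on the stack
    ParamRegs -= NumRegs;
    InReg[I] = true;
  }
  return InReg;
}

int main() {
  // regparm(3) with 4-, 8- and 4-byte arguments: the first two fit, the third does not.
  for (bool B : markInRegArgs(3, {4, 8, 4}))
    printf("%d ", (int)B);                       // prints: 1 1 0
  printf("\n");
}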
2925
2926const MCExpr *
2927X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2928 const MachineBasicBlock *MBB,
2929 unsigned uid,MCContext &Ctx) const{
2930  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2931 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2932 // entries.
2933 return MCSymbolRefExpr::create(MBB->getSymbol(),
2934 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2935}
2936
2937/// Returns relocation base for the given PIC jumptable.
2938SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2939 SelectionDAG &DAG) const {
2940 if (!Subtarget.is64Bit())
2941 // This doesn't have SDLoc associated with it, but is not really the
2942 // same as a Register.
2943 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2944 getPointerTy(DAG.getDataLayout()));
2945 return Table;
2946}
2947
2948/// This returns the relocation base for the given PIC jumptable,
2949/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2950const MCExpr *X86TargetLowering::
2951getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2952 MCContext &Ctx) const {
2953 // X86-64 uses RIP relative addressing based on the jump table label.
2954 if (Subtarget.isPICStyleRIPRel())
2955 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2956
2957 // Otherwise, the reference is relative to the PIC base.
2958 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2959}
2960
2961std::pair<const TargetRegisterClass *, uint8_t>
2962X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2963 MVT VT) const {
2964 const TargetRegisterClass *RRC = nullptr;
2965 uint8_t Cost = 1;
2966 switch (VT.SimpleTy) {
2967 default:
2968 return TargetLowering::findRepresentativeClass(TRI, VT);
2969 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2970 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2971 break;
2972 case MVT::x86mmx:
2973 RRC = &X86::VR64RegClass;
2974 break;
2975 case MVT::f32: case MVT::f64:
2976 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2977 case MVT::v4f32: case MVT::v2f64:
2978 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2979 case MVT::v8f32: case MVT::v4f64:
2980 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2981 case MVT::v16f32: case MVT::v8f64:
2982 RRC = &X86::VR128XRegClass;
2983 break;
2984 }
2985 return std::make_pair(RRC, Cost);
2986}
2987
2988unsigned X86TargetLowering::getAddressSpace() const {
2989 if (Subtarget.is64Bit())
2990 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2991 return 256;
2992}
2993
2994static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2995 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2996 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2997}
2998
2999static Constant* SegmentOffset(IRBuilderBase &IRB,
3000 int Offset, unsigned AddressSpace) {
3001 return ConstantExpr::getIntToPtr(
3002 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3003 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3004}
3005
3006Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3007 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3008 // tcbhead_t; use it instead of the usual global variable (see
3009 // sysdeps/{i386,x86_64}/nptl/tls.h)
3010 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3011 if (Subtarget.isTargetFuchsia()) {
3012 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3013 return SegmentOffset(IRB, 0x10, getAddressSpace());
3014 } else {
3015 unsigned AddressSpace = getAddressSpace();
3016 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3017      // In particular, users may customize the base register and offset.
3018 int Offset = M->getStackProtectorGuardOffset();
3019 // If we don't set -stack-protector-guard-offset value:
3020 // %fs:0x28, unless we're using a Kernel code model, in which case
3021      // it's %gs:0x28. It's %gs:0x14 on i386.
3022      if (Offset == INT_MAX)
3023 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3024
3025 StringRef GuardReg = M->getStackProtectorGuardReg();
3026 if (GuardReg == "fs")
3027 AddressSpace = X86AS::FS;
3028 else if (GuardReg == "gs")
3029 AddressSpace = X86AS::GS;
3030
3031      // Use the guard symbol if the user specified one.
3032 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3033 if (!GuardSymb.empty()) {
3034 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3035 if (!GV) {
3036 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3037 : Type::getInt32Ty(M->getContext());
3038 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3039 nullptr, GuardSymb, nullptr,
3040 GlobalValue::NotThreadLocal, AddressSpace);
3041 }
3042 return GV;
3043 }
3044
3045 return SegmentOffset(IRB, Offset, AddressSpace);
3046 }
3047 }
3048 return TargetLowering::getIRStackGuard(IRB);
3049}
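
A standalone sketch of the TLS stack-guard slot selection spelled out in the comments above. INT_MAX plays the same "not overridden" sentinel role as in the real code; the 256/257 address-space constants are the conventional %gs/%fs pair, and the booleans replace the subtarget and code-model queries:

#include <climits>
#include <cstdio>

struct GuardLocation {
  unsigned AddressSpace; // 256 == %gs, 257 == %fs
  int Offset;
};

static GuardLocation pickGuardSlot(bool Is64Bit, bool KernelCodeModel,
                                   int ExplicitOffset = INT_MAX) {
  GuardLocation Loc;
  // %fs for 64-bit user code, %gs for the kernel code model and for i386.
  Loc.AddressSpace = (Is64Bit && !KernelCodeModel) ? 257 : 256;
  Loc.Offset = ExplicitOffset != INT_MAX ? ExplicitOffset
                                         : (Is64Bit ? 0x28 : 0x14);
  return Loc;
}

int main() {
  GuardLocation L = pickGuardSlot(/*Is64Bit=*/true, /*KernelCodeModel=*/false);
  printf("as=%u offset=0x%x\n", L.AddressSpace, L.Offset); // as=257 offset=0x28
}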
3050
3051void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3052 // MSVC CRT provides functionalities for stack protection.
3053 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3054 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3055 // MSVC CRT has a global variable holding security cookie.
3056 M.getOrInsertGlobal("__security_cookie",
3057 Type::getInt8PtrTy(M.getContext()));
3058
3059 // MSVC CRT has a function to validate security cookie.
3060 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3061 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3062 Type::getInt8PtrTy(M.getContext()));
3063 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3064 F->setCallingConv(CallingConv::X86_FastCall);
3065 F->addParamAttr(0, Attribute::AttrKind::InReg);
3066 }
3067 return;
3068 }
3069
3070 StringRef GuardMode = M.getStackProtectorGuard();
3071
3072 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3073 if ((GuardMode == "tls" || GuardMode.empty()) &&
3074 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3075 return;
3076 TargetLowering::insertSSPDeclarations(M);
3077}
3078
3079Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3080 // MSVC CRT has a global variable holding security cookie.
3081 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3082 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3083 return M.getGlobalVariable("__security_cookie");
3084 }
3085 return TargetLowering::getSDagStackGuard(M);
3086}
3087
3088Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3089 // MSVC CRT has a function to validate security cookie.
3090 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3091 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3092 return M.getFunction("__security_check_cookie");
3093 }
3094 return TargetLowering::getSSPStackGuardCheck(M);
3095}
3096
3097Value *
3098X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3099 if (Subtarget.getTargetTriple().isOSContiki())
3100 return getDefaultSafeStackPointerLocation(IRB, false);
3101
3102 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3103 // definition of TLS_SLOT_SAFESTACK in
3104 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3105 if (Subtarget.isTargetAndroid()) {
3106    // %fs:0x48, unless we're using a Kernel code model, in which case it's
3107    // %gs:0x48. It's %gs:0x24 on i386.
3108 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3109 return SegmentOffset(IRB, Offset, getAddressSpace());
3110 }
3111
3112 // Fuchsia is similar.
3113 if (Subtarget.isTargetFuchsia()) {
3114 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3115 return SegmentOffset(IRB, 0x18, getAddressSpace());
3116 }
3117
3118 return TargetLowering::getSafeStackPointerLocation(IRB);
3119}
3120
3121//===----------------------------------------------------------------------===//
3122// Return Value Calling Convention Implementation
3123//===----------------------------------------------------------------------===//
3124
3125bool X86TargetLowering::CanLowerReturn(
3126 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3127 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3128 SmallVector<CCValAssign, 16> RVLocs;
3129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3130 return CCInfo.CheckReturn(Outs, RetCC_X86);
3131}
3132
3133const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3134 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3135 return ScratchRegs;
3136}
3137
3138ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3139 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3140 // tests at the moment, which is not what we expected.
3141 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3142 return RCRegs;
3143}
3144
3145/// Lowers mask values (v*i1) to the local register values.
3146/// \returns the DAG node after lowering to the register type.
3147static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3148 const SDLoc &Dl, SelectionDAG &DAG) {
3149 EVT ValVT = ValArg.getValueType();
3150
3151 if (ValVT == MVT::v1i1)
3152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3153 DAG.getIntPtrConstant(0, Dl));
3154
3155 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3156 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3157 // Two stage lowering might be required
3158 // bitcast: v8i1 -> i8 / v16i1 -> i16
3159 // anyextend: i8 -> i32 / i16 -> i32
3160 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3161 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3162 if (ValLoc == MVT::i32)
3163 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3164 return ValToCopy;
3165 }
3166
3167 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3168 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3169 // One stage lowering is required
3170 // bitcast: v32i1 -> i32 / v64i1 -> i64
3171 return DAG.getBitcast(ValLoc, ValArg);
3172 }
3173
3174 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3175}
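
A standalone illustration of the two-stage lowering in the helper above, using ordinary integers: a v8i1 mask is first "bitcast" by packing its bits into a uint8_t, then widened to the 32-bit location type (the real code uses ANY_EXTEND; a plain integral widening stands in for it here). The packing order is illustrative only:

#include <array>
#include <cstdint>
#include <cstdio>

static uint32_t lowerMaskToI32(const std::array<bool, 8> &Mask) {
  uint8_t Packed = 0;                                  // stage 1: v8i1 -> i8
  for (unsigned I = 0; I < 8; ++I)
    Packed |= static_cast<uint8_t>(Mask[I]) << I;
  return Packed;                                       // stage 2: i8 -> i32
}

int main() {
  std::array<bool, 8> M = {true, false, true, true, false, false, false, true};
  printf("0x%02x\n", lowerMaskToI32(M));               // prints 0x8d
}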
3176
3177/// Breaks v64i1 value into two registers and adds the new node to the DAG
3178static void Passv64i1ArgInRegs(
3179 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3180 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3181 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3182  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3183  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3184  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3185  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3186         "The value should reside in two registers");
3187
3188 // Before splitting the value we cast it to i64
3189 Arg = DAG.getBitcast(MVT::i64, Arg);
3190
3191 // Splitting the value into two i32 types
3192 SDValue Lo, Hi;
3193 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3194
3195  // Attach the two i32 halves to the corresponding registers.
3196 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3197 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3198}
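
A standalone model of the split performed above: the 64-bit mask value is viewed as an i64 and broken into low and high 32-bit halves, which end up in the two GPRs chosen by the two CCValAssign slots:

#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<uint32_t, uint32_t> splitScalar64(uint64_t Mask) {
  uint32_t Lo = static_cast<uint32_t>(Mask);           // goes in VA's register
  uint32_t Hi = static_cast<uint32_t>(Mask >> 32);     // goes in NextVA's register
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = splitScalar64(0x0123456789abcdefULL);
  printf("lo=0x%08x hi=0x%08x\n", Lo, Hi);             // lo=0x89abcdef hi=0x01234567
}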
3199
3200SDValue
3201X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3202 bool isVarArg,
3203 const SmallVectorImpl<ISD::OutputArg> &Outs,
3204 const SmallVectorImpl<SDValue> &OutVals,
3205 const SDLoc &dl, SelectionDAG &DAG) const {
3206 MachineFunction &MF = DAG.getMachineFunction();
3207 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3208
3209 // In some cases we need to disable registers from the default CSR list.
3210 // For example, when they are used as return registers (preserve_* and X86's
3211 // regcall) or for argument passing (X86's regcall).
3212 bool ShouldDisableCalleeSavedRegister =
3213 shouldDisableRetRegFromCSR(CallConv) ||
3214 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3215
3216 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3217 report_fatal_error("X86 interrupts may not return any value");
3218
3219 SmallVector<CCValAssign, 16> RVLocs;
3220 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3221 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3222
3223 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3224 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3225 ++I, ++OutsIndex) {
3226 CCValAssign &VA = RVLocs[I];
3227    assert(VA.isRegLoc() && "Can only return in registers!");
3228
3229 // Add the register to the CalleeSaveDisableRegs list.
3230 if (ShouldDisableCalleeSavedRegister)
3231 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3232
3233 SDValue ValToCopy = OutVals[OutsIndex];
3234 EVT ValVT = ValToCopy.getValueType();
3235
3236 // Promote values to the appropriate types.
3237 if (VA.getLocInfo() == CCValAssign::SExt)
3238 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3239 else if (VA.getLocInfo() == CCValAssign::ZExt)
3240 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3241 else if (VA.getLocInfo() == CCValAssign::AExt) {
3242 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3243 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3244 else
3245 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3246 }
3247 else if (VA.getLocInfo() == CCValAssign::BCvt)
3248 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3249
3250    assert(VA.getLocInfo() != CCValAssign::FPExt &&
3251           "Unexpected FP-extend for return value.");
3252
3253 // Report an error if we have attempted to return a value via an XMM
3254 // register and SSE was disabled.
3255 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3256 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3257 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3258 } else if (!Subtarget.hasSSE2() &&
3259 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3260 ValVT == MVT::f64) {
3261 // When returning a double via an XMM register, report an error if SSE2 is
3262 // not enabled.
3263 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3264 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3265 }
3266
3267 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3268 // the RET instruction and handled by the FP Stackifier.
3269 if (VA.getLocReg() == X86::FP0 ||
3270 VA.getLocReg() == X86::FP1) {
3271 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3272 // change the value to the FP stack register class.
3273 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3274 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3275 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3276 // Don't emit a copytoreg.
3277 continue;
3278 }
3279
3280 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3281 // which is returned in RAX / RDX.
3282 if (Subtarget.is64Bit()) {
3283 if (ValVT == MVT::x86mmx) {
3284 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3285 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3286 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3287 ValToCopy);
3288 // If we don't have SSE2 available, convert to v4f32 so the generated
3289 // register is legal.
3290 if (!Subtarget.hasSSE2())
3291 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3292 }
3293 }
3294 }
3295
3296 if (VA.needsCustom()) {
3297      assert(VA.getValVT() == MVT::v64i1 &&
3298             "Currently the only custom case is when we split v64i1 to 2 regs");
3299
3300 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3301 Subtarget);
3302
3303 // Add the second register to the CalleeSaveDisableRegs list.
3304 if (ShouldDisableCalleeSavedRegister)
3305 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3306 } else {
3307 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3308 }
3309 }
3310
3311 SDValue Glue;
3312 SmallVector<SDValue, 6> RetOps;
3313 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3314 // Operand #1 = Bytes To Pop
3315 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3316 MVT::i32));
3317
3318 // Copy the result values into the output registers.
3319 for (auto &RetVal : RetVals) {
3320 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3321 RetOps.push_back(RetVal.second);
3322 continue; // Don't emit a copytoreg.
3323 }
3324
3325 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3326 Glue = Chain.getValue(1);
3327 RetOps.push_back(
3328 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3329 }
3330
3331 // Swift calling convention does not require we copy the sret argument
3332 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3333
3334 // All x86 ABIs require that for returning structs by value we copy
3335 // the sret argument into %rax/%eax (depending on ABI) for the return.
3336 // We saved the argument into a virtual register in the entry block,
3337 // so now we copy the value out and into %rax/%eax.
3338 //
3339 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3340 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3341 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3342 // either case FuncInfo->setSRetReturnReg() will have been called.
3343 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3344 // When we have both sret and another return value, we should use the
3345 // original Chain stored in RetOps[0], instead of the current Chain updated
3346 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3347
3348 // For the case of sret and another return value, we have
3349 // Chain_0 at the function entry
3350 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3351 // If we use Chain_1 in getCopyFromReg, we will have
3352 // Val = getCopyFromReg(Chain_1)
3353 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3354
3355 // getCopyToReg(Chain_0) will be glued together with
3356 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3357 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3358 // Data dependency from Unit B to Unit A due to usage of Val in
3359 // getCopyToReg(Chain_1, Val)
3360 // Chain dependency from Unit A to Unit B
3361
3362 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3363 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3364 getPointerTy(MF.getDataLayout()));
3365
3366 Register RetValReg
3367 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3368 X86::RAX : X86::EAX;
3369 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3370 Glue = Chain.getValue(1);
3371
3372 // RAX/EAX now acts like a return value.
3373 RetOps.push_back(
3374 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3375
3376 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3377 // this however for preserve_most/preserve_all to minimize the number of
3378 // callee-saved registers for these CCs.
3379 if (ShouldDisableCalleeSavedRegister &&
3380 CallConv != CallingConv::PreserveAll &&
3381 CallConv != CallingConv::PreserveMost)
3382 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3383 }
3384
3385 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3386 const MCPhysReg *I =
3387 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3388 if (I) {
3389 for (; *I; ++I) {
3390 if (X86::GR64RegClass.contains(*I))
3391 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3392 else
3393        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3394 }
3395 }
3396
3397 RetOps[0] = Chain; // Update chain.
3398
3399 // Add the glue if we have it.
3400 if (Glue.getNode())
3401 RetOps.push_back(Glue);
3402
3403 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3404 if (CallConv == CallingConv::X86_INTR)
3405 opcode = X86ISD::IRET;
3406 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3407}
3408
3409bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3410 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3411 return false;
3412
3413 SDValue TCChain = Chain;
3414 SDNode *Copy = *N->use_begin();
3415 if (Copy->getOpcode() == ISD::CopyToReg) {
3416 // If the copy has a glue operand, we conservatively assume it isn't safe to
3417 // perform a tail call.
3418 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3419 return false;
3420 TCChain = Copy->getOperand(0);
3421 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3422 return false;
3423
3424 bool HasRet = false;
3425 for (const SDNode *U : Copy->uses()) {
3426 if (U->getOpcode() != X86ISD::RET_GLUE)
3427 return false;
3428 // If we are returning more than one value, we can definitely
3430    // not make a tail call; see PR19530.
3430 if (U->getNumOperands() > 4)
3431 return false;
3432 if (U->getNumOperands() == 4 &&
3433 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3434 return false;
3435 HasRet = true;
3436 }
3437
3438 if (!HasRet)
3439 return false;
3440
3441 Chain = TCChain;
3442 return true;
3443}
3444
3445EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3446 ISD::NodeType ExtendKind) const {
3447 MVT ReturnMVT = MVT::i32;
3448
3449 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3450 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3451 // The ABI does not require i1, i8 or i16 to be extended.
3452 //
3453 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3454 // always extending i8/i16 return values, so keep doing that for now.
3455 // (PR26665).
3456 ReturnMVT = MVT::i8;
3457 }
3458
3459 EVT MinVT = getRegisterType(Context, ReturnMVT);
3460 return VT.bitsLT(MinVT) ? MinVT : VT;
3461}
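
A standalone sketch of the rule above, with bit widths in place of EVTs: i1 always gets the 8-bit minimum, i8/i16 get it everywhere except Darwin (where the historical behaviour keeps the 32-bit minimum), and anything already at least as wide as the minimum is returned unchanged:

#include <algorithm>
#include <cstdio>

static unsigned extReturnBits(unsigned ValueBits, bool IsDarwin) {
  unsigned MinBits = 32;
  if (ValueBits == 1 || (!IsDarwin && (ValueBits == 8 || ValueBits == 16)))
    MinBits = 8;
  return std::max(ValueBits, MinBits);
}

int main() {
  printf("%u %u %u\n", extReturnBits(8, false),        // 8
                       extReturnBits(8, true),         // 32
                       extReturnBits(1, true));        // 8
}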
3462
3463/// Reads two 32-bit registers and creates a 64-bit mask value.
3464/// \param VA The current 32-bit value that needs to be assigned.
3465/// \param NextVA The next 32-bit value that needs to be assigned.
3466/// \param Root The parent DAG node.
3467/// \param [in,out] InGlue Represents the SDValue in the parent DAG node used
3468///                        for glue purposes. If the DAG is already using a
3469///                        physical register instead of a virtual one, we
3470///                        should glue our new SDValue to the InGlue SDValue.
3471/// \return a new 64-bit SDValue.
3472static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3473 SDValue &Root, SelectionDAG &DAG,
3474 const SDLoc &Dl, const X86Subtarget &Subtarget,
3475 SDValue *InGlue = nullptr) {
3476  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3477  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3478  assert(VA.getValVT() == MVT::v64i1 &&
3479         "Expecting first location of 64 bit width type");
3480  assert(NextVA.getValVT() == VA.getValVT() &&
3481         "The locations should have the same type");
3482  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3483         "The values should reside in two registers");
3484
3485 SDValue Lo, Hi;
3486 SDValue ArgValueLo, ArgValueHi;
3487
3488 MachineFunction &MF = DAG.getMachineFunction();
3489 const TargetRegisterClass *RC = &X86::GR32RegClass;
3490
3491 // Read a 32 bit value from the registers.
3492 if (nullptr == InGlue) {
3493 // When no physical register is present,
3494 // create an intermediate virtual register.
3495 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3496 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3497 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3498 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3499 } else {
3500 // When a physical register is available read the value from it and glue
3501 // the reads together.
3502 ArgValueLo =
3503 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3504 *InGlue = ArgValueLo.getValue(2);
3505 ArgValueHi =
3506 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3507 *InGlue = ArgValueHi.getValue(2);
3508 }
3509
3510 // Convert the i32 type into v32i1 type.
3511 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3512
3513 // Convert the i32 type into v32i1 type.
3514 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3515
3516 // Concatenate the two values together.
3517 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3518}
3519
3520/// Lowers a register of various sizes (8/16/32/64 bits)
3521/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3522/// \returns a DAG node containing the operand after lowering to the mask type.
3523static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3524 const EVT &ValLoc, const SDLoc &Dl,
3525 SelectionDAG &DAG) {
3526 SDValue ValReturned = ValArg;
3527
3528 if (ValVT == MVT::v1i1)
3529 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3530
3531 if (ValVT == MVT::v64i1) {
3532    // On a 32-bit machine, this case is handled by getv64i1Argument.
3533    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3534    // On a 64-bit machine, there is no need to truncate the value, only bitcast it.
3535 } else {
3536 MVT maskLen;
3537 switch (ValVT.getSimpleVT().SimpleTy) {
3538 case MVT::v8i1:
3539 maskLen = MVT::i8;
3540 break;
3541 case MVT::v16i1:
3542 maskLen = MVT::i16;
3543 break;
3544 case MVT::v32i1:
3545 maskLen = MVT::i32;
3546 break;
3547 default:
3548      llvm_unreachable("Expecting a vector of i1 types");
3549 }
3550
3551 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3552 }
3553 return DAG.getBitcast(ValVT, ValReturned);
3554}
3555
3556/// Lower the result values of a call into the
3557/// appropriate copies out of appropriate physical registers.
3558///
3559SDValue X86TargetLowering::LowerCallResult(
3560 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3561 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3562 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3563 uint32_t *RegMask) const {
3564
3565 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3566 // Assign locations to each value returned by this call.
3567 SmallVector<CCValAssign, 16> RVLocs;
3568 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3569 *DAG.getContext());
3570 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3571
3572 // Copy all of the result registers out of their specified physreg.
3573 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3574 ++I, ++InsIndex) {
3575 CCValAssign &VA = RVLocs[I];
3576 EVT CopyVT = VA.getLocVT();
3577
3578 // In some calling conventions we need to remove the used registers
3579 // from the register mask.
3580 if (RegMask) {
3581 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3582 SubRegs.isValid(); ++SubRegs)
3583 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3584 }
3585
3586 // Report an error if there was an attempt to return FP values via XMM
3587 // registers.
3588 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3589 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3590 if (VA.getLocReg() == X86::XMM1)
3591 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3592 else
3593 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3594 } else if (!Subtarget.hasSSE2() &&
3595 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3596 CopyVT == MVT::f64) {
3597 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3598 if (VA.getLocReg() == X86::XMM1)
3599 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3600 else
3601 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3602 }
3603
3604 // If we prefer to use the value in xmm registers, copy it out as f80 and
3605 // use a truncate to move it from fp stack reg to xmm reg.
3606 bool RoundAfterCopy = false;
3607 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3608 isScalarFPTypeInSSEReg(VA.getValVT())) {
3609 if (!Subtarget.hasX87())
3610 report_fatal_error("X87 register return with X87 disabled");
3611 CopyVT = MVT::f80;
3612 RoundAfterCopy = (CopyVT != VA.getLocVT());
3613 }
3614
3615 SDValue Val;
3616 if (VA.needsCustom()) {
3617      assert(VA.getValVT() == MVT::v64i1 &&
3618             "Currently the only custom case is when we split v64i1 to 2 regs");
3619 Val =
3620 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3621 } else {
3622 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3623 .getValue(1);
3624 Val = Chain.getValue(0);
3625 InGlue = Chain.getValue(2);
3626 }
3627
3628 if (RoundAfterCopy)
3629 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3630 // This truncation won't change the value.
3631 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3632
3633 if (VA.isExtInLoc()) {
3634 if (VA.getValVT().isVector() &&
3635 VA.getValVT().getScalarType() == MVT::i1 &&
3636 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3637 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3638 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3639 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3640 } else
3641 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3642 }
3643
3644 if (VA.getLocInfo() == CCValAssign::BCvt)
3645 Val = DAG.getBitcast(VA.getValVT(), Val);
3646
3647 InVals.push_back(Val);
3648 }
3649
3650 return Chain;
3651}
3652
3653//===----------------------------------------------------------------------===//
3654// C & StdCall & Fast Calling Convention implementation
3655//===----------------------------------------------------------------------===//
3656// The StdCall calling convention is standard for many Windows API
3657// routines. It differs from the C calling convention only slightly: the
3658// callee cleans up the stack rather than the caller, and symbols are also
3659// decorated in some fancy way :) It doesn't support any vector arguments.
3660// For info on fast calling convention see Fast Calling Convention (tail call)
3661// implementation LowerX86_32FastCCCallTo.
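// As a rough illustration (background, not code from this file): a 32-bit
// stdcall function such as
//   int __stdcall Foo(int a, int b);
// is conventionally emitted with a decorated symbol like _Foo@8, where the
// @8 suffix records the 8 argument bytes the callee pops with `ret 8`.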
3662
3663/// Determines whether Args, either a set of outgoing arguments to a call, or a
3664/// set of incoming args of a call, contains an sret pointer that the callee
3665/// pops
3666template <typename T>
3667static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3668 const X86Subtarget &Subtarget) {
3669 // Not C++20 (yet), so no concepts available.
3670 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3671 std::is_same_v<T, ISD::InputArg>,
3672 "requires ISD::OutputArg or ISD::InputArg");
3673
3674 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3675 // for most compilations.
3676 if (!Subtarget.is32Bit())
3677 return false;
3678
3679 if (Args.empty())
3680 return false;
3681
3682 // Most calls do not have an sret argument, check the arg next.
3683 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3684 if (!Flags.isSRet() || Flags.isInReg())
3685 return false;
3686
3687  // The MSVC ABI does not pop the sret.
3688 if (Subtarget.getTargetTriple().isOSMSVCRT())
3689 return false;
3690
3691 // MCUs don't pop the sret
3692 if (Subtarget.isTargetMCU())
3693 return false;
3694
3695 // Callee pops argument
3696 return true;
3697}
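// Illustrative consequence (background, not code from this file): when this
// returns true for a 32-bit callee that takes a hidden sret pointer, the
// callee pops that 4-byte pointer itself (e.g. with `ret $4`), which is why
// the argument lowering below uses a 4-byte callee-pop amount in that case.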
3698
3699/// Make a copy of an aggregate at address specified by "Src" to address
3700/// "Dst" with size and alignment information specified by the specific
3701/// parameter attribute. The copy will be passed as a byval function parameter.
3702static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3703 SDValue Chain, ISD::ArgFlagsTy Flags,
3704 SelectionDAG &DAG, const SDLoc &dl) {
3705 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3706
3707 return DAG.getMemcpy(
3708 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3709 /*isVolatile*/ false, /*AlwaysInline=*/true,
3710 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3711}
3712
3713/// Return true if the calling convention is one that we can guarantee TCO for.
3714static bool canGuaranteeTCO(CallingConv::ID CC) {
3715 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3716 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3717 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3718}
3719
3720/// Return true if we might ever do TCO for calls with this calling convention.
3721static bool mayTailCallThisCC(CallingConv::ID CC) {
3722 switch (CC) {
3723 // C calling conventions:
3724 case CallingConv::C:
3725 case CallingConv::Win64:
3726 case CallingConv::X86_64_SysV:
3727 // Callee pop conventions:
3728 case CallingConv::X86_ThisCall:
3729 case CallingConv::X86_StdCall:
3730 case CallingConv::X86_VectorCall:
3731 case CallingConv::X86_FastCall:
3732 // Swift:
3733 case CallingConv::Swift:
3734 return true;
3735 default:
3736 return canGuaranteeTCO(CC);
3737 }
3738}
3739
3740/// Return true if the function is being made into a tailcall target by
3741/// changing its ABI.
3742static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3743 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3744 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3745}
3746
3747bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3748 if (!CI->isTailCall())
3749 return false;
3750
3751 CallingConv::ID CalleeCC = CI->getCallingConv();
3752 if (!mayTailCallThisCC(CalleeCC))
3753 return false;
3754
3755 return true;
3756}
3757
3758SDValue
3759X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3760 const SmallVectorImpl<ISD::InputArg> &Ins,
3761 const SDLoc &dl, SelectionDAG &DAG,
3762 const CCValAssign &VA,
3763 MachineFrameInfo &MFI, unsigned i) const {
3764 // Create the nodes corresponding to a load from this parameter slot.
3765 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3766 bool AlwaysUseMutable = shouldGuaranteeTCO(
3767 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3768 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3769 EVT ValVT;
3770 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3771
3772  // If the value is passed by pointer, we have the address passed instead of the
3773  // value itself. No need to extend if the mask value and location share the same
3774 // absolute size.
3775 bool ExtendedInMem =
3776 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3777 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3778
3779 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3780 ValVT = VA.getLocVT();
3781 else
3782 ValVT = VA.getValVT();
3783
3784 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3785 // changed with more analysis.
3786  // In case of tail call optimization, mark all arguments mutable, since they
3787  // could be overwritten by the lowering of arguments in case of a tail call.
3788 if (Flags.isByVal()) {
3789 unsigned Bytes = Flags.getByValSize();
3790 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3791
3792 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3793 // can be improved with deeper analysis.
3794 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3795 /*isAliased=*/true);
3796 return DAG.getFrameIndex(FI, PtrVT);
3797 }
3798
3799 EVT ArgVT = Ins[i].ArgVT;
3800
3801 // If this is a vector that has been split into multiple parts, and the
3802  // scalar size of the parts doesn't match the vector element size, then we can't
3803 // elide the copy. The parts will have padding between them instead of being
3804 // packed like a vector.
3805 bool ScalarizedAndExtendedVector =
3806 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3807 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3808
3809 // This is an argument in memory. We might be able to perform copy elision.
3810 // If the argument is passed directly in memory without any extension, then we
3811 // can perform copy elision. Large vector types, for example, may be passed
3812 // indirectly by pointer.
3813 if (Flags.isCopyElisionCandidate() &&
3814 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3815 !ScalarizedAndExtendedVector) {
3816 SDValue PartAddr;
3817 if (Ins[i].PartOffset == 0) {
3818 // If this is a one-part value or the first part of a multi-part value,
3819 // create a stack object for the entire argument value type and return a
3820 // load from our portion of it. This assumes that if the first part of an
3821 // argument is in memory, the rest will also be in memory.
3822 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3823 /*IsImmutable=*/false);
3824 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3825 return DAG.getLoad(
3826 ValVT, dl, Chain, PartAddr,
3827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3828 } else {
3829 // This is not the first piece of an argument in memory. See if there is
3830 // already a fixed stack object including this offset. If so, assume it
3831 // was created by the PartOffset == 0 branch above and create a load from
3832 // the appropriate offset into it.
3833 int64_t PartBegin = VA.getLocMemOffset();
3834 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3835 int FI = MFI.getObjectIndexBegin();
3836 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3837 int64_t ObjBegin = MFI.getObjectOffset(FI);
3838 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3839 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3840 break;
3841 }
3842 if (MFI.isFixedObjectIndex(FI)) {
3843 SDValue Addr =
3844 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3845 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3846 return DAG.getLoad(
3847 ValVT, dl, Chain, Addr,
3848 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3849 Ins[i].PartOffset));
3850 }
3851 }
3852 }
3853
3854 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3855 VA.getLocMemOffset(), isImmutable);
3856
3857 // Set SExt or ZExt flag.
3858 if (VA.getLocInfo() == CCValAssign::ZExt) {
3859 MFI.setObjectZExt(FI, true);
3860 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3861 MFI.setObjectSExt(FI, true);
3862 }
3863
3864 MaybeAlign Alignment;
3865 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3866 ValVT != MVT::f80)
3867 Alignment = MaybeAlign(4);
3868 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3869 SDValue Val = DAG.getLoad(
3870 ValVT, dl, Chain, FIN,
3871 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3872 Alignment);
3873 return ExtendedInMem
3874 ? (VA.getValVT().isVector()
3875 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3876 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3877 : Val;
3878}
3879
3880// FIXME: Get this from tablegen.
3881static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3882 const X86Subtarget &Subtarget) {
3883  assert(Subtarget.is64Bit());
3884
3885 if (Subtarget.isCallingConvWin64(CallConv)) {
3886 static const MCPhysReg GPR64ArgRegsWin64[] = {
3887 X86::RCX, X86::RDX, X86::R8, X86::R9
3888 };
3889 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3890 }
3891
3892 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3893 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3894 };
3895 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3896}
3897
3898// FIXME: Get this from tablegen.
3899static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3900 CallingConv::ID CallConv,
3901 const X86Subtarget &Subtarget) {
3902  assert(Subtarget.is64Bit());
3903 if (Subtarget.isCallingConvWin64(CallConv)) {
3904 // The XMM registers which might contain var arg parameters are shadowed
3905    // by their paired GPRs, so we only need to save the GPRs to their home
3906 // slots.
3907 // TODO: __vectorcall will change this.
3908 return std::nullopt;
3909 }
3910
3911 bool isSoftFloat = Subtarget.useSoftFloat();
3912 if (isSoftFloat || !Subtarget.hasSSE1())
3913 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3914 // registers.
3915 return std::nullopt;
3916
3917 static const MCPhysReg XMMArgRegs64Bit[] = {
3918 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3919 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3920 };
3921 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3922}
3923
3924#ifndef NDEBUG
3925static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3926 return llvm::is_sorted(
3927 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3928 return A.getValNo() < B.getValNo();
3929 });
3930}
3931#endif
3932
3933namespace {
3934/// This is a helper class for lowering variable arguments parameters.
3935class VarArgsLoweringHelper {
3936public:
3937 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3938 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3939 CallingConv::ID CallConv, CCState &CCInfo)
3940 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3941 TheMachineFunction(DAG.getMachineFunction()),
3942 TheFunction(TheMachineFunction.getFunction()),
3943 FrameInfo(TheMachineFunction.getFrameInfo()),
3944 FrameLowering(*Subtarget.getFrameLowering()),
3945 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3946 CCInfo(CCInfo) {}
3947
3948 // Lower variable arguments parameters.
3949 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3950
3951private:
3952 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3953
3954 void forwardMustTailParameters(SDValue &Chain);
3955
3956 bool is64Bit() const { return Subtarget.is64Bit(); }
3957 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3958
3959 X86MachineFunctionInfo *FuncInfo;
3960 const SDLoc &DL;
3961 SelectionDAG &DAG;
3962 const X86Subtarget &Subtarget;
3963 MachineFunction &TheMachineFunction;
3964 const Function &TheFunction;
3965 MachineFrameInfo &FrameInfo;
3966 const TargetFrameLowering &FrameLowering;
3967 const TargetLowering &TargLowering;
3968 CallingConv::ID CallConv;
3969 CCState &CCInfo;
3970};
3971} // namespace
3972
3973void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3974 SDValue &Chain, unsigned StackSize) {
3975  // If the function takes a variable number of arguments, make a frame index for
3976 // the start of the first vararg value... for expansion of llvm.va_start. We
3977 // can skip this if there are no va_start calls.
3978 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3979 CallConv != CallingConv::X86_ThisCall)) {
3980 FuncInfo->setVarArgsFrameIndex(
3981 FrameInfo.CreateFixedObject(1, StackSize, true));
3982 }
3983
3984 // 64-bit calling conventions support varargs and register parameters, so we
3985 // have to do extra work to spill them in the prologue.
3986 if (is64Bit()) {
3987 // Find the first unallocated argument registers.
3988 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3989 ArrayRef<MCPhysReg> ArgXMMs =
3990 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3991 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3992 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3993
3994    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3995           "SSE register cannot be used when SSE is disabled!");
3996
3997 if (isWin64()) {
3998 // Get to the caller-allocated home save location. Add 8 to account
3999 // for the return address.
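      // Well-known Win64 layout this relies on (background note): at function
      // entry [rsp] holds the return address and the caller-allocated 32-byte
      // home area for RCX/RDX/R8/R9 starts at [rsp + 8], so register N's home
      // slot sits at HomeOffset + N * 8.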
4000 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4001 FuncInfo->setRegSaveFrameIndex(
4002 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4003 // Fixup to set vararg frame on shadow area (4 x i64).
4004 if (NumIntRegs < 4)
4005 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4006 } else {
4007 // For X86-64, if there are vararg parameters that are passed via
4008 // registers, then we must store them to their spots on the stack so
4009 // they may be loaded by dereferencing the result of va_next.
4010 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4011 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4012 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4013 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4014 }
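    // Background sketch of the register save area created above (SysV AMD64
    // ABI): 6 GPR slots (48 bytes) followed by 8 XMM slots (128 bytes), with
    // va_list's gp_offset advancing in 8-byte steps from 0 and fp_offset
    // starting at 48; for an integer argument, va_arg roughly does
    //   if (gp_offset < 48) { arg = *(reg_save_area + gp_offset); gp_offset += 8; }
    //   else                { arg = *overflow_arg_area; overflow_arg_area += 8; }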
4015
4016 SmallVector<SDValue, 6>
4017 LiveGPRs; // list of SDValue for GPR registers keeping live input value
4018 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4019 // keeping live input value
4020 SDValue ALVal; // if applicable keeps SDValue for %al register
4021
4022 // Gather all the live in physical registers.
4023 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4024 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4025 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4026 }
4027 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4028 if (!AvailableXmms.empty()) {
4029 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4030 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4031 for (MCPhysReg Reg : AvailableXmms) {
4032        // FastRegisterAllocator spills virtual registers at basic
4033        // block boundaries. That leads to uses of XMM registers
4034        // outside of the check for %al. Pass physical registers to
4035        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4036 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4037 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4038 }
4039 }
4040
4041 // Store the integer parameter registers.
4042 SmallVector<SDValue, 8> MemOps;
4043 SDValue RSFIN =
4044 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4045 TargLowering.getPointerTy(DAG.getDataLayout()));
4046 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4047 for (SDValue Val : LiveGPRs) {
4048 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4049 TargLowering.getPointerTy(DAG.getDataLayout()),
4050 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4051 SDValue Store =
4052 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4053 MachinePointerInfo::getFixedStack(
4054 DAG.getMachineFunction(),
4055 FuncInfo->getRegSaveFrameIndex(), Offset));
4056 MemOps.push_back(Store);
4057 Offset += 8;
4058 }
4059
4060 // Now store the XMM (fp + vector) parameter registers.
4061 if (!LiveXMMRegs.empty()) {
4062 SmallVector<SDValue, 12> SaveXMMOps;
4063 SaveXMMOps.push_back(Chain);
4064 SaveXMMOps.push_back(ALVal);
4065 SaveXMMOps.push_back(RSFIN);
4066 SaveXMMOps.push_back(
4067 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4068 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4069 MachineMemOperand *StoreMMO =
4070 DAG.getMachineFunction().getMachineMemOperand(
4071 MachinePointerInfo::getFixedStack(
4072 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4073 Offset),
4074 MachineMemOperand::MOStore, 128, Align(16));
4075 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4076 DL, DAG.getVTList(MVT::Other),
4077 SaveXMMOps, MVT::i8, StoreMMO));
4078 }
4079
4080 if (!MemOps.empty())
4081 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4082 }
4083}
4084
4085void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4086 // Find the largest legal vector type.
4087 MVT VecVT = MVT::Other;
4088 // FIXME: Only some x86_32 calling conventions support AVX512.
4089 if (Subtarget.useAVX512Regs() &&
4090 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4091 CallConv == CallingConv::Intel_OCL_BI)))
4092 VecVT = MVT::v16f32;
4093 else if (Subtarget.hasAVX())
4094 VecVT = MVT::v8f32;
4095 else if (Subtarget.hasSSE2())
4096 VecVT = MVT::v4f32;
4097
4098 // We forward some GPRs and some vector types.
4099 SmallVector<MVT, 2> RegParmTypes;
4100 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4101 RegParmTypes.push_back(IntVT);
4102 if (VecVT != MVT::Other)
4103 RegParmTypes.push_back(VecVT);
4104
4105 // Compute the set of forwarded registers. The rest are scratch.
4106 SmallVectorImpl<ForwardedRegister> &Forwards =
4107 FuncInfo->getForwardedMustTailRegParms();
4108 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4109
4110 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4111 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4112 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4113 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4114 }
4115
4116 // Copy all forwards from physical to virtual registers.
4117 for (ForwardedRegister &FR : Forwards) {
4118 // FIXME: Can we use a less constrained schedule?
4119 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4120 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4121 TargLowering.getRegClassFor(FR.VT));
4122 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4123 }
4124}
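// Illustrative use (background note): a musttail call inside a variadic
// function forwards the caller's "..." arguments, so every register that may
// carry a variadic argument (including %al on SysV x86-64) must still hold
// its incoming value at the tail call; the forwarded-register copies above
// provide exactly that.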
4125
4126void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4127 unsigned StackSize) {
4128  // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4129  // If necessary, it will be set to the correct value later.
4130 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4131 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4132
4133 if (FrameInfo.hasVAStart())
4134 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4135
4136 if (FrameInfo.hasMustTailInVarArgFunc())
4137 forwardMustTailParameters(Chain);
4138}
4139
4140SDValue X86TargetLowering::LowerFormalArguments(
4141 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4142 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4143 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4144 MachineFunction &MF = DAG.getMachineFunction();
4145 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4146
4147 const Function &F = MF.getFunction();
4148 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4149 F.getName() == "main")
4150 FuncInfo->setForceFramePointer(true);
4151
4152 MachineFrameInfo &MFI = MF.getFrameInfo();
4153 bool Is64Bit = Subtarget.is64Bit();
4154 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4155
4156  assert(
4157      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4158      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4159
4160 // Assign locations to all of the incoming arguments.
4161 SmallVector<CCValAssign, 16> ArgLocs;
4162 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4163
4164 // Allocate shadow area for Win64.
4165 if (IsWin64)
4166 CCInfo.AllocateStack(32, Align(8));
4167
4168 CCInfo.AnalyzeArguments(Ins, CC_X86);
4169
4170 // In vectorcall calling convention a second pass is required for the HVA
4171 // types.
4172 if (CallingConv::X86_VectorCall == CallConv) {
4173 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4174 }
4175
4176  // The next loop assumes that the locations are in the same order as the
4177  // input arguments.
4178  assert(isSortedByValueNo(ArgLocs) &&
4179         "Argument Location list must be sorted before lowering");
4180
4181 SDValue ArgValue;
4182 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4183 ++I, ++InsIndex) {
4184    assert(InsIndex < Ins.size() && "Invalid Ins index");
4185 CCValAssign &VA = ArgLocs[I];
4186
4187 if (VA.isRegLoc()) {
4188 EVT RegVT = VA.getLocVT();
4189 if (VA.needsCustom()) {
4190        assert(
4191            VA.getValVT() == MVT::v64i1 &&
4192            "Currently the only custom case is when we split v64i1 to 2 regs");
4193
4194 // v64i1 values, in regcall calling convention, that are
4195 // compiled to 32 bit arch, are split up into two registers.
4196 ArgValue =
4197 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4198 } else {
4199 const TargetRegisterClass *RC;
4200 if (RegVT == MVT::i8)
4201 RC = &X86::GR8RegClass;
4202 else if (RegVT == MVT::i16)
4203 RC = &X86::GR16RegClass;
4204 else if (RegVT == MVT::i32)
4205 RC = &X86::GR32RegClass;
4206 else if (Is64Bit && RegVT == MVT::i64)
4207 RC = &X86::GR64RegClass;
4208 else if (RegVT == MVT::f16)
4209 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4210 else if (RegVT == MVT::f32)
4211 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4212 else if (RegVT == MVT::f64)
4213 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4214 else if (RegVT == MVT::f80)
4215 RC = &X86::RFP80RegClass;
4216 else if (RegVT == MVT::f128)
4217 RC = &X86::VR128RegClass;
4218 else if (RegVT.is512BitVector())
4219 RC = &X86::VR512RegClass;
4220 else if (RegVT.is256BitVector())
4221 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4222 else if (RegVT.is128BitVector())
4223 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4224 else if (RegVT == MVT::x86mmx)
4225 RC = &X86::VR64RegClass;
4226 else if (RegVT == MVT::v1i1)
4227 RC = &X86::VK1RegClass;
4228 else if (RegVT == MVT::v8i1)
4229 RC = &X86::VK8RegClass;
4230 else if (RegVT == MVT::v16i1)
4231 RC = &X86::VK16RegClass;
4232 else if (RegVT == MVT::v32i1)
4233 RC = &X86::VK32RegClass;
4234 else if (RegVT == MVT::v64i1)
4235 RC = &X86::VK64RegClass;
4236 else
4237        llvm_unreachable("Unknown argument type!");
4238
4239 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4240 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4241 }
4242
4243 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4244 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4245 // right size.
4246 if (VA.getLocInfo() == CCValAssign::SExt)
4247 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4248 DAG.getValueType(VA.getValVT()));
4249 else if (VA.getLocInfo() == CCValAssign::ZExt)
4250 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4251 DAG.getValueType(VA.getValVT()));
4252 else if (VA.getLocInfo() == CCValAssign::BCvt)
4253 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4254
4255 if (VA.isExtInLoc()) {
4256 // Handle MMX values passed in XMM regs.
4257 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4258 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4259 else if (VA.getValVT().isVector() &&
4260 VA.getValVT().getScalarType() == MVT::i1 &&
4261 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4262 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4263 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4264 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4265 } else
4266 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4267 }
4268 } else {
4269      assert(VA.isMemLoc());
4270 ArgValue =
4271 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4272 }
4273
4274    // If the value is passed via a pointer, do a load.
4275 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4276 ArgValue =
4277 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4278
4279 InVals.push_back(ArgValue);
4280 }
4281
4282 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4283 if (Ins[I].Flags.isSwiftAsync()) {
4284 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4285 if (Subtarget.is64Bit())
4286 X86FI->setHasSwiftAsyncContext(true);
4287 else {
4288 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4289 X86FI->setSwiftAsyncContextFrameIdx(FI);
4290 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4291 DAG.getFrameIndex(FI, MVT::i32),
4292 MachinePointerInfo::getFixedStack(MF, FI));
4293 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4294 }
4295 }
4296
4297 // Swift calling convention does not require we copy the sret argument
4298 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4299 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4300 continue;
4301
4302 // All x86 ABIs require that for returning structs by value we copy the
4303 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4304 // the argument into a virtual register so that we can access it from the
4305 // return points.
4306 if (Ins[I].Flags.isSRet()) {
4307      assert(!FuncInfo->getSRetReturnReg() &&
4308             "SRet return has already been set");
4309 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4310 Register Reg =
4311 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4312 FuncInfo->setSRetReturnReg(Reg);
4313 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4314 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4315 break;
4316 }
4317 }
4318
4319 unsigned StackSize = CCInfo.getNextStackOffset();
4320 // Align stack specially for tail calls.
4321 if (shouldGuaranteeTCO(CallConv,
4322 MF.getTarget().Options.GuaranteedTailCallOpt))
4323 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4324
4325 if (IsVarArg)
4326 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4327 .lowerVarArgsParameters(Chain, StackSize);
4328
4329 // Some CCs need callee pop.
4330 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4331 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4332 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4333 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4334 // X86 interrupts must pop the error code (and the alignment padding) if
4335 // present.
4336 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4337 } else {
4338 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4339 // If this is an sret function, the return should pop the hidden pointer.
4340 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4341 FuncInfo->setBytesToPopOnReturn(4);
4342 }
4343
4344 if (!Is64Bit) {
4345 // RegSaveFrameIndex is X86-64 only.
4346 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4347 }
4348
4349 FuncInfo->setArgumentStackSize(StackSize);
4350
4351 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4352 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4353 if (Personality == EHPersonality::CoreCLR) {
4354      assert(Is64Bit);
4355 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4356 // that we'd prefer this slot be allocated towards the bottom of the frame
4357 // (i.e. near the stack pointer after allocating the frame). Every
4358 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4359 // offset from the bottom of this and each funclet's frame must be the
4360 // same, so the size of funclets' (mostly empty) frames is dictated by
4361 // how far this slot is from the bottom (since they allocate just enough
4362 // space to accommodate holding this slot at the correct offset).
4363 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4364 EHInfo->PSPSymFrameIdx = PSPSymFI;
4365 }
4366 }
4367
4368 if (shouldDisableArgRegFromCSR(CallConv) ||
4369 F.hasFnAttribute("no_caller_saved_registers")) {
4370 MachineRegisterInfo &MRI = MF.getRegInfo();
4371 for (std::pair<Register, Register> Pair : MRI.liveins())
4372 MRI.disableCalleeSavedRegister(Pair.first);
4373 }
4374
4375 return Chain;
4376}
4377
4378SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4379 SDValue Arg, const SDLoc &dl,
4380 SelectionDAG &DAG,
4381 const CCValAssign &VA,
4382 ISD::ArgFlagsTy Flags,
4383 bool isByVal) const {
4384 unsigned LocMemOffset = VA.getLocMemOffset();
4385 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4386 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4387 StackPtr, PtrOff);
4388 if (isByVal)
4389 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4390
4391 MaybeAlign Alignment;
4392 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4393 Arg.getSimpleValueType() != MVT::f80)
4394 Alignment = MaybeAlign(4);
4395 return DAG.getStore(
4396 Chain, dl, Arg, PtrOff,
4397 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4398 Alignment);
4399}
4400
4401/// Emit a load of the return address if tail call
4402/// optimization is performed and it is required.
4403SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4404 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4405 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4406 // Adjust the Return address stack slot.
4407 EVT VT = getPointerTy(DAG.getDataLayout());
4408 OutRetAddr = getReturnAddressFrameIndex(DAG);
4409
4410 // Load the "old" Return address.
4411 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4412 return SDValue(OutRetAddr.getNode(), 1);
4413}
4414
4415/// Emit a store of the return address if tail call
4416/// optimization is performed and it is required (FPDiff!=0).
4417static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4418 SDValue Chain, SDValue RetAddrFrIdx,
4419 EVT PtrVT, unsigned SlotSize,
4420 int FPDiff, const SDLoc &dl) {
4421 // Store the return address to the appropriate stack slot.
4422 if (!FPDiff) return Chain;
4423 // Calculate the new stack slot for the return address.
4424 int NewReturnAddrFI =
4425 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4426 false);
4427 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4428 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4429 MachinePointerInfo::getFixedStack(
4430 DAG.getMachineFunction(), NewReturnAddrFI));
4431 return Chain;
4432}
4433
4434/// Returns a vector_shuffle mask for a movs{s|d} or movd
4435/// operation of the specified width.
4436static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4437 SDValue V2) {
4438 unsigned NumElems = VT.getVectorNumElements();
4439 SmallVector<int, 8> Mask;
4440 Mask.push_back(NumElems);
4441 for (unsigned i = 1; i != NumElems; ++i)
4442 Mask.push_back(i);
4443 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4444}
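// For example, with VT = v4f32 the mask built above is <4, 1, 2, 3>: lane 0
// is taken from V2 and lanes 1-3 from V1, matching the movss-style
// "insert low element" behaviour callers of getMOVL expect.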
4445
4446SDValue
4447X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4448 SmallVectorImpl<SDValue> &InVals) const {
4449 SelectionDAG &DAG = CLI.DAG;
4450 SDLoc &dl = CLI.DL;
4451 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4452 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4453 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4454 SDValue Chain = CLI.Chain;
4455 SDValue Callee = CLI.Callee;
4456 CallingConv::ID CallConv = CLI.CallConv;
4457 bool &isTailCall = CLI.IsTailCall;
4458 bool isVarArg = CLI.IsVarArg;
4459 const auto *CB = CLI.CB;
4460
4461 MachineFunction &MF = DAG.getMachineFunction();
4462 bool Is64Bit = Subtarget.is64Bit();
4463 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4464 bool IsSibcall = false;
4465 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4466 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4467 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4468 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4469 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4470 CB->hasFnAttr("no_caller_saved_registers"));
4471 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4472 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4473 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4474 const Module *M = MF.getMMI().getModule();
4475 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4476
4477 MachineFunction::CallSiteInfo CSInfo;
4478 if (CallConv == CallingConv::X86_INTR)
4479 report_fatal_error("X86 interrupts may not be called directly");
4480
4481 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4482 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4483 // If we are using a GOT, disable tail calls to external symbols with
4484 // default visibility. Tail calling such a symbol requires using a GOT
4485 // relocation, which forces early binding of the symbol. This breaks code
4486    // that requires lazy function symbol resolution. Using musttail or
4487 // GuaranteedTailCallOpt will override this.
4488 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4489 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4490 G->getGlobal()->hasDefaultVisibility()))
4491 isTailCall = false;
4492 }
4493
4494 if (isTailCall && !IsMustTail) {
4495 // Check if it's really possible to do a tail call.
4496 isTailCall = IsEligibleForTailCallOptimization(
4497 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4498 Ins, DAG);
4499
4500 // Sibcalls are automatically detected tailcalls which do not require
4501 // ABI changes.
4502 if (!IsGuaranteeTCO && isTailCall)
4503 IsSibcall = true;
4504
4505 if (isTailCall)
4506 ++NumTailCalls;
4507 }
4508
4509 if (IsMustTail && !isTailCall)
4510 report_fatal_error("failed to perform tail call elimination on a call "
4511 "site marked musttail");
4512
4513  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4514         "Var args not supported with calling convention fastcc, ghc or hipe");
4516 // Analyze operands of the call, assigning locations to each operand.
4517 SmallVector<CCValAssign, 16> ArgLocs;
4518 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4519
4520 // Allocate shadow area for Win64.
4521 if (IsWin64)
4522 CCInfo.AllocateStack(32, Align(8));
4523
4524 CCInfo.AnalyzeArguments(Outs, CC_X86);
4525
4526 // In vectorcall calling convention a second pass is required for the HVA
4527 // types.
4528 if (CallingConv::X86_VectorCall == CallConv) {
4529 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4530 }
4531
4532 // Get a count of how many bytes are to be pushed on the stack.
4533 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4534 if (IsSibcall)
4535 // This is a sibcall. The memory operands are available in caller's
4536 // own caller's stack.
4537 NumBytes = 0;
4538 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4539 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4540
4541 int FPDiff = 0;
4542 if (isTailCall &&
4543 shouldGuaranteeTCO(CallConv,
4544 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4545 // Lower arguments at fp - stackoffset + fpdiff.
4546 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4547
4548 FPDiff = NumBytesCallerPushed - NumBytes;
4549
4550 // Set the delta of movement of the returnaddr stackslot.
4551 // But only set if delta is greater than previous delta.
4552 if (FPDiff < X86Info->getTCReturnAddrDelta())
4553 X86Info->setTCReturnAddrDelta(FPDiff);
4554 }
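  // Worked example with illustrative numbers: if the caller pops 16 bytes of
  // its own incoming arguments but this tail call needs 32 bytes, FPDiff is
  // -16; the callee's argument area extends past the caller's, so the return
  // address slot is re-stored at the adjusted offset by
  // EmitTailCallStoreRetAddr further below.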
4555
4556 unsigned NumBytesToPush = NumBytes;
4557 unsigned NumBytesToPop = NumBytes;
4558
4559 // If we have an inalloca argument, all stack space has already been allocated
4560  // for us and is right at the top of the stack. We don't support multiple
4561 // arguments passed in memory when using inalloca.
4562 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4563 NumBytesToPush = 0;
4564 if (!ArgLocs.back().isMemLoc())
4565 report_fatal_error("cannot use inalloca attribute on a register "
4566 "parameter");
4567 if (ArgLocs.back().getLocMemOffset() != 0)
4568 report_fatal_error("any parameter with the inalloca attribute must be "
4569 "the only memory argument");
4570 } else if (CLI.IsPreallocated) {
4571    assert(ArgLocs.back().isMemLoc() &&
4572           "cannot use preallocated attribute on a register "
4573           "parameter");
4574 SmallVector<size_t, 4> PreallocatedOffsets;
4575 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4576 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4577 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4578 }
4579 }
4580 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4581 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4582 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4583 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4584 NumBytesToPush = 0;
4585 }
4586
4587 if (!IsSibcall && !IsMustTail)
4588 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4589 NumBytes - NumBytesToPush, dl);
4590
4591 SDValue RetAddrFrIdx;
4592 // Load return address for tail calls.
4593 if (isTailCall && FPDiff)
4594 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4595 Is64Bit, FPDiff, dl);
4596
4597 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4598 SmallVector<SDValue, 8> MemOpChains;
4599 SDValue StackPtr;
4600
4601  // The next loop assumes that the locations are in the same order as the
4602  // input arguments.
4603  assert(isSortedByValueNo(ArgLocs) &&
4604         "Argument Location list must be sorted before lowering");
4605
4606 // Walk the register/memloc assignments, inserting copies/loads. In the case
4607  // of tail call optimization, arguments are handled later.
4608 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4609 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4610 ++I, ++OutIndex) {
4611    assert(OutIndex < Outs.size() && "Invalid Out index");
4612 // Skip inalloca/preallocated arguments, they have already been written.
4613 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4614 if (Flags.isInAlloca() || Flags.isPreallocated())
4615 continue;
4616
4617 CCValAssign &VA = ArgLocs[I];
4618 EVT RegVT = VA.getLocVT();
4619 SDValue Arg = OutVals[OutIndex];
4620 bool isByVal = Flags.isByVal();
4621
4622 // Promote the value if needed.
4623 switch (VA.getLocInfo()) {
4624    default: llvm_unreachable("Unknown loc info!");
4625 case CCValAssign::Full: break;
4626 case CCValAssign::SExt:
4627 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4628 break;
4629 case CCValAssign::ZExt:
4630 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4631 break;
4632 case CCValAssign::AExt:
4633 if (Arg.getValueType().isVector() &&
4634 Arg.getValueType().getVectorElementType() == MVT::i1)
4635 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4636 else if (RegVT.is128BitVector()) {
4637 // Special case: passing MMX values in XMM registers.
4638 Arg = DAG.getBitcast(MVT::i64, Arg);
4639 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4640 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4641 } else
4642 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4643 break;
4644 case CCValAssign::BCvt:
4645 Arg = DAG.getBitcast(RegVT, Arg);
4646 break;
4647 case CCValAssign::Indirect: {
4648 if (isByVal) {
4649 // Memcpy the argument to a temporary stack slot to prevent
4650 // the caller from seeing any modifications the callee may make
4651 // as guaranteed by the `byval` attribute.
4652 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4653 Flags.getByValSize(),
4654 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4655 SDValue StackSlot =
4656 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4657 Chain =
4658 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4659 // From now on treat this as a regular pointer
4660 Arg = StackSlot;
4661 isByVal = false;
4662 } else {
4663 // Store the argument.
4664 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4665 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4666 Chain = DAG.getStore(
4667 Chain, dl, Arg, SpillSlot,
4668 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4669 Arg = SpillSlot;
4670 }
4671 break;
4672 }
4673 }
4674
4675 if (VA.needsCustom()) {
4676      assert(VA.getValVT() == MVT::v64i1 &&
4677             "Currently the only custom case is when we split v64i1 to 2 regs");
4678 // Split v64i1 value into two registers
4679 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4680 } else if (VA.isRegLoc()) {
4681 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4682 const TargetOptions &Options = DAG.getTarget().Options;
4683 if (Options.EmitCallSiteInfo)
4684 CSInfo.emplace_back(VA.getLocReg(), I);
4685 if (isVarArg && IsWin64) {
4686        // The Win64 ABI requires an argument XMM reg to be copied to the
4687        // corresponding shadow reg if the callee is a varargs function.
4688 Register ShadowReg;
4689 switch (VA.getLocReg()) {
4690 case X86::XMM0: ShadowReg = X86::RCX; break;
4691 case X86::XMM1: ShadowReg = X86::RDX; break;
4692 case X86::XMM2: ShadowReg = X86::R8; break;
4693 case X86::XMM3: ShadowReg = X86::R9; break;
4694 }
4695 if (ShadowReg)
4696 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4697 }
4698 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4699      assert(VA.isMemLoc());
4700 if (!StackPtr.getNode())
4701 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4702 getPointerTy(DAG.getDataLayout()));
4703 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4704 dl, DAG, VA, Flags, isByVal));
4705 }
4706 }
4707
4708 if (!MemOpChains.empty())
4709 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4710
4711 if (Subtarget.isPICStyleGOT()) {
4712    // ELF / PIC requires the GOT to be in the EBX register before function
4713    // calls via the PLT GOT pointer (except for regcall).
4714 if (!isTailCall) {
4715      // An indirect call with the RegCall calling convention may use up all the
4716      // general registers, so it is not suitable to bind the EBX register for the
4717      // GOT address; just let the register allocator handle it.
4718 if (CallConv != CallingConv::X86_RegCall)
4719 RegsToPass.push_back(std::make_pair(
4720 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4721 getPointerTy(DAG.getDataLayout()))));
4722 } else {
4723 // If we are tail calling and generating PIC/GOT style code load the
4724 // address of the callee into ECX. The value in ecx is used as target of
4725 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4726 // for tail calls on PIC/GOT architectures. Normally we would just put the
4727 // address of GOT into ebx and then call target@PLT. But for tail calls
4728 // ebx would be restored (since ebx is callee saved) before jumping to the
4729 // target@PLT.
4730
4731 // Note: The actual moving to ECX is done further down.
4732 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4733 if (G && !G->getGlobal()->hasLocalLinkage() &&
4734 G->getGlobal()->hasDefaultVisibility())
4735 Callee = LowerGlobalAddress(Callee, DAG);
4736 else if (isa<ExternalSymbolSDNode>(Callee))
4737 Callee = LowerExternalSymbol(Callee, DAG);
4738 }
4739 }
4740
4741 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4742 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4743 // From AMD64 ABI document:
4744 // For calls that may call functions that use varargs or stdargs
4745 // (prototype-less calls or calls to functions containing ellipsis (...) in
4746    // the declaration), %al is used as a hidden argument to specify the number
4747    // of SSE registers used. The contents of %al do not need to match exactly
4748    // the number of registers, but must be an upper bound on the number of SSE
4749    // registers used and must be in the range 0 - 8 inclusive.
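    // Illustrative caller-side effect (background note): for a call like
    // printf("%f\n", x) this lowering ends up emitting something like
    // `movb $1, %al` before the call, since exactly one SSE register (xmm0)
    // carries a variadic argument.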
4750
4751 // Count the number of XMM registers allocated.
4752 static const MCPhysReg XMMArgRegs[] = {
4753 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4754 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4755 };
4756 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4757    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4758           && "SSE registers cannot be used when SSE is disabled");
4759 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4760 DAG.getConstant(NumXMMRegs, dl,
4761 MVT::i8)));
4762 }
4763
4764 if (isVarArg && IsMustTail) {
4765 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4766 for (const auto &F : Forwards) {
4767 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4768 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4769 }
4770 }
4771
4772 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4773 // don't need this because the eligibility check rejects calls that require
4774 // shuffling arguments passed in memory.
4775 if (!IsSibcall && isTailCall) {
4776 // Force all the incoming stack arguments to be loaded from the stack
4777 // before any new outgoing arguments are stored to the stack, because the
4778 // outgoing stack slots may alias the incoming argument stack slots, and
4779 // the alias isn't otherwise explicit. This is slightly more conservative
4780 // than necessary, because it means that each store effectively depends
4781 // on every argument instead of just those arguments it would clobber.
4782 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4783
4784 SmallVector<SDValue, 8> MemOpChains2;
4785 SDValue FIN;
4786 int FI = 0;
4787 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4788 ++I, ++OutsIndex) {
4789 CCValAssign &VA = ArgLocs[I];
4790
4791 if (VA.isRegLoc()) {
4792 if (VA.needsCustom()) {
4793          assert((CallConv == CallingConv::X86_RegCall) &&
4794                 "Expecting custom case only in regcall calling convention");
4795 // This means that we are in the special case where one argument was
4796 // passed through two register locations - skip the next location.
4797 ++I;
4798 }
4799
4800 continue;
4801 }
4802
4803 assert(VA.isMemLoc());
4804 SDValue Arg = OutVals[OutsIndex];
4805 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4806 // Skip inalloca/preallocated arguments. They don't require any work.
4807 if (Flags.isInAlloca() || Flags.isPreallocated())
4808 continue;
4809 // Create frame index.
4810 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4811 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4812 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4813 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4814
4815 if (Flags.isByVal()) {
4816 // Copy relative to framepointer.
4817 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4818 if (!StackPtr.getNode())
4819 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4820 getPointerTy(DAG.getDataLayout()));
4821 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4822 StackPtr, Source);
4823
4824 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4825 ArgChain,
4826 Flags, DAG, dl));
4827 } else {
4828 // Store relative to framepointer.
4829 MemOpChains2.push_back(DAG.getStore(
4830 ArgChain, dl, Arg, FIN,
4831 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4832 }
4833 }
4834
4835 if (!MemOpChains2.empty())
4836 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4837
4838 // Store the return address to the appropriate stack slot.
4839 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4840 getPointerTy(DAG.getDataLayout()),
4841 RegInfo->getSlotSize(), FPDiff, dl);
4842 }
4843
4844 // Build a sequence of copy-to-reg nodes chained together with token chain
4845 // and glue operands which copy the outgoing args into registers.
4846 SDValue InGlue;
4847 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4848 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4849 RegsToPass[i].second, InGlue);
4850 InGlue = Chain.getValue(1);
4851 }
4852
4853 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4854 assert(Is64Bit && "Large code model is only legal in 64-bit mode.")(static_cast <bool> (Is64Bit && "Large code model is only legal in 64-bit mode."
) ? void (0) : __assert_fail ("Is64Bit && \"Large code model is only legal in 64-bit mode.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4854, __extension__
__PRETTY_FUNCTION__))
;
4855 // In the 64-bit large code model, we have to make all calls
4856 // through a register, since the call instruction's 32-bit
4857 // pc-relative offset may not be large enough to hold the whole
4858 // address.
4859 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4860 Callee->getOpcode() == ISD::ExternalSymbol) {
4861 // Lower direct calls to global addresses and external symbols. Setting
4862 // ForCall to true here has the effect of removing WrapperRIP when possible
4863 // to allow direct calls to be selected without first materializing the
4864 // address into a register.
4865 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4866 } else if (Subtarget.isTarget64BitILP32() &&
4867 Callee.getValueType() == MVT::i32) {
4868 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4869 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4870 }
4871
4872 // Returns a chain & a glue for retval copy to use.
4873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4874 SmallVector<SDValue, 8> Ops;
4875
4876 if (!IsSibcall && isTailCall && !IsMustTail) {
4877 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4878 InGlue = Chain.getValue(1);
4879 }
4880
4881 Ops.push_back(Chain);
4882 Ops.push_back(Callee);
4883
4884 if (isTailCall)
4885 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4886
4887 // Add argument registers to the end of the list so that they are known live
4888 // into the call.
4889 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4890 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4891 RegsToPass[i].second.getValueType()));
4892
4893 // Add a register mask operand representing the call-preserved registers.
4894 const uint32_t *Mask = [&]() {
4895 auto AdaptedCC = CallConv;
4896 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4897 // use X86_INTR calling convention because it has the same CSR mask
4898 // (same preserved registers).
4899 if (HasNCSR)
4900 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4901 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4902 // to use the CSR_NoRegs_RegMask.
4903 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4904 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4905 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4906 }();
4907 assert(Mask && "Missing call preserved mask for calling convention")(static_cast <bool> (Mask && "Missing call preserved mask for calling convention"
) ? void (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4907, __extension__
__PRETTY_FUNCTION__))
;
4908
4909 // If this is an invoke in a 32-bit function using a funclet-based
4910 // personality, assume the function clobbers all registers. If an exception
4911 // is thrown, the runtime will not restore CSRs.
4912 // FIXME: Model this more precisely so that we can register allocate across
4913 // the normal edge and spill and fill across the exceptional edge.
4914 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4915 const Function &CallerFn = MF.getFunction();
4916 EHPersonality Pers =
4917 CallerFn.hasPersonalityFn()
4918 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4919 : EHPersonality::Unknown;
4920 if (isFuncletEHPersonality(Pers))
4921 Mask = RegInfo->getNoPreservedMask();
4922 }
4923
4924 // Define a new register mask from the existing mask.
4925 uint32_t *RegMask = nullptr;
4926
4927 // In some calling conventions we need to remove the used physical registers
4928 // from the reg mask. Create a new RegMask for such calling conventions.
4929 // RegMask for calling conventions that disable only return registers (e.g.
4930 // preserve_most) will be modified later in LowerCallResult.
4931 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4932 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4933 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4934
4935 // Allocate a new Reg Mask and copy Mask.
4936 RegMask = MF.allocateRegMask();
4937 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4938 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4939
4940 // Make sure all sub registers of the argument registers are reset
4941 // in the RegMask.
4942 if (ShouldDisableArgRegs) {
4943 for (auto const &RegPair : RegsToPass)
4944 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4945 SubRegs.isValid(); ++SubRegs)
4946 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4947 }
4948
4949 // Create the RegMask Operand according to our updated mask.
4950 Ops.push_back(DAG.getRegisterMask(RegMask));
4951 } else {
4952 // Create the RegMask Operand according to the static mask.
4953 Ops.push_back(DAG.getRegisterMask(Mask));
4954 }
4955
4956 if (InGlue.getNode())
4957 Ops.push_back(InGlue);
4958
4959 if (isTailCall) {
4960 // We used to do:
4961 //// If this is the first return lowered for this function, add the regs
4962 //// to the liveout set for the function.
4963 // This isn't right, although it's probably harmless on x86; liveouts
4964 // should be computed from returns not tail calls. Consider a void
4965 // function making a tail call to a function returning int.
4966 MF.getFrameInfo().setHasTailCall();
4967 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4968
4969 if (IsCFICall)
4970 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4971
4972 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4973 return Ret;
4974 }
4975
4976 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4977 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4978 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4979 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4980 // expanded to the call, directly followed by a special marker sequence and
4981 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4982 assert(!isTailCall &&
4983 "tail calls cannot be marked with clang.arc.attachedcall");
4984 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode")(static_cast <bool> (Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"
) ? void (0) : __assert_fail ("Is64Bit && \"clang.arc.attachedcall is only supported in 64bit mode\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4984, __extension__
__PRETTY_FUNCTION__))
;
4985
4986 // Add a target global address for the retainRV/claimRV runtime function
4987 // just before the call target.
4988 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4989 auto PtrVT = getPointerTy(DAG.getDataLayout());
4990 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4991 Ops.insert(Ops.begin() + 1, GA);
4992 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4993 } else {
4994 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4995 }
4996
4997 if (IsCFICall)
4998 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4999
5000 InGlue = Chain.getValue(1);
5001 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5002 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5003
5004 // Save heapallocsite metadata.
5005 if (CLI.CB)
5006 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5007 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5008
5009 // Create the CALLSEQ_END node.
5010 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5011 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5012 DAG.getTarget().Options.GuaranteedTailCallOpt))
5013 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5014 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5015 // If this call passes a struct-return pointer, the callee
5016 // pops that struct pointer.
5017 NumBytesForCalleeToPop = 4;
5018
5019 // Returns a glue for retval copy to use.
5020 if (!IsSibcall) {
5021 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5022 InGlue, dl);
5023 InGlue = Chain.getValue(1);
5024 }
5025
5026 // Handle result values, copying them out of physregs into vregs that we
5027 // return.
5028 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5029 InVals, RegMask);
5030}
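// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The NumBytesForCalleeToPop == 4 case above corresponds to a 32-bit call that
// returns a large aggregate through a hidden sret pointer; on targets where
// the callee pops that pointer, the caller must account for those 4 bytes
// (assumptions: i386 SysV-style sret handling, illustrative C++ only).
struct BigResult { int v[8]; };
BigResult produce();                       // receives a hidden sret pointer argument
int consume() { return produce().v[0]; }   // caller leaves the 4-byte pop to the callee
// --- end sketch ---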
5031
5032//===----------------------------------------------------------------------===//
5033// Fast Calling Convention (tail call) implementation
5034//===----------------------------------------------------------------------===//
5035
5036 // Like stdcall (the callee cleans up the arguments), except that ECX is
5037 // reserved for storing the tail-called function address. Only 2 registers
5038 // are free for argument passing (inreg). Tail call optimization is performed
5039// provided:
5040// * tailcallopt is enabled
5041// * caller/callee are fastcc
5042// On X86_64 architecture with GOT-style position independent code only local
5043// (within module) calls are supported at the moment.
5044 // To keep the stack aligned according to the platform ABI, the function
5045 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5046 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
5047// If a tail called function callee has more arguments than the caller the
5048// caller needs to make sure that there is room to move the RETADDR to. This is
5049// achieved by reserving an area the size of the argument delta right after the
5050// original RETADDR, but before the saved framepointer or the spilled registers
5051// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5052// stack layout:
5053// arg1
5054// arg2
5055// RETADDR
5056// [ new RETADDR
5057// move area ]
5058// (possible EBP)
5059// ESI
5060// EDI
5061// local1 ..
5062
5063 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
5064 /// requirement.
5065unsigned
5066X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5067 SelectionDAG &DAG) const {
5068 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5069 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5070 assert(StackSize % SlotSize == 0 &&
5071 "StackSize must be a multiple of SlotSize");
5072 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5073}
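// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// Standalone rendering of the formula above, assuming a power-of-two
// 16-byte stack alignment; SlotSize is 4 on 32-bit and 8 on 64-bit targets.
#include <cstdint>
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlignment) {
  // alignTo(StackSize + SlotSize, StackAlignment) - SlotSize
  uint64_t Padded = StackSize + SlotSize;
  uint64_t RoundedUp = (Padded + StackAlignment - 1) & ~(StackAlignment - 1);
  return RoundedUp - SlotSize;
}
// e.g. with SlotSize == 4: alignedArgStackSize(8, 4, 16) == 12 and
// alignedArgStackSize(20, 4, 16) == 28, i.e. always 16n + 12 as the doc
// comment above describes.
// --- end sketch ---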
5074
5075/// Return true if the given stack call argument is already available in the
5076/// same position (relatively) of the caller's incoming argument stack.
5077static
5078bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5079 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5080 const X86InstrInfo *TII, const CCValAssign &VA) {
5081 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5082
5083 for (;;) {
5084 // Look through nodes that don't alter the bits of the incoming value.
5085 unsigned Op = Arg.getOpcode();
5086 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5087 Arg = Arg.getOperand(0);
5088 continue;
5089 }
5090 if (Op == ISD::TRUNCATE) {
5091 const SDValue &TruncInput = Arg.getOperand(0);
5092 if (TruncInput.getOpcode() == ISD::AssertZext &&
5093 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5094 Arg.getValueType()) {
5095 Arg = TruncInput.getOperand(0);
5096 continue;
5097 }
5098 }
5099 break;
5100 }
5101
5102 int FI = INT_MAX;
5103 if (Arg.getOpcode() == ISD::CopyFromReg) {
5104 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5105 if (!VR.isVirtual())
5106 return false;
5107 MachineInstr *Def = MRI->getVRegDef(VR);
5108 if (!Def)
5109 return false;
5110 if (!Flags.isByVal()) {
5111 if (!TII->isLoadFromStackSlot(*Def, FI))
5112 return false;
5113 } else {
5114 unsigned Opcode = Def->getOpcode();
5115 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5116 Opcode == X86::LEA64_32r) &&
5117 Def->getOperand(1).isFI()) {
5118 FI = Def->getOperand(1).getIndex();
5119 Bytes = Flags.getByValSize();
5120 } else
5121 return false;
5122 }
5123 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5124 if (Flags.isByVal())
5125 // ByVal argument is passed in as a pointer but it's now being
5126 // dereferenced. e.g.
5127 // define @foo(%struct.X* %A) {
5128 // tail call @bar(%struct.X* byval %A)
5129 // }
5130 return false;
5131 SDValue Ptr = Ld->getBasePtr();
5132 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5133 if (!FINode)
5134 return false;
5135 FI = FINode->getIndex();
5136 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5137 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5138 FI = FINode->getIndex();
5139 Bytes = Flags.getByValSize();
5140 } else
5141 return false;
5142
5143 assert(FI != INT_MAX);
5144 if (!MFI.isFixedObjectIndex(FI))
5145 return false;
5146
5147 if (Offset != MFI.getObjectOffset(FI))
5148 return false;
5149
5150 // If this is not byval, check that the argument stack object is immutable.
5151 // inalloca and argument copy elision can create mutable argument stack
5152 // objects. Byval objects can be mutated, but a byval call intends to pass the
5153 // mutated memory.
5154 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5155 return false;
5156
5157 if (VA.getLocVT().getFixedSizeInBits() >
5158 Arg.getValueSizeInBits().getFixedValue()) {
5159 // If the argument location is wider than the argument type, check that any
5160 // extension flags match.
5161 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5162 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5163 return false;
5164 }
5165 }
5166
5167 return Bytes == MFI.getObjectSize(FI);
5168}
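// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The situation MatchingStackOffset detects, in C++ terms (assumes the SysV
// AMD64 ABI, where the 7th integer argument lives on the stack):
long sink(long, long, long, long, long, long, long g);
long forward(long a, long b, long c, long d, long e, long f, long g) {
  // 'g' already sits at the matching fixed stack slot of the caller, so a
  // sibcall can reuse it in place instead of copying it.
  return sink(a, b, c, d, e, f, g);
}
// --- end sketch ---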
5169
5170/// Check whether the call is eligible for tail call optimization. Targets
5171/// that want to do tail call optimization should implement this function.
5172bool X86TargetLowering::IsEligibleForTailCallOptimization(
5173 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5174 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5175 const SmallVectorImpl<SDValue> &OutVals,
5176 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5177 if (!mayTailCallThisCC(CalleeCC))
5178 return false;
5179
5180 // If -tailcallopt is specified, make fastcc functions tail-callable.
5181 MachineFunction &MF = DAG.getMachineFunction();
5182 const Function &CallerF = MF.getFunction();
5183
5184 // If the function return type is x86_fp80 and the callee return type is not,
5185 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5186 // perform a tailcall optimization here.
5187 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5188 return false;
5189
5190 CallingConv::ID CallerCC = CallerF.getCallingConv();
5191 bool CCMatch = CallerCC == CalleeCC;
5192 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5193 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5194 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5195 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5196
5197 // Win64 functions have extra shadow space for argument homing. Don't do the
5198 // sibcall if the caller and callee have mismatched expectations for this
5199 // space.
5200 if (IsCalleeWin64 != IsCallerWin64)
5201 return false;
5202
5203 if (IsGuaranteeTCO) {
5204 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5205 return true;
5206 return false;
5207 }
5208
5209 // Look for obvious safe cases to perform tail call optimization that do not
5210 // require ABI changes. This is what gcc calls sibcall.
5211
5212 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5213 // emit a special epilogue.
5214 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5215 if (RegInfo->hasStackRealignment(MF))
5216 return false;
5217
5218 // Also avoid sibcall optimization if we're an sret return fn and the callee
5219 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5220 // insufficient.
5221 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5222 // For a compatible tail call the callee must return our sret pointer. So it
5223 // needs to be (a) an sret function itself and (b) we pass our sret as its
5224 // sret. Condition #b is harder to determine.
5225 return false;
5226 } else if (IsCalleePopSRet)
5227 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5228 // expect that.
5229 return false;
5230
5231 // Do not sibcall optimize vararg calls unless all arguments are passed via
5232 // registers.
5233 LLVMContext &C = *DAG.getContext();
5234 if (isVarArg && !Outs.empty()) {
5235 // Optimizing for varargs on Win64 is unlikely to be safe without
5236 // additional testing.
5237 if (IsCalleeWin64 || IsCallerWin64)
5238 return false;
5239
5240 SmallVector<CCValAssign, 16> ArgLocs;
5241 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5242
5243 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5244 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5245 if (!ArgLocs[i].isRegLoc())
5246 return false;
5247 }
5248
5249 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5250 // stack. Therefore, if it's not used by the call it is not safe to optimize
5251 // this into a sibcall.
5252 bool Unused = false;
5253 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5254 if (!Ins[i].Used) {
5255 Unused = true;
5256 break;
5257 }
5258 }
5259 if (Unused) {
5260 SmallVector<CCValAssign, 16> RVLocs;
5261 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5262 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5263 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5264 CCValAssign &VA = RVLocs[i];
5265 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5266 return false;
5267 }
5268 }
5269
5270 // Check that the call results are passed in the same way.
5271 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5272 RetCC_X86, RetCC_X86))
5273 return false;
5274 // The callee has to preserve all registers the caller needs to preserve.
5275 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5276 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5277 if (!CCMatch) {
5278 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5279 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5280 return false;
5281 }
5282
5283 unsigned StackArgsSize = 0;
5284
5285 // If the callee takes no arguments then go on to check the results of the
5286 // call.
5287 if (!Outs.empty()) {
5288 // Check if stack adjustment is needed. For now, do not do this if any
5289 // argument is passed on the stack.
5290 SmallVector<CCValAssign, 16> ArgLocs;
5291 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5292
5293 // Allocate shadow area for Win64
5294 if (IsCalleeWin64)
5295 CCInfo.AllocateStack(32, Align(8));
5296
5297 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5298 StackArgsSize = CCInfo.getNextStackOffset();
5299
5300 if (CCInfo.getNextStackOffset()) {
5301 // Check if the arguments are already laid out in the right way as
5302 // the caller's fixed stack objects.
5303 MachineFrameInfo &MFI = MF.getFrameInfo();
5304 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5305 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5306 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5307 CCValAssign &VA = ArgLocs[i];
5308 SDValue Arg = OutVals[i];
5309 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5310 if (VA.getLocInfo() == CCValAssign::Indirect)
5311 return false;
5312 if (!VA.isRegLoc()) {
5313 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5314 MFI, MRI, TII, VA))
5315 return false;
5316 }
5317 }
5318 }
5319
5320 bool PositionIndependent = isPositionIndependent();
5321 // If the tailcall address may be in a register, then make sure it's
5322 // possible to register allocate for it. In 32-bit, the call address can
5323 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5324 // callee-saved registers are restored. These happen to be the same
5325 // registers used to pass 'inreg' arguments so watch out for those.
5326 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5327 !isa<ExternalSymbolSDNode>(Callee)) ||
5328 PositionIndependent)) {
5329 unsigned NumInRegs = 0;
5330 // In PIC we need an extra register to formulate the address computation
5331 // for the callee.
5332 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5333
5334 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5335 CCValAssign &VA = ArgLocs[i];
5336 if (!VA.isRegLoc())
5337 continue;
5338 Register Reg = VA.getLocReg();
5339 switch (Reg) {
5340 default: break;
5341 case X86::EAX: case X86::EDX: case X86::ECX:
5342 if (++NumInRegs == MaxInRegs)
5343 return false;
5344 break;
5345 }
5346 }
5347 }
5348
5349 const MachineRegisterInfo &MRI = MF.getRegInfo();
5350 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5351 return false;
5352 }
5353
5354 bool CalleeWillPop =
5355 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5356 MF.getTarget().Options.GuaranteedTailCallOpt);
5357
5358 if (unsigned BytesToPop =
5359 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5360 // If we have bytes to pop, the callee must pop them.
5361 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5362 if (!CalleePopMatches)
5363 return false;
5364 } else if (CalleeWillPop && StackArgsSize > 0) {
5365 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5366 return false;
5367 }
5368
5369 return true;
5370}
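// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// One of the rejections above, seen from the source level (assumes a 32-bit
// target that honours these attributes; illustrative only): the stdcall callee
// would pop 8 bytes the cdecl caller still owns, so CalleeWillPop with
// StackArgsSize > 0 and no matching BytesToPop forces a non-tail call.
int __attribute__((stdcall)) stdcall_callee(int a, int b);
int cdecl_caller(int a, int b) { return stdcall_callee(a, b); }
// --- end sketch ---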
5371
5372FastISel *
5373X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5374 const TargetLibraryInfo *libInfo) const {
5375 return X86::createFastISel(funcInfo, libInfo);
5376}
5377
5378//===----------------------------------------------------------------------===//
5379// Other Lowering Hooks
5380//===----------------------------------------------------------------------===//
5381
5382bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5383 bool AssumeSingleUse) {
5384 if (!AssumeSingleUse && !Op.hasOneUse())
5385 return false;
5386 if (!ISD::isNormalLoad(Op.getNode()))
5387 return false;
5388
5389 // If this is an unaligned vector, make sure the target supports folding it.
5390 auto *Ld = cast<LoadSDNode>(Op.getNode());
5391 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5392 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5393 return false;
5394
5395 // TODO: If this is a non-temporal load and the target has an instruction
5396 // for it, it should not be folded. See "useNonTemporalLoad()".
5397
5398 return true;
5399}
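// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The folding opportunity this hook guards, written with SSE intrinsics
// (assumes <xmmintrin.h>): a single-use, sufficiently aligned load can be
// folded into the arithmetic instruction as a memory operand, e.g.
// "addps (%rdi), %xmm0" instead of a separate movaps + addps.
#include <xmmintrin.h>
static __m128 add_from_mem(__m128 Acc, const float *P) {
  return _mm_add_ps(Acc, _mm_load_ps(P));  // the load has one use -> foldable
}
// --- end sketch ---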
5400
5401bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5402 const X86Subtarget &Subtarget,
5403 bool AssumeSingleUse) {
5404 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory")(static_cast <bool> (Subtarget.hasAVX() && "Expected AVX for broadcast from memory"
) ? void (0) : __assert_fail ("Subtarget.hasAVX() && \"Expected AVX for broadcast from memory\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5404, __extension__
__PRETTY_FUNCTION__))
;
5405 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5406 return false;
5407
5408 // We cannot replace a wide volatile load with a broadcast-from-memory,
5409 // because that would narrow the load, which isn't legal for volatiles.
5410 auto *Ld = cast<LoadSDNode>(Op.getNode());
5411 return !Ld->isVolatile() ||
5412 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5413}
5414
5415bool X86::mayFoldIntoStore(SDValue Op) {
5416 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5417}
5418
5419bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5420 if (Op.hasOneUse()) {
5421 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5422 return (ISD::ZERO_EXTEND == Opcode);
5423 }
5424 return false;
5425}
5426
5427static bool isTargetShuffle(unsigned Opcode) {
5428 switch(Opcode) {
5429 default: return false;
5430 case X86ISD::BLENDI:
5431 case X86ISD::PSHUFB:
5432 case X86ISD::PSHUFD:
5433 case X86ISD::PSHUFHW:
5434 case X86ISD::PSHUFLW:
5435 case X86ISD::SHUFP:
5436 case X86ISD::INSERTPS:
5437 case X86ISD::EXTRQI:
5438 case X86ISD::INSERTQI:
5439 case X86ISD::VALIGN:
5440 case X86ISD::PALIGNR:
5441 case X86ISD::VSHLDQ:
5442 case X86ISD::VSRLDQ:
5443 case X86ISD::MOVLHPS:
5444 case X86ISD::MOVHLPS:
5445 case X86ISD::MOVSHDUP:
5446 case X86ISD::MOVSLDUP:
5447 case X86ISD::MOVDDUP:
5448 case X86ISD::MOVSS:
5449 case X86ISD::MOVSD:
5450 case X86ISD::MOVSH:
5451 case X86ISD::UNPCKL:
5452 case X86ISD::UNPCKH:
5453 case X86ISD::VBROADCAST:
5454 case X86ISD::VPERMILPI:
5455 case X86ISD::VPERMILPV:
5456 case X86ISD::VPERM2X128:
5457 case X86ISD::SHUF128:
5458 case X86ISD::VPERMIL2:
5459 case X86ISD::VPERMI:
5460 case X86ISD::VPPERM:
5461 case X86ISD::VPERMV:
5462 case X86ISD::VPERMV3:
5463 case X86ISD::VZEXT_MOVL:
5464 return true;
5465 }
5466}
5467
5468static bool isTargetShuffleVariableMask(unsigned Opcode) {
5469 switch (Opcode) {
5470 default: return false;
5471 // Target Shuffles.
5472 case X86ISD::PSHUFB:
5473 case X86ISD::VPERMILPV:
5474 case X86ISD::VPERMIL2:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 return true;
5479 // 'Faux' Target Shuffles.
5480 case ISD::OR:
5481 case ISD::AND:
5482 case X86ISD::ANDNP:
5483 return true;
5484 }
5485}
5486
5487SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5488 MachineFunction &MF = DAG.getMachineFunction();
5489 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5490 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5491 int ReturnAddrIndex = FuncInfo->getRAIndex();
5492
5493 if (ReturnAddrIndex == 0) {
5494 // Set up a frame object for the return address.
5495 unsigned SlotSize = RegInfo->getSlotSize();
5496 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5497 -(int64_t)SlotSize,
5498 false);
5499 FuncInfo->setRAIndex(ReturnAddrIndex);
5500 }
5501
5502 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5503}
5504
5505bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5506 bool hasSymbolicDisplacement) {
5507 // Offset should fit into 32 bit immediate field.
5508 if (!isInt<32>(Offset))
5509 return false;
5510
5511 // If we don't have a symbolic displacement - we don't have any extra
5512 // restrictions.
5513 if (!hasSymbolicDisplacement)
5514 return true;
5515
5516 // FIXME: Some tweaks might be needed for medium code model.
5517 if (M != CodeModel::Small && M != CodeModel::Kernel)
5518 return false;
5519
5520 // For the small code model we assume that the last object ends 16MB before the
5521 // 31-bit boundary. We may also accept pretty large negative constants knowing
5522 // that all objects are in the positive half of the address space.
5523 if (M == CodeModel::Small && Offset < 16*1024*1024)
5524 return true;
5525
5526 // For the kernel code model we know that all objects reside in the negative
5527 // half of the 32-bit address space. We must not accept negative offsets, since
5528 // they may push the address out of range, but pretty large positive ones are fine.
5529 if (M == CodeModel::Kernel && Offset >= 0)
5530 return true;
5531
5532 return false;
5533}
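// Worked examples for the checks above (sketch; assumes a symbolic
// displacement is present):
//   Small  code model, Offset ==  8*1024*1024  -> accepted (< 16MB slack)
//   Small  code model, Offset == 64*1024*1024  -> rejected
//   Kernel code model, Offset == 4096          -> accepted (non-negative)
//   Kernel code model, Offset == -8            -> rejected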
5534
5535/// Determines whether the callee is required to pop its own arguments.
5536/// Callee pop is necessary to support tail calls.
5537bool X86::isCalleePop(CallingConv::ID CallingConv,
5538 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5539 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5540 // can guarantee TCO.
5541 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5542 return true;
5543
5544 switch (CallingConv) {
5545 default:
5546 return false;
5547 case CallingConv::X86_StdCall:
5548 case CallingConv::X86_FastCall:
5549 case CallingConv::X86_ThisCall:
5550 case CallingConv::X86_VectorCall:
5551 return !is64Bit;
5552 }
5553}
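// Worked example (sketch): X86::isCalleePop(CallingConv::X86_StdCall,
// /*is64Bit=*/false, /*IsVarArg=*/false, /*GuaranteeTCO=*/false) is true,
// while the same query with is64Bit == true is false -- the callee-pop
// conventions above only apply to 32-bit targets (absent guaranteed TCO).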
5554
5555 /// Return true if the condition is a signed comparison operation.
5556static bool isX86CCSigned(unsigned X86CC) {
5557 switch (X86CC) {
5558 default:
5559 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5559)
;
5560 case X86::COND_E:
5561 case X86::COND_NE:
5562 case X86::COND_B:
5563 case X86::COND_A:
5564 case X86::COND_BE:
5565 case X86::COND_AE:
5566 return false;
5567 case X86::COND_G:
5568 case X86::COND_GE:
5569 case X86::COND_L:
5570 case X86::COND_LE:
5571 return true;
5572 }
5573}
5574
5575static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5576 switch (SetCCOpcode) {
5577 default: llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5577)
;
5578 case ISD::SETEQ: return X86::COND_E;
5579 case ISD::SETGT: return X86::COND_G;
5580 case ISD::SETGE: return X86::COND_GE;
5581 case ISD::SETLT: return X86::COND_L;
5582 case ISD::SETLE: return X86::COND_LE;
5583 case ISD::SETNE: return X86::COND_NE;
5584 case ISD::SETULT: return X86::COND_B;
5585 case ISD::SETUGT: return X86::COND_A;
5586 case ISD::SETULE: return X86::COND_BE;
5587 case ISD::SETUGE: return X86::COND_AE;
5588 }
5589}
5590
5591/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5592/// condition code, returning the condition code and the LHS/RHS of the
5593/// comparison to make.
5594static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5595 bool isFP, SDValue &LHS, SDValue &RHS,
5596 SelectionDAG &DAG) {
5597 if (!isFP) {
5598 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5599 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5600 // X > -1 -> X == 0, jump !sign.
5601 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5602 return X86::COND_NS;
5603 }
5604 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5605 // X < 0 -> X == 0, jump on sign.
5606 return X86::COND_S;
5607 }
5608 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5609 // X >= 0 -> X == 0, jump on !sign.
5610 return X86::COND_NS;
5611 }
5612 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5613 // X < 1 -> X <= 0
5614 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5615 return X86::COND_LE;
5616 }
5617 }
5618
5619 return TranslateIntegerX86CC(SetCCOpcode);
5620 }
5621
5622 // First determine if it is required or is profitable to flip the operands.
5623
5624 // If LHS is a foldable load, but RHS is not, flip the condition.
5625 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5626 !ISD::isNON_EXTLoad(RHS.getNode())) {
5627 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5628 std::swap(LHS, RHS);
5629 }
5630
5631 switch (SetCCOpcode) {
5632 default: break;
5633 case ISD::SETOLT:
5634 case ISD::SETOLE:
5635 case ISD::SETUGT:
5636 case ISD::SETUGE:
5637 std::swap(LHS, RHS);
5638 break;
5639 }
5640
5641 // On a floating point condition, the flags are set as follows:
5642 // ZF PF CF op
5643 // 0 | 0 | 0 | X > Y
5644 // 0 | 0 | 1 | X < Y
5645 // 1 | 0 | 0 | X == Y
5646 // 1 | 1 | 1 | unordered
5647 switch (SetCCOpcode) {
5648 default: llvm_unreachable("Condcode should be pre-legalized away")::llvm::llvm_unreachable_internal("Condcode should be pre-legalized away"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5648)
;
5649 case ISD::SETUEQ:
5650 case ISD::SETEQ: return X86::COND_E;
5651 case ISD::SETOLT: // flipped
5652 case ISD::SETOGT:
5653 case ISD::SETGT: return X86::COND_A;
5654 case ISD::SETOLE: // flipped
5655 case ISD::SETOGE:
5656 case ISD::SETGE: return X86::COND_AE;
5657 case ISD::SETUGT: // flipped
5658 case ISD::SETULT:
5659 case ISD::SETLT: return X86::COND_B;
5660 case ISD::SETUGE: // flipped
5661 case ISD::SETULE:
5662 case ISD::SETLE: return X86::COND_BE;
5663 case ISD::SETONE:
5664 case ISD::SETNE: return X86::COND_NE;
5665 case ISD::SETUO: return X86::COND_P;
5666 case ISD::SETO: return X86::COND_NP;
5667 case ISD::SETOEQ:
5668 case ISD::SETUNE: return X86::COND_INVALID;
5669 }
5670}
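// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// A standalone emulation of the ZF/PF/CF table above for a ucomisd-style
// compare (assumes IEEE doubles; this is not the DAG lowering itself). It
// shows why SETOLT/SETOLE swap their operands and then reuse COND_A/COND_AE:
// "above" (CF==0 && ZF==0) on the swapped operands is exactly "ordered less
// than" on the originals.
#include <cmath>
struct EFlags { bool ZF, PF, CF; };
static EFlags ucomi(double X, double Y) {
  if (std::isnan(X) || std::isnan(Y)) return {true,  true,  true};  // unordered
  if (X == Y)                         return {true,  false, false}; // equal
  if (X <  Y)                         return {false, false, true};  // below
  return {false, false, false};                                     // above
}
static bool cond_a(EFlags F) { return !F.CF && !F.ZF; }              // COND_A
static bool olt(double X, double Y) { return cond_a(ucomi(Y, X)); }  // swapped operands
// --- end sketch ---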
5671
5672/// Is there a floating point cmov for the specific X86 condition code?
5673/// Current x86 isa includes the following FP cmov instructions:
5674/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5675static bool hasFPCMov(unsigned X86CC) {
5676 switch (X86CC) {
5677 default:
5678 return false;
5679 case X86::COND_B:
5680 case X86::COND_BE:
5681 case X86::COND_E:
5682 case X86::COND_P:
5683 case X86::COND_A:
5684 case X86::COND_AE:
5685 case X86::COND_NE:
5686 case X86::COND_NP:
5687 return true;
5688 }
5689}
5690
5691static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5692 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5693 VT.is512BitVector();
5694}
5695
5696bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5697 const CallInst &I,
5698 MachineFunction &MF,
5699 unsigned Intrinsic) const {
5700 Info.flags = MachineMemOperand::MONone;
5701 Info.offset = 0;
5702
5703 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5704 if (!IntrData) {
5705 switch (Intrinsic) {
5706 case Intrinsic::x86_aesenc128kl:
5707 case Intrinsic::x86_aesdec128kl:
5708 Info.opc = ISD::INTRINSIC_W_CHAIN;
5709 Info.ptrVal = I.getArgOperand(1);
5710 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5711 Info.align = Align(1);
5712 Info.flags |= MachineMemOperand::MOLoad;
5713 return true;
5714 case Intrinsic::x86_aesenc256kl:
5715 case Intrinsic::x86_aesdec256kl:
5716 Info.opc = ISD::INTRINSIC_W_CHAIN;
5717 Info.ptrVal = I.getArgOperand(1);
5718 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5719 Info.align = Align(1);
5720 Info.flags |= MachineMemOperand::MOLoad;
5721 return true;
5722 case Intrinsic::x86_aesencwide128kl:
5723 case Intrinsic::x86_aesdecwide128kl:
5724 Info.opc = ISD::INTRINSIC_W_CHAIN;
5725 Info.ptrVal = I.getArgOperand(0);
5726 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5727 Info.align = Align(1);
5728 Info.flags |= MachineMemOperand::MOLoad;
5729 return true;
5730 case Intrinsic::x86_aesencwide256kl:
5731 case Intrinsic::x86_aesdecwide256kl:
5732 Info.opc = ISD::INTRINSIC_W_CHAIN;
5733 Info.ptrVal = I.getArgOperand(0);
5734 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5735 Info.align = Align(1);
5736 Info.flags |= MachineMemOperand::MOLoad;
5737 return true;
5738 case Intrinsic::x86_cmpccxadd32:
5739 case Intrinsic::x86_cmpccxadd64:
5740 case Intrinsic::x86_atomic_bts:
5741 case Intrinsic::x86_atomic_btc:
5742 case Intrinsic::x86_atomic_btr: {
5743 Info.opc = ISD::INTRINSIC_W_CHAIN;
5744 Info.ptrVal = I.getArgOperand(0);
5745 unsigned Size = I.getType()->getScalarSizeInBits();
5746 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5747 Info.align = Align(Size);
5748 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5749 MachineMemOperand::MOVolatile;
5750 return true;
5751 }
5752 case Intrinsic::x86_atomic_bts_rm:
5753 case Intrinsic::x86_atomic_btc_rm:
5754 case Intrinsic::x86_atomic_btr_rm: {
5755 Info.opc = ISD::INTRINSIC_W_CHAIN;
5756 Info.ptrVal = I.getArgOperand(0);
5757 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5758 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5759 Info.align = Align(Size);
5760 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5761 MachineMemOperand::MOVolatile;
5762 return true;
5763 }
5764 case Intrinsic::x86_aadd32:
5765 case Intrinsic::x86_aadd64:
5766 case Intrinsic::x86_aand32:
5767 case Intrinsic::x86_aand64:
5768 case Intrinsic::x86_aor32:
5769 case Intrinsic::x86_aor64:
5770 case Intrinsic::x86_axor32:
5771 case Intrinsic::x86_axor64:
5772 case Intrinsic::x86_atomic_add_cc:
5773 case Intrinsic::x86_atomic_sub_cc:
5774 case Intrinsic::x86_atomic_or_cc:
5775 case Intrinsic::x86_atomic_and_cc:
5776 case Intrinsic::x86_atomic_xor_cc: {
5777 Info.opc = ISD::INTRINSIC_W_CHAIN;
5778 Info.ptrVal = I.getArgOperand(0);
5779 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5780 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5781 Info.align = Align(Size);
5782 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5783 MachineMemOperand::MOVolatile;
5784 return true;
5785 }
5786 }
5787 return false;
5788 }
5789
5790 switch (IntrData->Type) {
5791 case TRUNCATE_TO_MEM_VI8:
5792 case TRUNCATE_TO_MEM_VI16:
5793 case TRUNCATE_TO_MEM_VI32: {
5794 Info.opc = ISD::INTRINSIC_VOID;
5795 Info.ptrVal = I.getArgOperand(0);
5796 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5797 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5798 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5799 ScalarVT = MVT::i8;
5800 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5801 ScalarVT = MVT::i16;
5802 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5803 ScalarVT = MVT::i32;
5804
5805 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5806 Info.align = Align(1);
5807 Info.flags |= MachineMemOperand::MOStore;
5808 break;
5809 }
5810 case GATHER:
5811 case GATHER_AVX2: {
5812 Info.opc = ISD::INTRINSIC_W_CHAIN;
5813 Info.ptrVal = nullptr;
5814 MVT DataVT = MVT::getVT(I.getType());
5815 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5816 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5817 IndexVT.getVectorNumElements());
5818 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5819 Info.align = Align(1);
5820 Info.flags |= MachineMemOperand::MOLoad;
5821 break;
5822 }
5823 case SCATTER: {
5824 Info.opc = ISD::INTRINSIC_VOID;
5825 Info.ptrVal = nullptr;
5826 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5827 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5828 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5829 IndexVT.getVectorNumElements());
5830 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5831 Info.align = Align(1);
5832 Info.flags |= MachineMemOperand::MOStore;
5833 break;
5834 }
5835 default:
5836 return false;
5837 }
5838
5839 return true;
5840}
5841
5842/// Returns true if the target can instruction select the
5843/// specified FP immediate natively. If false, the legalizer will
5844/// materialize the FP immediate as a load from a constant pool.
5845bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5846 bool ForCodeSize) const {
5847 for (const APFloat &FPImm : LegalFPImmediates)
5848 if (Imm.bitwiseIsEqual(FPImm))
5849 return true;
5850 return false;
5851}
5852
5853bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5854 ISD::LoadExtType ExtTy,
5855 EVT NewVT) const {
5856 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5857
5858 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5859 // relocations target a movq or addq instruction: don't let the load shrink.
5860 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5861 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5862 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5863 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5864
5865 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5866 // those uses are extracted directly into a store, then the extract + store
5867 // can be store-folded. Therefore, it's probably not worth splitting the load.
5868 EVT VT = Load->getValueType(0);
5869 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5870 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5871 // Skip uses of the chain value. Result 0 of the node is the load value.
5872 if (UI.getUse().getResNo() != 0)
5873 continue;
5874
5875 // If this use is not an extract + store, it's probably worth splitting.
5876 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5877 UI->use_begin()->getOpcode() != ISD::STORE)
5878 return true;
5879 }
5880 // All non-chain uses are extract + store.
5881 return false;
5882 }
5883
5884 return true;
5885}
5886
5887/// Returns true if it is beneficial to convert a load of a constant
5888/// to just the constant itself.
5889bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5890 Type *Ty) const {
5891 assert(Ty->isIntegerTy());
5892
5893 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5894 if (BitSize == 0 || BitSize > 64)
5895 return false;
5896 return true;
5897}
5898
5899bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5900 // If we are using XMM registers in the ABI and the condition of the select is
5901 // a floating-point compare and we have blendv or conditional move, then it is
5902 // cheaper to select instead of doing a cross-register move and creating a
5903 // load that depends on the compare result.
5904 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5905 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5906}
5907
5908bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5909 // TODO: It might be a win to ease or lift this restriction, but the generic
5910 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5911 if (VT.isVector() && Subtarget.hasAVX512())
5912 return false;
5913
5914 return true;
5915}
5916
5917bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5918 SDValue C) const {
5919 // TODO: We handle scalars using custom code, but generic combining could make
5920 // that unnecessary.
5921 APInt MulC;
5922 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5923 return false;
5924
5925 // Find the type this will be legalized to. Otherwise we might prematurely
5926 // convert this to shl+add/sub and then still have to type legalize those ops.
5927 // Another choice would be to defer the decision for illegal types until
5928 // after type legalization. But constant splat vectors of i64 can't make it
5929 // through type legalization on 32-bit targets so we would need to special
5930 // case vXi64.
5931 while (getTypeAction(Context, VT) != TypeLegal)
5932 VT = getTypeToTransformTo(Context, VT);
5933
5934 // If vector multiply is legal, assume that's faster than shl + add/sub.
5935 // Multiply is a complex op with higher latency and lower throughput in
5936 // most implementations; sub-vXi32 vector multiplies are always fast,
5937 // vXi32 must not have a SlowMULLD implementation, and anything larger (vXi64)
5938 // is always going to be slow.
5939 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5940 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5941 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5942 return false;
5943
5944 // shl+add, shl+sub, shl+add+neg
5945 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5946 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5947}
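// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The decompositions the power-of-two checks above enable (scalar stand-ins,
// not the vector lowering itself):
#include <cstdint>
static int64_t shl(int64_t X, unsigned N) {
  return static_cast<int64_t>(static_cast<uint64_t>(X) << N);  // wraparound shift
}
static int64_t mul9(int64_t X)  { return shl(X, 3) + X; }    // MulC - 1 is a power of 2 -> shl+add
static int64_t mul7(int64_t X)  { return shl(X, 3) - X; }    // MulC + 1 is a power of 2 -> shl+sub
static int64_t mulm7(int64_t X) { return X - shl(X, 3); }    // 1 - MulC is a power of 2 -> shl+sub
static int64_t mulm9(int64_t X) { return -(shl(X, 3) + X); } // -(MulC + 1) is a power of 2 -> shl+add+neg
// --- end sketch ---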
5948
5949bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5950 unsigned Index) const {
5951 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5952 return false;
5953
5954 // Mask vectors support all subregister combinations and operations that
5955 // extract half of vector.
5956 if (ResVT.getVectorElementType() == MVT::i1)
5957 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5958 (Index == ResVT.getVectorNumElements()));
5959
5960 return (Index % ResVT.getVectorNumElements()) == 0;
5961}
5962
5963bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5964 unsigned Opc = VecOp.getOpcode();
5965
5966 // Assume target opcodes can't be scalarized.
5967 // TODO - do we have any exceptions?
5968 if (Opc >= ISD::BUILTIN_OP_END)
5969 return false;
5970
5971 // If the vector op is not supported, try to convert to scalar.
5972 EVT VecVT = VecOp.getValueType();
5973 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5974 return true;
5975
5976 // If the vector op is supported, but the scalar op is not, the transform may
5977 // not be worthwhile.
5978 EVT ScalarVT = VecVT.getScalarType();
5979 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5980}
5981
5982bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5983 bool) const {
5984 // TODO: Allow vectors?
5985 if (VT.isVector())
5986 return false;
5987 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5988}
5989
5990bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5991 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5992 return Subtarget.hasBMI() ||
5993 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5994}
5995
5996bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
5997 // Speculate ctlz only if we can directly use LZCNT.
5998 return Subtarget.hasLZCNT();
5999}
6000
6001bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6002 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6003 // expensive than a straight movsd. On the other hand, it's important to
6004 // shrink long double fp constant since fldt is very slow.
6005 return !Subtarget.hasSSE2() || VT == MVT::f80;
6006}
6007
6008bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6009 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6010 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6011}
6012
6013bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6014 const SelectionDAG &DAG,
6015 const MachineMemOperand &MMO) const {
6016 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6017 BitcastVT.getVectorElementType() == MVT::i1)
6018 return false;
6019
6020 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6021 return false;
6022
6023 // If both types are legal vectors, it's always ok to convert them.
6024 if (LoadVT.isVector() && BitcastVT.isVector() &&
6025 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6026 return true;
6027
6028 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6029}
6030
6031bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6032 const MachineFunction &MF) const {
6033 // Do not merge to float value size (128 bits) if no implicit
6034 // float attribute is set.
6035 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6036
6037 if (NoFloat) {
6038 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6039 return (MemVT.getSizeInBits() <= MaxIntSize);
6040 }
6041 // Make sure we don't merge greater than our preferred vector
6042 // width.
6043 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6044 return false;
6045
6046 return true;
6047}
6048
6049bool X86TargetLowering::isCtlzFast() const {
6050 return Subtarget.hasFastLZCNT();
6051}
6052
6053bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6054 const Instruction &AndI) const {
6055 return true;
6056}
6057
6058bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6059 EVT VT = Y.getValueType();
6060
6061 if (VT.isVector())
6062 return false;
6063
6064 if (!Subtarget.hasBMI())
6065 return false;
6066
6067 // There are only 32-bit and 64-bit forms for 'andn'.
6068 if (VT != MVT::i32 && VT != MVT::i64)
6069 return false;
6070
6071 return !isa<ConstantSDNode>(Y);
6072}
6073
6074bool X86TargetLowering::hasAndNot(SDValue Y) const {
6075 EVT VT = Y.getValueType();
6076
6077 if (!VT.isVector())
6078 return hasAndNotCompare(Y);
6079
6080 // Vector.
6081
6082 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6083 return false;
6084
6085 if (VT == MVT::v4i32)
6086 return true;
6087
6088 return Subtarget.hasSSE2();
6089}
6090
6091bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6092 return X.getValueType().isScalarInteger(); // 'bt'
6093}
6094
6095bool X86TargetLowering::
6096 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6097 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6098 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6099 SelectionDAG &DAG) const {
6100 // Does baseline recommend not to perform the fold by default?
6101 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6102 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6103 return false;
6104 // For scalars this transform is always beneficial.
6105 if (X.getValueType().isScalarInteger())
6106 return true;
6107 // If all the shift amounts are identical, then transform is beneficial even
6108 // with rudimentary SSE2 shifts.
6109 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6110 return true;
6112 // If we have AVX2 with its powerful shift operations, then it's also good.
6112 if (Subtarget.hasAVX2())
6113 return true;
6114 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6115 return NewShiftOpcode == ISD::SHL;
6116}
6117
6118bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6119 return N->getOpcode() != ISD::FP_EXTEND;
6120}
6121
6122bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6123 const SDNode *N, CombineLevel Level) const {
6124 assert(((N->getOpcode() == ISD::SHL &&
6125 N->getOperand(0).getOpcode() == ISD::SRL) ||
6126 (N->getOpcode() == ISD::SRL &&
6127 N->getOperand(0).getOpcode() == ISD::SHL)) &&
6128 "Expected shift-shift mask");
6129 // TODO: Should we always create i64 masks? Or only folded immediates?
6130 EVT VT = N->getValueType(0);
6131 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6132 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6133 // Only fold if the shift values are equal - so it folds to AND.
6134 // TODO - we should fold if either is a non-uniform vector but we don't do
6135 // the fold for non-splats yet.
6136 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6137 }
6138 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6139}
6140
6141bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6142 EVT VT = Y.getValueType();
6143
6144 // For vectors, we don't have a preference, but we probably want a mask.
6145 if (VT.isVector())
6146 return false;
6147
6148 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6149 if (VT == MVT::i64 && !Subtarget.is64Bit())
6150 return false;
6151
6152 return true;
6153}
6154
6155TargetLowering::ShiftLegalizationStrategy
6156X86TargetLowering::preferredShiftLegalizationStrategy(
6157 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6158 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6159 !Subtarget.isOSWindows())
6160 return ShiftLegalizationStrategy::LowerToLibcall;
6161 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6162 ExpansionFactor);
6163}
6164
6165bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6166 // Any legal vector type can be splatted more efficiently than
6167 // loading/spilling from memory.
6168 return isTypeLegal(VT);
6169}
6170
6171MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6172 MVT VT = MVT::getIntegerVT(NumBits);
6173 if (isTypeLegal(VT))
6174 return VT;
6175
6176 // PMOVMSKB can handle this.
6177 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6178 return MVT::v16i8;
6179
6180 // VPMOVMSKB can handle this.
6181 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6182 return MVT::v32i8;
6183
6184 // TODO: Allow 64-bit type for 32-bit target.
6185 // TODO: 512-bit types should be allowed, but make sure that those
6186 // cases are handled in combineVectorSizedSetCCEquality().
6187
6188 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6189}
6190
6191/// Val is the undef sentinel value or equal to the specified value.
6192static bool isUndefOrEqual(int Val, int CmpVal) {
6193 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6194}
6195
6196/// Return true if every element in Mask is the undef sentinel value or equal to
6197/// the specified value.
6198static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6199 return llvm::all_of(Mask, [CmpVal](int M) {
6200 return (M == SM_SentinelUndef) || (M == CmpVal);
6201 });
6202}
6203
6204/// Val is either the undef or zero sentinel value.
6205static bool isUndefOrZero(int Val) {
6206 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6207}
6208
6209/// Return true if every element in Mask, beginning from position Pos and ending
6210/// in Pos+Size is the undef sentinel value.
6211static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6212 return llvm::all_of(Mask.slice(Pos, Size),
6213 [](int M) { return M == SM_SentinelUndef; });
6214}
6215
6216/// Return true if the mask creates a vector whose lower half is undefined.
6217static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6218 unsigned NumElts = Mask.size();
6219 return isUndefInRange(Mask, 0, NumElts / 2);
6220}
6221
6222/// Return true if the mask creates a vector whose upper half is undefined.
6223static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6224 unsigned NumElts = Mask.size();
6225 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6226}
6227
6228/// Return true if Val falls within the specified half-open range [Low, Hi).
6229static bool isInRange(int Val, int Low, int Hi) {
6230 return (Val >= Low && Val < Hi);
6231}
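
The check above is a half-open interval test, and the range helpers that follow all route through it. A minimal standalone sketch (an editorial illustration with hypothetical names, not part of the analyzed file) makes the boundary behaviour explicit:

#include <cassert>

// Half-open range test equivalent to isInRange: Low is included, Hi is not.
static bool inRangeSketch(int Val, int Low, int Hi) {
  return Val >= Low && Val < Hi;
}

int main() {
  assert(inRangeSketch(0, 0, 4));   // lower bound is included
  assert(!inRangeSketch(4, 0, 4));  // upper bound is excluded
  assert(!inRangeSketch(-1, 0, 4)); // negative sentinels fall outside
  return 0;
}
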
6232
6233/// Return true if the value of any element in Mask falls within the specified
6234/// half-open range [Low, Hi).
6235static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6236 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6237}
6238
6239/// Return true if the value of any element in Mask is the zero sentinel value.
6240static bool isAnyZero(ArrayRef<int> Mask) {
6241 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6242}
6243
6244/// Return true if the value of any element in Mask is the zero or undef
6245/// sentinel values.
6246static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6247 return llvm::any_of(Mask, [](int M) {
6248 return M == SM_SentinelZero || M == SM_SentinelUndef;
6249 });
6250}
6251
6252/// Return true if Val is undef or if its value falls within the
6253/// specified half-open range [Low, Hi).
6254static bool isUndefOrInRange(int Val, int Low, int Hi) {
6255 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6256}
6257
6258/// Return true if every element in Mask is undef or if its value
6259/// falls within the specified half-open range [Low, Hi).
6260static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6261 return llvm::all_of(
6262 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6263}
6264
6265/// Return true if Val is undef, zero or if its value falls within the
6266/// specified half-open range [Low, Hi).
6267static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6268 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6269}
6270
6271/// Return true if every element in Mask is undef, zero or if its value
6272/// falls within the specified half-open range [Low, Hi).
6273static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6274 return llvm::all_of(
6275 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6276}
6277
6278/// Return true if every element in Mask, beginning
6279/// from position Pos and ending in Pos + Size, falls within the specified
6280/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6281static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6282 unsigned Size, int Low, int Step = 1) {
6283 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6284 if (!isUndefOrEqual(Mask[i], Low))
6285 return false;
6286 return true;
6287}
6288
6289/// Return true if every element in Mask, beginning
6290/// from position Pos and ending in Pos+Size, falls within the specified
6291/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is zero.
6292static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6293 unsigned Size, int Low,
6294 int Step = 1) {
6295 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6296 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6297 return false;
6298 return true;
6299}
6300
6301/// Return true if every element in Mask, beginning
6302/// from position Pos and ending in Pos+Size is undef or is zero.
6303static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6304 unsigned Size) {
6305 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6306}
6307
6308/// Helper function to test whether a shuffle mask could be
6309/// simplified by widening the elements being shuffled.
6310///
6311/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6312/// leaves it in an unspecified state.
6313///
6314/// NOTE: This must handle normal vector shuffle masks and *target* vector
6315/// shuffle masks. The latter have the special property of a '-2' representing
6316/// a zeroed lane of a vector.
6317static bool canWidenShuffleElements(ArrayRef<int> Mask,
6318 SmallVectorImpl<int> &WidenedMask) {
6319 WidenedMask.assign(Mask.size() / 2, 0);
6320 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6321 int M0 = Mask[i];
6322 int M1 = Mask[i + 1];
6323
6324 // If both elements are undef, it's trivial.
6325 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6326 WidenedMask[i / 2] = SM_SentinelUndef;
6327 continue;
6328 }
6329
6330 // Check for an undef mask and a mask value properly aligned to fit with
6331 // a pair of values. If we find such a case, use the non-undef mask's value.
6332 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6333 WidenedMask[i / 2] = M1 / 2;
6334 continue;
6335 }
6336 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6337 WidenedMask[i / 2] = M0 / 2;
6338 continue;
6339 }
6340
6341 // When zeroing, we need to spread the zeroing across both lanes to widen.
6342 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6343 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6344 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6345 WidenedMask[i / 2] = SM_SentinelZero;
6346 continue;
6347 }
6348 return false;
6349 }
6350
6351 // Finally check if the two mask values are adjacent and aligned with
6352 // a pair.
6353 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6354 WidenedMask[i / 2] = M0 / 2;
6355 continue;
6356 }
6357
6358 // Otherwise we can't safely widen the elements used in this shuffle.
6359 return false;
6360 }
6361 assert(WidenedMask.size() == Mask.size() / 2 &&
6362 "Incorrect size of mask after widening the elements!");
6363
6364 return true;
6365}
6366
6367static bool canWidenShuffleElements(ArrayRef<int> Mask,
6368 const APInt &Zeroable,
6369 bool V2IsZero,
6370 SmallVectorImpl<int> &WidenedMask) {
6371 // Create an alternative mask with info about zeroable elements.
6372 // Here we do not set undef elements as zeroable.
6373 SmallVector<int, 64> ZeroableMask(Mask);
6374 if (V2IsZero) {
6375 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6376 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6377 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6378 ZeroableMask[i] = SM_SentinelZero;
6379 }
6380 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6381}
6382
6383static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6384 SmallVector<int, 32> WidenedMask;
6385 return canWidenShuffleElements(Mask, WidenedMask);
6386}
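
The widening helpers above merge adjacent mask elements into wider lanes. The following standalone sketch (an editorial illustration, not LLVM code; the names and the reduced rule set are assumptions) shows the core pair-merging idea on plain integer masks, using -1 for the undef sentinel and omitting the zeroable cases:

#include <cassert>
#include <vector>

// Simplified widening rule: two adjacent, even-aligned mask elements (or two
// undefs, written as -1) collapse into one wide element. The zero-sentinel
// and mixed undef cases handled by canWidenShuffleElements are omitted here.
static bool widenSketch(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == -1 && M1 == -1) {
      Out[i / 2] = -1;                              // both undef
      continue;
    }
    if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1) { // aligned adjacent pair
      Out[i / 2] = M0 / 2;
      continue;
    }
    return false;                                   // cannot widen safely
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  assert(widenSketch({0, 1, 6, 7}, Wide) && Wide[0] == 0 && Wide[1] == 3);
  assert(!widenSketch({1, 2, 4, 5}, Wide)); // misaligned pair cannot widen
  return 0;
}
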
6387
6388// Attempt to narrow/widen shuffle mask until it matches the target number of
6389// elements.
6390static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6391 SmallVectorImpl<int> &ScaledMask) {
6392 unsigned NumSrcElts = Mask.size();
6393 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6394 "Illegal shuffle scale factor");
6395
6396 // Narrowing is guaranteed to work.
6397 if (NumDstElts >= NumSrcElts) {
6398 int Scale = NumDstElts / NumSrcElts;
6399 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6400 return true;
6401 }
6402
6403 // We have to repeat the widening until we reach the target size, but we can
6404 // split out the first widening as it sets up ScaledMask for us.
6405 if (canWidenShuffleElements(Mask, ScaledMask)) {
6406 while (ScaledMask.size() > NumDstElts) {
6407 SmallVector<int, 16> WidenedMask;
6408 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6409 return false;
6410 ScaledMask = std::move(WidenedMask);
6411 }
6412 return true;
6413 }
6414
6415 return false;
6416}
6417
6418/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6419bool X86::isZeroNode(SDValue Elt) {
6420 return isNullConstant(Elt) || isNullFPConstant(Elt);
6421}
6422
6423// Build a vector of constants.
6424// Use an UNDEF node if MaskElt == -1.
6425// Split 64-bit constants when in 32-bit mode.
6426static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6427 const SDLoc &dl, bool IsMask = false) {
6428
6429 SmallVector<SDValue, 32> Ops;
6430 bool Split = false;
6431
6432 MVT ConstVecVT = VT;
6433 unsigned NumElts = VT.getVectorNumElements();
6434 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6435 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6436 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6437 Split = true;
6438 }
6439
6440 MVT EltVT = ConstVecVT.getVectorElementType();
6441 for (unsigned i = 0; i < NumElts; ++i) {
6442 bool IsUndef = Values[i] < 0 && IsMask;
6443 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6444 DAG.getConstant(Values[i], dl, EltVT);
6445 Ops.push_back(OpNode);
6446 if (Split)
6447 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6448 DAG.getConstant(0, dl, EltVT));
6449 }
6450 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6451 if (Split)
6452 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6453 return ConstsNode;
6454}
6455
6456static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6457 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6458 assert(Bits.size() == Undefs.getBitWidth() &&
6459 "Unequal constant and undef arrays");
6460 SmallVector<SDValue, 32> Ops;
6461 bool Split = false;
6462
6463 MVT ConstVecVT = VT;
6464 unsigned NumElts = VT.getVectorNumElements();
6465 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6466 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6467 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6468 Split = true;
6469 }
6470
6471 MVT EltVT = ConstVecVT.getVectorElementType();
6472 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6473 if (Undefs[i]) {
6474 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6475 continue;
6476 }
6477 const APInt &V = Bits[i];
6478 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6479 if (Split) {
6480 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6481 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6482 } else if (EltVT == MVT::f32) {
6483 APFloat FV(APFloat::IEEEsingle(), V);
6484 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6485 } else if (EltVT == MVT::f64) {
6486 APFloat FV(APFloat::IEEEdouble(), V);
6487 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6488 } else {
6489 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6490 }
6491 }
6492
6493 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6494 return DAG.getBitcast(VT, ConstsNode);
6495}
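
Both getConstVector builders above split each 64-bit constant into low and high 32-bit elements when i64 is not legal. A standalone sketch of that split (editorial illustration; the names are hypothetical), mirroring V.trunc(32) and V.lshr(32).trunc(32):

#include <cassert>
#include <cstdint>
#include <utility>

// Split a 64-bit constant into (low, high) 32-bit halves, as done when an
// i64 element type is not legal on the target.
static std::pair<uint32_t, uint32_t> splitConst64(uint64_t V) {
  uint32_t Lo = static_cast<uint32_t>(V);       // V.trunc(32)
  uint32_t Hi = static_cast<uint32_t>(V >> 32); // V.lshr(32).trunc(32)
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = splitConst64(0x0123456789ABCDEFULL);
  assert(Lo == 0x89ABCDEFu && Hi == 0x01234567u);
  return 0;
}
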
6496
6497/// Returns a vector of specified type with all zero elements.
6498static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6499 SelectionDAG &DAG, const SDLoc &dl) {
6500 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6501 VT.getVectorElementType() == MVT::i1) &&
6502 "Unexpected vector type");
6503
6504 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6505 // type. This ensures they get CSE'd. But if the integer type is not
6506 // available, use a floating-point +0.0 instead.
6507 SDValue Vec;
6508 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6509 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6510 } else if (VT.isFloatingPoint()) {
6511 Vec = DAG.getConstantFP(+0.0, dl, VT);
6512 } else if (VT.getVectorElementType() == MVT::i1) {
6513 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6514 "Unexpected vector type");
6515 Vec = DAG.getConstant(0, dl, VT);
6516 } else {
6517 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6518 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6519 }
6520 return DAG.getBitcast(VT, Vec);
6521}
6522
6523// Helper to determine if the ops are both subvectors extracted from the same
6524// single source. If commuting is allowed they don't have to be in order (Lo/Hi).
6525static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6526 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6527 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6528 LHS.getValueType() != RHS.getValueType() ||
6529 LHS.getOperand(0) != RHS.getOperand(0))
6530 return SDValue();
6531
6532 SDValue Src = LHS.getOperand(0);
6533 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6534 return SDValue();
6535
6536 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6537 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6538 RHS.getConstantOperandAPInt(1) == NumElts) ||
6539 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6540 LHS.getConstantOperandAPInt(1) == NumElts))
6541 return Src;
6542
6543 return SDValue();
6544}
6545
6546static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6547 const SDLoc &dl, unsigned vectorWidth) {
6548 EVT VT = Vec.getValueType();
6549 EVT ElVT = VT.getVectorElementType();
6550 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6551 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6552 VT.getVectorNumElements() / Factor);
6553
6554 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6555 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6556 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6557
6558 // This is the index of the first element of the vectorWidth-bit chunk
6559 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6560 IdxVal &= ~(ElemsPerChunk - 1);
6561
6562 // If the input is a buildvector just emit a smaller one.
6563 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6564 return DAG.getBuildVector(ResultVT, dl,
6565 Vec->ops().slice(IdxVal, ElemsPerChunk));
6566
6567 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6568 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6569}
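
extractSubVector (and insertSubVector below) round the element index down to a chunk boundary with a bitmask, which is valid because ElemsPerChunk is asserted to be a power of two. A standalone sketch of that rounding (editorial illustration; the names are hypothetical):

#include <cassert>

// Round an element index down to the start of its chunk. Valid only when
// ElemsPerChunk is a power of two, matching the assertion in the code above.
static unsigned alignToChunk(unsigned IdxVal, unsigned ElemsPerChunk) {
  return IdxVal & ~(ElemsPerChunk - 1);
}

int main() {
  assert(alignToChunk(5, 4) == 4); // element 5 lives in the chunk starting at 4
  assert(alignToChunk(8, 4) == 8); // already chunk-aligned
  assert(alignToChunk(3, 8) == 0); // falls into the first 8-element chunk
  return 0;
}
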
6570
6571/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6572/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6573/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6574/// instructions or a simple subregister reference. Idx is an index in the
6575/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6576/// lowering EXTRACT_VECTOR_ELT operations easier.
6577static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6578 SelectionDAG &DAG, const SDLoc &dl) {
6579 assert((Vec.getValueType().is256BitVector() ||
6580 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6581 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6582}
6583
6584/// Generate a DAG to grab 256-bits from a 512-bit vector.
6585static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6586 SelectionDAG &DAG, const SDLoc &dl) {
6587 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6588 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6589}
6590
6591static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6592 SelectionDAG &DAG, const SDLoc &dl,
6593 unsigned vectorWidth) {
6594 assert((vectorWidth == 128 || vectorWidth == 256) &&
6595 "Unsupported vector width");
6596 // Inserting an UNDEF subvector just returns Result.
6597 if (Vec.isUndef())
6598 return Result;
6599 EVT VT = Vec.getValueType();
6600 EVT ElVT = VT.getVectorElementType();
6601 EVT ResultVT = Result.getValueType();
6602
6603 // Insert the relevant vectorWidth bits.
6604 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6605 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6606
6607 // This is the index of the first element of the vectorWidth-bit chunk
6608 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6609 IdxVal &= ~(ElemsPerChunk - 1);
6610
6611 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6613}
6614
6615/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6616/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6617/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6618/// simple superregister reference. Idx is an index in the 128 bits
6619/// we want. It need not be aligned to a 128-bit boundary. That makes
6620/// lowering INSERT_VECTOR_ELT operations easier.
6621static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6622 SelectionDAG &DAG, const SDLoc &dl) {
6623 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6624 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6625}
6626
6627/// Widen a vector to a larger size with the same scalar type, with the new
6628/// elements either zero or undef.
6629static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6630 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6631 const SDLoc &dl) {
6632 assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6633 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6634 "Unsupported vector widening type");
6635 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6636 : DAG.getUNDEF(VT);
6637 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6638 DAG.getIntPtrConstant(0, dl));
6639}
6640
6641/// Widen a vector to a larger size with the same scalar type, with the new
6642/// elements either zero or undef.
6643static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6644 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6645 const SDLoc &dl, unsigned WideSizeInBits) {
6646 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6647 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6648 "Unsupported vector widening type");
6649 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6650 MVT SVT = Vec.getSimpleValueType().getScalarType();
6651 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6652 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6653}
6654
6655// Helper function to collect subvector ops that are concatenated together,
6656// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
6657// The subvectors in Ops are guaranteed to be the same type.
6658static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6659 SelectionDAG &DAG) {
6660 assert(Ops.empty() && "Expected an empty ops vector");
6661
6662 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6663 Ops.append(N->op_begin(), N->op_end());
6664 return true;
6665 }
6666
6667 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6668 SDValue Src = N->getOperand(0);
6669 SDValue Sub = N->getOperand(1);
6670 const APInt &Idx = N->getConstantOperandAPInt(2);
6671 EVT VT = Src.getValueType();
6672 EVT SubVT = Sub.getValueType();
6673
6674 // TODO - Handle more general insert_subvector chains.
6675 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6676 // insert_subvector(undef, x, lo)
6677 if (Idx == 0 && Src.isUndef()) {
6678 Ops.push_back(Sub);
6679 Ops.push_back(DAG.getUNDEF(SubVT));
6680 return true;
6681 }
6682 if (Idx == (VT.getVectorNumElements() / 2)) {
6683 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6684 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6685 Src.getOperand(1).getValueType() == SubVT &&
6686 isNullConstant(Src.getOperand(2))) {
6687 Ops.push_back(Src.getOperand(1));
6688 Ops.push_back(Sub);
6689 return true;
6690 }
6691 // insert_subvector(x, extract_subvector(x, lo), hi)
6692 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6693 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6694 Ops.append(2, Sub);
6695 return true;
6696 }
6697 // insert_subvector(undef, x, hi)
6698 if (Src.isUndef()) {
6699 Ops.push_back(DAG.getUNDEF(SubVT));
6700 Ops.push_back(Sub);
6701 return true;
6702 }
6703 }
6704 }
6705 }
6706
6707 return false;
6708}
6709
6710static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6711 const SDLoc &dl) {
6712 EVT VT = Op.getValueType();
6713 unsigned NumElems = VT.getVectorNumElements();
6714 unsigned SizeInBits = VT.getSizeInBits();
6715 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6716 "Can't split odd sized vector");
6717
6718 // If this is a splat value (with no-undefs) then use the lower subvector,
6719 // which should be a free extraction.
6720 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6721 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6722 return std::make_pair(Lo, Lo);
6723
6724 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6725 return std::make_pair(Lo, Hi);
6726}
6727
6728/// Break an operation into 2 half sized ops and then concatenate the results.
6729static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6730 unsigned NumOps = Op.getNumOperands();
6731 EVT VT = Op.getValueType();
6732 SDLoc dl(Op);
6733
6734 // Extract the LHS Lo/Hi vectors
6735 SmallVector<SDValue> LoOps(NumOps, SDValue());
6736 SmallVector<SDValue> HiOps(NumOps, SDValue());
6737 for (unsigned I = 0; I != NumOps; ++I) {
6738 SDValue SrcOp = Op.getOperand(I);
6739 if (!SrcOp.getValueType().isVector()) {
6740 LoOps[I] = HiOps[I] = SrcOp;
6741 continue;
6742 }
6743 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6744 }
6745
6746 EVT LoVT, HiVT;
6747 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6748 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6749 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6750 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6751}
6752
6753/// Break an unary integer operation into 2 half sized ops and then
6754/// concatenate the result back.
6755static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6756 // Make sure we only try to split 256/512-bit types to avoid creating
6757 // narrow vectors.
6758 EVT VT = Op.getValueType();
6759 (void)VT;
6760 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6761 Op.getOperand(0).getValueType().is512BitVector()) &&
6762 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6763 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6764 VT.getVectorNumElements() &&
6765 "Unexpected VTs!");
6766 return splitVectorOp(Op, DAG);
6767}
6768
6769/// Break a binary integer operation into 2 half sized ops and then
6770/// concatenate the result back.
6771static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6772 // Assert that all the types match.
6773 EVT VT = Op.getValueType();
6774 (void)VT;
6775 assert(Op.getOperand(0).getValueType() == VT &&
6776 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6777 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6778 return splitVectorOp(Op, DAG);
6779}
6780
6781// Helper for splitting operands of an operation to legal target size and
6782// apply a function on each part.
6783// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6784// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6785// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6786// The argument Builder is a function that will be applied on each split part:
6787// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6788template <typename F>
6789SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6790 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6791 F Builder, bool CheckBWI = true) {
6792 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6793 unsigned NumSubs = 1;
6794 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6795 (!CheckBWI && Subtarget.useAVX512Regs())) {
6796 if (VT.getSizeInBits() > 512) {
6797 NumSubs = VT.getSizeInBits() / 512;
6798 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6799 }
6800 } else if (Subtarget.hasAVX2()) {
6801 if (VT.getSizeInBits() > 256) {
6802 NumSubs = VT.getSizeInBits() / 256;
6803 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6804 }
6805 } else {
6806 if (VT.getSizeInBits() > 128) {
6807 NumSubs = VT.getSizeInBits() / 128;
6808 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6809 }
6810 }
6811
6812 if (NumSubs == 1)
6813 return Builder(DAG, DL, Ops);
6814
6815 SmallVector<SDValue, 4> Subs;
6816 for (unsigned i = 0; i != NumSubs; ++i) {
6817 SmallVector<SDValue, 2> SubOps;
6818 for (SDValue Op : Ops) {
6819 EVT OpVT = Op.getValueType();
6820 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6821 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6822 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6823 }
6824 Subs.push_back(Builder(DAG, DL, SubOps));
6825 }
6826 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6827}
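
SplitOpsAndApply splits the operands to the legal register width, runs the Builder callback on each piece, and concatenates the results. A much-reduced standalone sketch of that split-and-apply pattern (editorial illustration; the names are hypothetical, and it assumes the input divides evenly into parts):

#include <cassert>
#include <functional>
#include <vector>

// Chop Ops into PartSize-sized pieces, apply Builder to each piece, and
// concatenate the results. Assumes Ops.size() is a multiple of PartSize.
static std::vector<int>
splitAndApply(const std::vector<int> &Ops, size_t PartSize,
              const std::function<std::vector<int>(const std::vector<int> &)> &Builder) {
  std::vector<int> Result;
  for (size_t I = 0; I < Ops.size(); I += PartSize) {
    std::vector<int> Part(Ops.begin() + I, Ops.begin() + I + PartSize);
    std::vector<int> Built = Builder(Part);
    Result.insert(Result.end(), Built.begin(), Built.end());
  }
  return Result;
}

int main() {
  auto Doubler = [](const std::vector<int> &P) {
    std::vector<int> Out;
    for (int V : P)
      Out.push_back(V * 2);
    return Out;
  };
  std::vector<int> Out = splitAndApply({1, 2, 3, 4, 5, 6, 7, 8}, 4, Doubler);
  assert(Out.size() == 8 && Out[0] == 2 && Out[7] == 16);
  return 0;
}
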
6828
6829// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6830// targets.
6831static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6832 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6833 const X86Subtarget &Subtarget) {
6834 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6835 MVT SVT = VT.getScalarType();
6836
6837 // If we have a 32/64 splatted constant, splat it to DstTy to
6838 // encourage a foldable broadcast'd operand.
6839 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6840 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6841 // AVX512 broadcasts 32/64-bit operands.
6842 // TODO: Support float once getAVX512Node is used by fp-ops.
6843 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6844 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6845 return SDValue();
6846 // If we're not widening, don't bother if we're not bitcasting.
6847 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6848 return SDValue();
6849 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6850 APInt SplatValue, SplatUndef;
6851 unsigned SplatBitSize;
6852 bool HasAnyUndefs;
6853 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6854 HasAnyUndefs, OpEltSizeInBits) &&
6855 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6856 return DAG.getConstant(SplatValue, DL, DstVT);
6857 }
6858 return SDValue();
6859 };
6860
6861 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6862
6863 MVT DstVT = VT;
6864 if (Widen)
6865 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6866
6867 // Canonicalize src operands.
6868 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6869 for (SDValue &Op : SrcOps) {
6870 MVT OpVT = Op.getSimpleValueType();
6871 // Just pass through scalar operands.
6872 if (!OpVT.isVector())
6873 continue;
6874 assert(OpVT == VT && "Vector type mismatch");
6875
6876 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6877 Op = BroadcastOp;
6878 continue;
6879 }
6880
6881 // Just widen the subvector by inserting into an undef wide vector.
6882 if (Widen)
6883 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6884 }
6885
6886 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6887
6888 // Perform the 512-bit op then extract the bottom subvector.
6889 if (Widen)
6890 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6891 return Res;
6892}
6893
6894/// Insert i1-subvector to i1-vector.
6895static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897
6898 SDLoc dl(Op);
6899 SDValue Vec = Op.getOperand(0);
6900 SDValue SubVec = Op.getOperand(1);
6901 SDValue Idx = Op.getOperand(2);
6902 unsigned IdxVal = Op.getConstantOperandVal(2);
6903
6904 // Inserting undef is a nop. We can just return the original vector.
6905 if (SubVec.isUndef())
6906 return Vec;
6907
6908 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6909 return Op;
6910
6911 MVT OpVT = Op.getSimpleValueType();
6912 unsigned NumElems = OpVT.getVectorNumElements();
6913 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6914
6915 // Extend to natively supported kshift.
6916 MVT WideOpVT = OpVT;
6917 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6918 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6919
6920 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6921 // if necessary.
6922 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6923 // May need to promote to a legal type.
6924 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6925 DAG.getConstant(0, dl, WideOpVT),
6926 SubVec, Idx);
6927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6928 }
6929
6930 MVT SubVecVT = SubVec.getSimpleValueType();
6931 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6932 assert(IdxVal + SubVecNumElems <= NumElems &&
6933 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6934 "Unexpected index value in INSERT_SUBVECTOR");
6935
6936 SDValue Undef = DAG.getUNDEF(WideOpVT);
6937
6938 if (IdxVal == 0) {
6939 // Zero lower bits of the Vec
6940 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6941 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6942 ZeroIdx);
6943 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6944 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6945 // Merge them together, SubVec should be zero extended.
6946 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6947 DAG.getConstant(0, dl, WideOpVT),
6948 SubVec, ZeroIdx);
6949 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6951 }
6952
6953 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6954 Undef, SubVec, ZeroIdx);
6955
6956 if (Vec.isUndef()) {
6957 assert(IdxVal != 0 && "Unexpected index");
6958 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6959 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6961 }
6962
6963 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6964 assert(IdxVal != 0 && "Unexpected index");
6965 // If upper elements of Vec are known undef, then just shift into place.
6966 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6967 [](SDValue V) { return V.isUndef(); })) {
6968 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6969 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6970 } else {
6971 NumElems = WideOpVT.getVectorNumElements();
6972 unsigned ShiftLeft = NumElems - SubVecNumElems;
6973 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6974 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6975 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6976 if (ShiftRight != 0)
6977 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6978 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6979 }
6980 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6981 }
6982
6983 // Simple case when we put subvector in the upper part
6984 if (IdxVal + SubVecNumElems == NumElems) {
6985 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6986 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6987 if (SubVecNumElems * 2 == NumElems) {
6988 // Special case, use legal zero extending insert_subvector. This allows
6989 // isel to optimize when bits are known zero.
6990 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6991 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6992 DAG.getConstant(0, dl, WideOpVT),
6993 Vec, ZeroIdx);
6994 } else {
6995 // Otherwise use explicit shifts to zero the bits.
6996 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6997 Undef, Vec, ZeroIdx);
6998 NumElems = WideOpVT.getVectorNumElements();
6999 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7000 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7001 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7002 }
7003 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7005 }
7006
7007 // Inserting into the middle is more complicated.
7008
7009 NumElems = WideOpVT.getVectorNumElements();
7010
7011 // Widen the vector if needed.
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7013
7014 unsigned ShiftLeft = NumElems - SubVecNumElems;
7015 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7016
7017 // Do an optimization for the most frequently used types.
7018 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7019 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7020 Mask0.flipAllBits();
7021 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7022 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7023 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7024 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7025 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7026 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7027 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7028 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7029
7030 // Reduce to original width if needed.
7031 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7032 }
7033
7034 // Clear the upper bits of the subvector and move it to its insert position.
7035 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7036 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7037 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7038 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7039
7040 // Isolate the bits below the insertion point.
7041 unsigned LowShift = NumElems - IdxVal;
7042 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7043 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7044 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7045 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7046
7047 // Isolate the bits after the last inserted bit.
7048 unsigned HighShift = IdxVal + SubVecNumElems;
7049 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7050 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7051 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7052 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7053
7054 // Now OR all 3 pieces together.
7055 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7056 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7057
7058 // Reduce to original width if needed.
7059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7060}
7061
7062static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7063 const SDLoc &dl) {
7064 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7065 EVT SubVT = V1.getValueType();
7066 EVT SubSVT = SubVT.getScalarType();
7067 unsigned SubNumElts = SubVT.getVectorNumElements();
7068 unsigned SubVectorWidth = SubVT.getSizeInBits();
7069 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7070 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7071 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7072}
7073
7074/// Returns a vector of specified type with all bits set.
7075/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7076/// Then bitcast to their original type, ensuring they get CSE'd.
7077static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7078 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7079 "Expected a 128/256/512-bit vector type");
7080
7081 APInt Ones = APInt::getAllOnes(32);
7082 unsigned NumElts = VT.getSizeInBits() / 32;
7083 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7084 return DAG.getBitcast(VT, Vec);
7085}
7086
7087static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7088 SDValue In, SelectionDAG &DAG) {
7089 EVT InVT = In.getValueType();
7090 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7091 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7092 ISD::ZERO_EXTEND == Opcode) &&
7093 "Unknown extension opcode");
7094
7095 // For 256-bit vectors, we only need the lower (128-bit) input half.
7096 // For 512-bit vectors, we only need the lower input half or quarter.
7097 if (InVT.getSizeInBits() > 128) {
7098 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7099 "Expected VTs to be the same size!");
7100 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7101 In = extractSubVector(In, 0, DAG, DL,
7102 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7103 InVT = In.getValueType();
7104 }
7105
7106 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7107 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7108
7109 return DAG.getNode(Opcode, DL, VT, In);
7110}
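// Example (a sketch, assuming Opcode = ISD::ZERO_EXTEND, VT = MVT::v8i32 and
// In of type v32i8, i.e. both 256 bits wide): Scale = 32 / 8 = 4, so only
// max(128, 256 / 4) = 128 bits of the input are kept and the lower v16i8 half
// is extracted. The element counts now differ (8 vs 16), so the opcode is
// switched to ZERO_EXTEND_VECTOR_INREG, which extends just the low 8 bytes
// into the v8i32 result.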
7111
7112// Match (xor X, -1) -> X.
7113// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7114// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7115static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7116 V = peekThroughBitcasts(V);
7117 if (V.getOpcode() == ISD::XOR &&
7118 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7119 isAllOnesConstant(V.getOperand(1))))
7120 return V.getOperand(0);
7121 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7122 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7123 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7124 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7125 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7126 Not, V.getOperand(1));
7127 }
7128 }
7129 SmallVector<SDValue, 2> CatOps;
7130 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7131 for (SDValue &CatOp : CatOps) {
7132 SDValue NotCat = IsNOT(CatOp, DAG);
7133 if (!NotCat) return SDValue();
7134 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7135 }
7136 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7137 }
7138 return SDValue();
7139}
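// Note: because the match peeks through bitcasts, the returned value may have
// a different type than the operand that was passed in; the recursive
// EXTRACT_SUBVECTOR and concat cases above bitcast the result back before
// reusing it, and callers generally need to do the same.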
7140
7141void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7142 bool Lo, bool Unary) {
7143 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7144 "Illegal vector type to unpack");
7145 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7146 int NumElts = VT.getVectorNumElements();
7147 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7148 for (int i = 0; i < NumElts; ++i) {
7149 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7150 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7151 Pos += (Unary ? 0 : NumElts * (i % 2));
7152 Pos += (Lo ? 0 : NumEltsInLane / 2);
7153 Mask.push_back(Pos);
7154 }
7155}
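// Example (a sketch, assuming VT = MVT::v8i32, Lo = true, Unary = false): two
// 128-bit lanes of 4 elements give the interleaving mask
//   <0, 8, 1, 9, 4, 12, 5, 13>
// i.e. the per-lane unpacklo pattern of VUNPCKLPS; Lo = false instead yields
//   <2, 10, 3, 11, 6, 14, 7, 15>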
7156
7157/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7158/// imposed by AVX and specific to the unary pattern. Example:
7159/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7160/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7161void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7162 bool Lo) {
7163 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7164 int NumElts = VT.getVectorNumElements();
7165 for (int i = 0; i < NumElts; ++i) {
7166 int Pos = i / 2;
7167 Pos += (Lo ? 0 : NumElts / 2);
7168 Mask.push_back(Pos);
7169 }
7170}
7171
7172// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7173static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7174 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7175 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7176 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7177 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7178 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7179 int M = Mask[I];
7180 if (M < 0)
7181 continue;
7182 SDValue V = (M < NumElts) ? V1 : V2;
7183 if (V.isUndef())
7184 continue;
7185 Ops[I] = V.getOperand(M % NumElts);
7186 }
7187 return DAG.getBuildVector(VT, dl, Ops);
7188 }
7189
7190 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7191}
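// Example (a sketch): with constant v4i32 inputs V1 = <C0, C1, C2, C3> and
// V2 = <C4, C5, C6, C7> and Mask = <0, 5, 2, 7>, the fold above returns the
// build_vector <C0, C5, C2, C7> directly instead of creating a
// VECTOR_SHUFFLE node.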
7192
7193/// Returns a vector_shuffle node for an unpackl operation.
7194static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7195 SDValue V1, SDValue V2) {
7196 SmallVector<int, 8> Mask;
7197 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7198 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7199}
7200
7201/// Returns a vector_shuffle node for an unpackh operation.
7202static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7203 SDValue V1, SDValue V2) {
7204 SmallVector<int, 8> Mask;
7205 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7206 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7207}
7208
7209/// Returns a node that packs the LHS + RHS nodes together at half width.
7210/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7211/// TODO: Add subvector splitting if/when we have a need for it.
7212static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7213 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7214 bool PackHiHalf = false) {
7215 MVT OpVT = LHS.getSimpleValueType();
7216 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7217 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7218 assert(OpVT == RHS.getSimpleValueType() &&
7219 VT.getSizeInBits() == OpVT.getSizeInBits() &&
7220 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7221 "Unexpected PACK operand types");
7222 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7223 "Unexpected PACK result type");
7224
7225 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7226 if (EltSizeInBits == 32) {
7227 SmallVector<int> PackMask;
7228 int Offset = PackHiHalf ? 1 : 0;
7229 int NumElts = VT.getVectorNumElements();
7230 for (int I = 0; I != NumElts; I += 4) {
7231 PackMask.push_back(I + Offset);
7232 PackMask.push_back(I + Offset + 2);
7233 PackMask.push_back(I + Offset + NumElts);
7234 PackMask.push_back(I + Offset + NumElts + 2);
7235 }
7236 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7237 DAG.getBitcast(VT, RHS), PackMask);
7238 }
7239
7240 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7241 if (!PackHiHalf) {
7242 if (UsePackUS &&
7243 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7244 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7245 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7246
7247 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7248 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7249 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7250 }
7251
7252 // Fallback to sign/zero extending the requested half and pack.
7253 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7254 if (UsePackUS) {
7255 if (PackHiHalf) {
7256 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7257 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7258 } else {
7259 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7260 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7261 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7262 };
7263 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7264 };
7265
7266 if (!PackHiHalf) {
7267 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7268 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7269 }
7270 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7271 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7272 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7273}
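// Example of the fallback path (a sketch, assuming a vXi32 -> vXi16 pack,
// i.e. EltSizeInBits = 16, with no useful known-bits information):
//   PackHiHalf = false, SSE4.1: AND each i32 lane with 0xFFFF, then PACKUS.
//   PackHiHalf = true,  SSE4.1: logical shift each lane right by 16, then PACKUS.
//   no SSE4.1: shift left by 16 (low half only), arithmetic shift right by 16
//              to sign-fill the lane, then PACKSS.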
7274
7275/// Return a vector_shuffle of the specified vector of zero or undef vector.
7276/// This produces a shuffle where the low element of V2 is swizzled into the
7277/// zero/undef vector, landing at element Idx.
7278/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7279static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7280 bool IsZero,
7281 const X86Subtarget &Subtarget,
7282 SelectionDAG &DAG) {
7283 MVT VT = V2.getSimpleValueType();
7284 SDValue V1 = IsZero
7285 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7286 int NumElems = VT.getVectorNumElements();
7287 SmallVector<int, 16> MaskVec(NumElems);
7288 for (int i = 0; i != NumElems; ++i)
7289 // If this is the insertion idx, put the low elt of V2 here.
7290 MaskVec[i] = (i == Idx) ? NumElems : i;
7291 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7292}
7293
7294static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7295 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7296 Ptr.getOpcode() == X86ISD::WrapperRIP)
7297 Ptr = Ptr.getOperand(0);
7298
7299 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7300 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7301 return nullptr;
7302
7303 return CNode->getConstVal();
7304}
7305
7306static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7307 if (!Load || !ISD::isNormalLoad(Load))
7308 return nullptr;
7309 return getTargetConstantFromBasePtr(Load->getBasePtr());
7310}
7311
7312static const Constant *getTargetConstantFromNode(SDValue Op) {
7313 Op = peekThroughBitcasts(Op);
7314 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7315}
7316
7317const Constant *
7318X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7319 assert(LD && "Unexpected null LoadSDNode");
7320 return getTargetConstantFromNode(LD);
7321}
7322
7323// Extract raw constant bits from constant pools.
7324static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7325 APInt &UndefElts,
7326 SmallVectorImpl<APInt> &EltBits,
7327 bool AllowWholeUndefs = true,
7328 bool AllowPartialUndefs = true) {
7329 assert(EltBits.empty() && "Expected an empty EltBits vector");
7330
7331 Op = peekThroughBitcasts(Op);
7332
7333 EVT VT = Op.getValueType();
7334 unsigned SizeInBits = VT.getSizeInBits();
7335 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7336 unsigned NumElts = SizeInBits / EltSizeInBits;
7337
7338 // Bitcast a source array of element bits to the target size.
7339 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7340 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7341 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7342 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7343 "Constant bit sizes don't match");
7344
7345 // Don't split if we don't allow undef bits.
7346 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7347 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7348 return false;
7349
7350 // If we're already the right size, don't bother bitcasting.
7351 if (NumSrcElts == NumElts) {
7352 UndefElts = UndefSrcElts;
7353 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7354 return true;
7355 }
7356
7357 // Extract all the undef/constant element data and pack into single bitsets.
7358 APInt UndefBits(SizeInBits, 0);
7359 APInt MaskBits(SizeInBits, 0);
7360
7361 for (unsigned i = 0; i != NumSrcElts; ++i) {
7362 unsigned BitOffset = i * SrcEltSizeInBits;
7363 if (UndefSrcElts[i])
7364 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7365 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7366 }
7367
7368 // Split the undef/constant single bitset data into the target elements.
7369 UndefElts = APInt(NumElts, 0);
7370 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7371
7372 for (unsigned i = 0; i != NumElts; ++i) {
7373 unsigned BitOffset = i * EltSizeInBits;
7374 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7375
7376 // Only treat an element as UNDEF if all bits are UNDEF.
7377 if (UndefEltBits.isAllOnes()) {
7378 if (!AllowWholeUndefs)
7379 return false;
7380 UndefElts.setBit(i);
7381 continue;
7382 }
7383
7384 // If only some bits are UNDEF then treat them as zero (or bail if not
7385 // supported).
7386 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7387 return false;
7388
7389 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7390 }
7391 return true;
7392 };
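// Worked example for the recast above (a sketch): splitting a 128-bit source
// of two i64 elements into EltSizeInBits = 32 gives NumElts = 4; a source
// element 0x00000001FFFFFFFF at index 0 becomes the i32 elements 0xFFFFFFFF
// (bit offset 0) and 0x00000001 (bit offset 32), and a target element is only
// marked UNDEF when all 32 of its bits came from undef source elements.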
7393
7394 // Collect constant bits and insert into mask/undef bit masks.
7395 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7396 unsigned UndefBitIndex) {
7397 if (!Cst)
7398 return false;
7399 if (isa<UndefValue>(Cst)) {
7400 Undefs.setBit(UndefBitIndex);
7401 return true;
7402 }
7403 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7404 Mask = CInt->getValue();
7405 return true;
7406 }
7407 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7408 Mask = CFP->getValueAPF().bitcastToAPInt();
7409 return true;
7410 }
7411 return false;
7412 };
7413
7414 // Handle UNDEFs.
7415 if (Op.isUndef()) {
7416 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7417 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7418 return CastBitData(UndefSrcElts, SrcEltBits);
7419 }
7420
7421 // Extract scalar constant bits.
7422 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7423 APInt UndefSrcElts = APInt::getZero(1);
7424 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7425 return CastBitData(UndefSrcElts, SrcEltBits);
7426 }
7427 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7428 APInt UndefSrcElts = APInt::getZero(1);
7429 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7430 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7431 return CastBitData(UndefSrcElts, SrcEltBits);
7432 }
7433
7434 // Extract constant bits from build vector.
7435 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7436 BitVector Undefs;
7437 SmallVector<APInt> SrcEltBits;
7438 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7439 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7440 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7441 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7442 if (Undefs[I])
7443 UndefSrcElts.setBit(I);
7444 return CastBitData(UndefSrcElts, SrcEltBits);
7445 }
7446 }
7447
7448 // Extract constant bits from constant pool vector.
7449 if (auto *Cst = getTargetConstantFromNode(Op)) {
7450 Type *CstTy = Cst->getType();
7451 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7452 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7453 return false;
7454
7455 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7456 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7457
7458 APInt UndefSrcElts(NumSrcElts, 0);
7459 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7460 for (unsigned i = 0; i != NumSrcElts; ++i)
7461 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7462 UndefSrcElts, i))
7463 return false;
7464
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467
7468 // Extract constant bits from a broadcasted constant pool scalar.
7469 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7470 EltSizeInBits <= VT.getScalarSizeInBits()) {
7471 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7472 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7473 return false;
7474
7475 SDValue Ptr = MemIntr->getBasePtr();
7476 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7477 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7478 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7479
7480 APInt UndefSrcElts(NumSrcElts, 0);
7481 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7482 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7483 if (UndefSrcElts[0])
7484 UndefSrcElts.setBits(0, NumSrcElts);
7485 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488 }
7489 }
7490
7491 // Extract constant bits from a subvector broadcast.
7492 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7493 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7494 SDValue Ptr = MemIntr->getBasePtr();
7495 // The source constant may be larger than the subvector broadcast,
7496 // ensure we extract the correct subvector constants.
7497 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7498 Type *CstTy = Cst->getType();
7499 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7500 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7501 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7502 (SizeInBits % SubVecSizeInBits) != 0)
7503 return false;
7504 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7505 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7506 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7507 APInt UndefSubElts(NumSubElts, 0);
7508 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7509 APInt(CstEltSizeInBits, 0));
7510 for (unsigned i = 0; i != NumSubElts; ++i) {
7511 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7512 UndefSubElts, i))
7513 return false;
7514 for (unsigned j = 1; j != NumSubVecs; ++j)
7515 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7516 }
7517 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7518 UndefSubElts);
7519 return CastBitData(UndefSubElts, SubEltBits);
7520 }
7521 }
7522
7523 // Extract a rematerialized scalar constant insertion.
7524 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7525 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7526 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7527 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7528 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7529
7530 APInt UndefSrcElts(NumSrcElts, 0);
7531 SmallVector<APInt, 64> SrcEltBits;
7532 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7533 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7534 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7535 return CastBitData(UndefSrcElts, SrcEltBits);
7536 }
7537
7538 // Insert constant bits from a base and sub vector sources.
7539 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7540 // If bitcasts to larger elements we might lose track of undefs - don't
7541 // allow any to be safe.
7542 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7543 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7544
7545 APInt UndefSrcElts, UndefSubElts;
7546 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7547 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7548 UndefSubElts, EltSubBits,
7549 AllowWholeUndefs && AllowUndefs,
7550 AllowPartialUndefs && AllowUndefs) &&
7551 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7552 UndefSrcElts, EltSrcBits,
7553 AllowWholeUndefs && AllowUndefs,
7554 AllowPartialUndefs && AllowUndefs)) {
7555 unsigned BaseIdx = Op.getConstantOperandVal(2);
7556 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7557 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7558 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7559 return CastBitData(UndefSrcElts, EltSrcBits);
7560 }
7561 }
7562
7563 // Extract constant bits from a subvector's source.
7564 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7565 // TODO - support extract_subvector through bitcasts.
7566 if (EltSizeInBits != VT.getScalarSizeInBits())
7567 return false;
7568
7569 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7570 UndefElts, EltBits, AllowWholeUndefs,
7571 AllowPartialUndefs)) {
7572 EVT SrcVT = Op.getOperand(0).getValueType();
7573 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7574 unsigned NumSubElts = VT.getVectorNumElements();
7575 unsigned BaseIdx = Op.getConstantOperandVal(1);
7576 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7577 if ((BaseIdx + NumSubElts) != NumSrcElts)
7578 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7579 if (BaseIdx != 0)
7580 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7581 return true;
7582 }
7583 }
7584
7585 // Extract constant bits from shuffle node sources.
7586 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7587 // TODO - support shuffle through bitcasts.
7588 if (EltSizeInBits != VT.getScalarSizeInBits())
7589 return false;
7590
7591 ArrayRef<int> Mask = SVN->getMask();
7592 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7593 llvm::any_of(Mask, [](int M) { return M < 0; }))
7594 return false;
7595
7596 APInt UndefElts0, UndefElts1;
7597 SmallVector<APInt, 32> EltBits0, EltBits1;
7598 if (isAnyInRange(Mask, 0, NumElts) &&
7599 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7600 UndefElts0, EltBits0, AllowWholeUndefs,
7601 AllowPartialUndefs))
7602 return false;
7603 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7604 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7605 UndefElts1, EltBits1, AllowWholeUndefs,
7606 AllowPartialUndefs))
7607 return false;
7608
7609 UndefElts = APInt::getZero(NumElts);
7610 for (int i = 0; i != (int)NumElts; ++i) {
7611 int M = Mask[i];
7612 if (M < 0) {
7613 UndefElts.setBit(i);
7614 EltBits.push_back(APInt::getZero(EltSizeInBits));
7615 } else if (M < (int)NumElts) {
7616 if (UndefElts0[M])
7617 UndefElts.setBit(i);
7618 EltBits.push_back(EltBits0[M]);
7619 } else {
7620 if (UndefElts1[M - NumElts])
7621 UndefElts.setBit(i);
7622 EltBits.push_back(EltBits1[M - NumElts]);
7623 }
7624 }
7625 return true;
7626 }
7627
7628 return false;
7629}
7630
7631namespace llvm {
7632namespace X86 {
7633bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7634 APInt UndefElts;
7635 SmallVector<APInt, 16> EltBits;
7636 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7637 UndefElts, EltBits, true,
7638 AllowPartialUndefs)) {
7639 int SplatIndex = -1;
7640 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7641 if (UndefElts[i])
7642 continue;
7643 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7644 SplatIndex = -1;
7645 break;
7646 }
7647 SplatIndex = i;
7648 }
7649 if (0 <= SplatIndex) {
7650 SplatVal = EltBits[SplatIndex];
7651 return true;
7652 }
7653 }
7654
7655 return false;
7656}
7657} // namespace X86
7658} // namespace llvm
7659
7660static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7661 unsigned MaskEltSizeInBits,
7662 SmallVectorImpl<uint64_t> &RawMask,
7663 APInt &UndefElts) {
7664 // Extract the raw target constant bits.
7665 SmallVector<APInt, 64> EltBits;
7666 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7667 EltBits, /* AllowWholeUndefs */ true,
7668 /* AllowPartialUndefs */ false))
7669 return false;
7670
7671 // Insert the extracted elements into the mask.
7672 for (const APInt &Elt : EltBits)
7673 RawMask.push_back(Elt.getZExtValue());
7674
7675 return true;
7676}
7677
7678/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7679/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7680/// Note: This ignores saturation, so inputs must be checked first.
7681static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7682 bool Unary, unsigned NumStages = 1) {
7683 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7684 unsigned NumElts = VT.getVectorNumElements();
7685 unsigned NumLanes = VT.getSizeInBits() / 128;
7686 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7687 unsigned Offset = Unary ? 0 : NumElts;
7688 unsigned Repetitions = 1u << (NumStages - 1);
7689 unsigned Increment = 1u << NumStages;
7690 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7691
7692 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7693 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7694 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7695 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7696 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7697 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7698 }
7699 }
7700}
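// Example (a sketch, assuming MVT::v16i8, Unary = false, NumStages = 1): one
// 128-bit lane with Increment = 2 and Offset = 16 gives the mask
//   <0, 2, 4, ..., 14, 16, 18, ..., 30>
// i.e. the even bytes of each operand, which is what a single PACKSSWB or
// PACKUSWB produces once saturation is ignored.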
7701
7702// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7703static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7704 APInt &DemandedLHS, APInt &DemandedRHS) {
7705 int NumLanes = VT.getSizeInBits() / 128;
7706 int NumElts = DemandedElts.getBitWidth();
7707 int NumInnerElts = NumElts / 2;
7708 int NumEltsPerLane = NumElts / NumLanes;
7709 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7710
7711 DemandedLHS = APInt::getZero(NumInnerElts);
7712 DemandedRHS = APInt::getZero(NumInnerElts);
7713
7714 // Map DemandedElts to the packed operands.
7715 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7716 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7717 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7718 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7719 if (DemandedElts[OuterIdx])
7720 DemandedLHS.setBit(InnerIdx);
7721 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7722 DemandedRHS.setBit(InnerIdx);
7723 }
7724 }
7725}
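// Example (a sketch, assuming a 128-bit pack with a v16i8 result): the low 8
// result bytes come from the LHS words and the high 8 from the RHS words, so
// demanding result element 3 sets DemandedLHS[3] while demanding result
// element 9 sets DemandedRHS[1].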
7726
7727// Split the demanded elts of a HADD/HSUB node between its operands.
7728static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7729 APInt &DemandedLHS, APInt &DemandedRHS) {
7730 int NumLanes = VT.getSizeInBits() / 128;
7731 int NumElts = DemandedElts.getBitWidth();
7732 int NumEltsPerLane = NumElts / NumLanes;
7733 int HalfEltsPerLane = NumEltsPerLane / 2;
7734
7735 DemandedLHS = APInt::getZero(NumElts);
7736 DemandedRHS = APInt::getZero(NumElts);
7737
7738 // Map DemandedElts to the horizontal operands.
7739 for (int Idx = 0; Idx != NumElts; ++Idx) {
7740 if (!DemandedElts[Idx])
7741 continue;
7742 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7743 int LocalIdx = Idx % NumEltsPerLane;
7744 if (LocalIdx < HalfEltsPerLane) {
7745 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7746 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7747 } else {
7748 LocalIdx -= HalfEltsPerLane;
7749 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7750 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7751 }
7752 }
7753}
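// Example (a sketch, assuming a 256-bit v8i32 HADD): each result element adds
// an adjacent pair from one operand, so demanding result element 1 sets
// DemandedLHS[2] and DemandedLHS[3], demanding element 3 sets DemandedRHS[2]
// and DemandedRHS[3], and demanding element 6 (second lane) sets
// DemandedRHS[4] and DemandedRHS[5].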
7754
7755/// Calculates the shuffle mask corresponding to the target-specific opcode.
7756/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7757/// operands in \p Ops, and returns true.
7758/// Sets \p IsUnary to true if only one source is used. Note that this will set
7759/// IsUnary for shuffles which use a single input multiple times, and in those
7760/// cases it will adjust the mask to only have indices within that single input.
7761/// It is an error to call this with non-empty Mask/Ops vectors.
7762static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7763 SmallVectorImpl<SDValue> &Ops,
7764 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7765 unsigned NumElems = VT.getVectorNumElements();
7766 unsigned MaskEltSize = VT.getScalarSizeInBits();
7767 SmallVector<uint64_t, 32> RawMask;
7768 APInt RawUndefs;
7769 uint64_t ImmN;
7770
7771 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7772 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7773
7774 IsUnary = false;
7775 bool IsFakeUnary = false;
7776 switch (N->getOpcode()) {
7777 case X86ISD::BLENDI:
7778 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7779 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7780 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7781 DecodeBLENDMask(NumElems, ImmN, Mask);
7782 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7783 break;
7784 case X86ISD::SHUFP:
7785 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7786 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7787 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7788 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7789 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7790 break;
7791 case X86ISD::INSERTPS:
7792 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7793 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7794 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7795 DecodeINSERTPSMask(ImmN, Mask);
7796 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7797 break;
7798 case X86ISD::EXTRQI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7801 isa<ConstantSDNode>(N->getOperand(2))) {
7802 int BitLen = N->getConstantOperandVal(1);
7803 int BitIdx = N->getConstantOperandVal(2);
7804 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7805 IsUnary = true;
7806 }
7807 break;
7808 case X86ISD::INSERTQI:
7809 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7810 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7811 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7812 isa<ConstantSDNode>(N->getOperand(3))) {
7813 int BitLen = N->getConstantOperandVal(2);
7814 int BitIdx = N->getConstantOperandVal(3);
7815 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7816 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7817 }
7818 break;
7819 case X86ISD::UNPCKH:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7822 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7823 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7824 break;
7825 case X86ISD::UNPCKL:
7826 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7827 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7828 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7829 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7830 break;
7831 case X86ISD::MOVHLPS:
7832 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7833 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7834 DecodeMOVHLPSMask(NumElems, Mask);
7835 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7836 break;
7837 case X86ISD::MOVLHPS:
7838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7839 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7840 DecodeMOVLHPSMask(NumElems, Mask);
7841 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7842 break;
7843 case X86ISD::VALIGN:
7844 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7845 "Only 32-bit and 64-bit elements are supported!");
7846 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7847 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7848 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7849 DecodeVALIGNMask(NumElems, ImmN, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 Ops.push_back(N->getOperand(1));
7852 Ops.push_back(N->getOperand(0));
7853 break;
7854 case X86ISD::PALIGNR:
7855 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7856 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7857 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7858 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7859 DecodePALIGNRMask(NumElems, ImmN, Mask);
7860 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7861 Ops.push_back(N->getOperand(1));
7862 Ops.push_back(N->getOperand(0));
7863 break;
7864 case X86ISD::VSHLDQ:
7865 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7866 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7867 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7868 DecodePSLLDQMask(NumElems, ImmN, Mask);
7869 IsUnary = true;
7870 break;
7871 case X86ISD::VSRLDQ:
7872 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7873 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7874 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7875 DecodePSRLDQMask(NumElems, ImmN, Mask);
7876 IsUnary = true;
7877 break;
7878 case X86ISD::PSHUFD:
7879 case X86ISD::VPERMILPI:
7880 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7881 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7882 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7883 IsUnary = true;
7884 break;
7885 case X86ISD::PSHUFHW:
7886 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7887 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7888 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7889 IsUnary = true;
7890 break;
7891 case X86ISD::PSHUFLW:
7892 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7893 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7894 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7895 IsUnary = true;
7896 break;
7897 case X86ISD::VZEXT_MOVL:
7898 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7899 DecodeZeroMoveLowMask(NumElems, Mask);
7900 IsUnary = true;
7901 break;
7902 case X86ISD::VBROADCAST:
7903 // We only decode broadcasts of same-sized vectors, peeking through to
7904 // extracted subvectors is likely to cause hasOneUse issues with
7905 // SimplifyDemandedBits etc.
7906 if (N->getOperand(0).getValueType() == VT) {
7907 DecodeVectorBroadcast(NumElems, Mask);
7908 IsUnary = true;
7909 break;
7910 }
7911 return false;
7912 case X86ISD::VPERMILPV: {
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 IsUnary = true;
7915 SDValue MaskNode = N->getOperand(1);
7916 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7917 RawUndefs)) {
7918 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7919 break;
7920 }
7921 return false;
7922 }
7923 case X86ISD::PSHUFB: {
7924 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7925 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7926 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7927 IsUnary = true;
7928 SDValue MaskNode = N->getOperand(1);
7929 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7930 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7931 break;
7932 }
7933 return false;
7934 }
7935 case X86ISD::VPERMI:
7936 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7937 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7938 DecodeVPERMMask(NumElems, ImmN, Mask);
7939 IsUnary = true;
7940 break;
7941 case X86ISD::MOVSS:
7942 case X86ISD::MOVSD:
7943 case X86ISD::MOVSH:
7944 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7945 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7946 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7947 break;
7948 case X86ISD::VPERM2X128:
7949 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7950 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7951 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7952 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7953 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7954 break;
7955 case X86ISD::SHUF128:
7956 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7957 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7960 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7961 break;
7962 case X86ISD::MOVSLDUP:
7963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7964 DecodeMOVSLDUPMask(NumElems, Mask);
7965 IsUnary = true;
7966 break;
7967 case X86ISD::MOVSHDUP:
7968 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7969 DecodeMOVSHDUPMask(NumElems, Mask);
7970 IsUnary = true;
7971 break;
7972 case X86ISD::MOVDDUP:
7973 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7974 DecodeMOVDDUPMask(NumElems, Mask);
7975 IsUnary = true;
7976 break;
7977 case X86ISD::VPERMIL2: {
7978    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7979    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7980 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7981 SDValue MaskNode = N->getOperand(2);
7982 SDValue CtrlNode = N->getOperand(3);
7983 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7984 unsigned CtrlImm = CtrlOp->getZExtValue();
7985 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7986 RawUndefs)) {
7987 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7988 Mask);
7989 break;
7990 }
7991 }
7992 return false;
7993 }
7994 case X86ISD::VPPERM: {
7995    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7996    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7997 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7998 SDValue MaskNode = N->getOperand(2);
7999 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8000 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8001 break;
8002 }
8003 return false;
8004 }
8005 case X86ISD::VPERMV: {
8006    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8007 IsUnary = true;
8008 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8009 Ops.push_back(N->getOperand(1));
8010 SDValue MaskNode = N->getOperand(0);
8011 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8012 RawUndefs)) {
8013 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8014 break;
8015 }
8016 return false;
8017 }
8018 case X86ISD::VPERMV3: {
8019    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8020    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8021 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8022 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8023 Ops.push_back(N->getOperand(0));
8024 Ops.push_back(N->getOperand(2));
8025 SDValue MaskNode = N->getOperand(1);
8026 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8027 RawUndefs)) {
8028 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8029 break;
8030 }
8031 return false;
8032 }
8033  default: llvm_unreachable("unknown target shuffle node");
8034 }
8035
8036 // Empty mask indicates the decode failed.
8037 if (Mask.empty())
8038 return false;
8039
8040 // Check if we're getting a shuffle mask with zero'd elements.
8041 if (!AllowSentinelZero && isAnyZero(Mask))
8042 return false;
8043
8044 // If we have a fake unary shuffle, the shuffle mask is spread across two
8045 // inputs that are actually the same node. Re-map the mask to always point
8046 // into the first input.
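  // Illustrative example (not from the original comment): for a 4-element
  // fake unary shuffle, a mask of <0,5,2,7> is remapped to <0,1,2,3>, since
  // both inputs are the same node.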
8047 if (IsFakeUnary)
8048 for (int &M : Mask)
8049 if (M >= (int)Mask.size())
8050 M -= Mask.size();
8051
8052 // If we didn't already add operands in the opcode-specific code, default to
8053 // adding 1 or 2 operands starting at 0.
8054 if (Ops.empty()) {
8055 Ops.push_back(N->getOperand(0));
8056 if (!IsUnary || IsFakeUnary)
8057 Ops.push_back(N->getOperand(1));
8058 }
8059
8060 return true;
8061}
8062
8063// Wrapper for getTargetShuffleMask with IsUnary.
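// Illustrative usage (a sketch, not code from this file; N and VT are assumed
// to be in scope):
//   SmallVector<SDValue, 2> ShuffleOps;
//   SmallVector<int, 16> ShuffleMask;
//   if (getTargetShuffleMask(N, VT, /*AllowSentinelZero=*/false, ShuffleOps,
//                            ShuffleMask)) {
//     // Mask entries < ShuffleMask.size() index into ShuffleOps[0]; larger
//     // entries index into ShuffleOps[1].
//   }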
8064static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8065 SmallVectorImpl<SDValue> &Ops,
8066 SmallVectorImpl<int> &Mask) {
8067 bool IsUnary;
8068 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8069}
8070
8071/// Compute whether each element of a shuffle is zeroable.
8072///
8073/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8074/// Either it is an undef element in the shuffle mask, the element of the input
8075/// referenced is undef, or the element of the input referenced is known to be
8076/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8077/// as many lanes with this technique as possible to simplify the remaining
8078/// shuffle.
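// Worked example (illustrative): for Mask = <0,4,1,5> where V2 is a build
// vector of all zeros, elements 1 and 3 reference V2 and are reported in
// KnownZero; any negative (undef) mask entry would be reported in KnownUndef.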
8079static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8080 SDValue V1, SDValue V2,
8081 APInt &KnownUndef, APInt &KnownZero) {
8082 int Size = Mask.size();
8083 KnownUndef = KnownZero = APInt::getZero(Size);
8084
8085 V1 = peekThroughBitcasts(V1);
8086 V2 = peekThroughBitcasts(V2);
8087
8088 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8089 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8090
8091 int VectorSizeInBits = V1.getValueSizeInBits();
8092 int ScalarSizeInBits = VectorSizeInBits / Size;
8093  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8094
8095 for (int i = 0; i < Size; ++i) {
8096 int M = Mask[i];
8097 // Handle the easy cases.
8098 if (M < 0) {
8099 KnownUndef.setBit(i);
8100 continue;
8101 }
8102 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8103 KnownZero.setBit(i);
8104 continue;
8105 }
8106
8107 // Determine shuffle input and normalize the mask.
8108 SDValue V = M < Size ? V1 : V2;
8109 M %= Size;
8110
8111 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8112 if (V.getOpcode() != ISD::BUILD_VECTOR)
8113 continue;
8114
8115    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8116    // the (larger) source element must be UNDEF/ZERO.
8117 if ((Size % V.getNumOperands()) == 0) {
8118 int Scale = Size / V->getNumOperands();
8119 SDValue Op = V.getOperand(M / Scale);
8120 if (Op.isUndef())
8121 KnownUndef.setBit(i);
8122 if (X86::isZeroNode(Op))
8123 KnownZero.setBit(i);
8124 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8125 APInt Val = Cst->getAPIntValue();
8126 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8127 if (Val == 0)
8128 KnownZero.setBit(i);
8129 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8130 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8131 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8132 if (Val == 0)
8133 KnownZero.setBit(i);
8134 }
8135 continue;
8136 }
8137
8138    // If the BUILD_VECTOR has more elements, then all the (smaller) source
8139    // elements must be UNDEF or ZERO.
8140 if ((V.getNumOperands() % Size) == 0) {
8141 int Scale = V->getNumOperands() / Size;
8142 bool AllUndef = true;
8143 bool AllZero = true;
8144 for (int j = 0; j < Scale; ++j) {
8145 SDValue Op = V.getOperand((M * Scale) + j);
8146 AllUndef &= Op.isUndef();
8147 AllZero &= X86::isZeroNode(Op);
8148 }
8149 if (AllUndef)
8150 KnownUndef.setBit(i);
8151 if (AllZero)
8152 KnownZero.setBit(i);
8153 continue;
8154 }
8155 }
8156}
8157
8158/// Decode a target shuffle mask and inputs and see if any values are
8159/// known to be undef or zero from their inputs.
8160/// Returns true if the target shuffle mask was decoded.
8161/// FIXME: Merge this with computeZeroableShuffleElements?
8162static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8163 SmallVectorImpl<SDValue> &Ops,
8164 APInt &KnownUndef, APInt &KnownZero) {
8165 bool IsUnary;
8166 if (!isTargetShuffle(N.getOpcode()))
8167 return false;
8168
8169 MVT VT = N.getSimpleValueType();
8170 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8171 return false;
8172
8173 int Size = Mask.size();
8174 SDValue V1 = Ops[0];
8175 SDValue V2 = IsUnary ? V1 : Ops[1];
8176 KnownUndef = KnownZero = APInt::getZero(Size);
8177
8178 V1 = peekThroughBitcasts(V1);
8179 V2 = peekThroughBitcasts(V2);
8180
8181  assert((VT.getSizeInBits() % Size) == 0 &&
8182         "Illegal split of shuffle value type");
8183 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8184
8185 // Extract known constant input data.
8186 APInt UndefSrcElts[2];
8187 SmallVector<APInt, 32> SrcEltBits[2];
8188 bool IsSrcConstant[2] = {
8189 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8190 SrcEltBits[0], true, false),
8191 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8192 SrcEltBits[1], true, false)};
8193
8194 for (int i = 0; i < Size; ++i) {
8195 int M = Mask[i];
8196
8197 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8198 if (M < 0) {
8199      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8200 if (SM_SentinelUndef == M)
8201 KnownUndef.setBit(i);
8202 if (SM_SentinelZero == M)
8203 KnownZero.setBit(i);
8204 continue;
8205 }
8206
8207 // Determine shuffle input and normalize the mask.
8208 unsigned SrcIdx = M / Size;
8209 SDValue V = M < Size ? V1 : V2;
8210 M %= Size;
8211
8212 // We are referencing an UNDEF input.
8213 if (V.isUndef()) {
8214 KnownUndef.setBit(i);
8215 continue;
8216 }
8217
8218 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8219 // TODO: We currently only set UNDEF for integer types - floats use the same
8220 // registers as vectors and many of the scalar folded loads rely on the
8221 // SCALAR_TO_VECTOR pattern.
8222 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8223 (Size % V.getValueType().getVectorNumElements()) == 0) {
8224 int Scale = Size / V.getValueType().getVectorNumElements();
8225 int Idx = M / Scale;
8226 if (Idx != 0 && !VT.isFloatingPoint())
8227 KnownUndef.setBit(i);
8228 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8229 KnownZero.setBit(i);
8230 continue;
8231 }
8232
8233 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8234 // base vectors.
8235 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8236 SDValue Vec = V.getOperand(0);
8237 int NumVecElts = Vec.getValueType().getVectorNumElements();
8238 if (Vec.isUndef() && Size == NumVecElts) {
8239 int Idx = V.getConstantOperandVal(2);
8240 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8241 if (M < Idx || (Idx + NumSubElts) <= M)
8242 KnownUndef.setBit(i);
8243 }
8244 continue;
8245 }
8246
8247 // Attempt to extract from the source's constant bits.
8248 if (IsSrcConstant[SrcIdx]) {
8249 if (UndefSrcElts[SrcIdx][M])
8250 KnownUndef.setBit(i);
8251 else if (SrcEltBits[SrcIdx][M] == 0)
8252 KnownZero.setBit(i);
8253 }
8254 }
8255
8256  assert(VT.getVectorNumElements() == (unsigned)Size &&
8257         "Different mask size from vector size!");
8258 return true;
8259}
8260
8261// Replace target shuffle mask elements with known undef/zero sentinels.
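// For example (illustrative): with Mask = <0,1,2,3>, bit 1 set in KnownUndef
// and bit 3 set in KnownZero, the mask becomes
// <0, SM_SentinelUndef, 2, SM_SentinelZero> (the zero substitution is skipped
// when ResolveKnownZeros is false).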
8262static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8263 const APInt &KnownUndef,
8264 const APInt &KnownZero,
8265 bool ResolveKnownZeros= true) {
8266 unsigned NumElts = Mask.size();
8267  assert(KnownUndef.getBitWidth() == NumElts &&
8268         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8269
8270 for (unsigned i = 0; i != NumElts; ++i) {
8271 if (KnownUndef[i])
8272 Mask[i] = SM_SentinelUndef;
8273 else if (ResolveKnownZeros && KnownZero[i])
8274 Mask[i] = SM_SentinelZero;
8275 }
8276}
8277
8278// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8279static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8280 APInt &KnownUndef,
8281 APInt &KnownZero) {
8282 unsigned NumElts = Mask.size();
8283 KnownUndef = KnownZero = APInt::getZero(NumElts);
8284
8285 for (unsigned i = 0; i != NumElts; ++i) {
8286 int M = Mask[i];
8287 if (SM_SentinelUndef == M)
8288 KnownUndef.setBit(i);
8289 if (SM_SentinelZero == M)
8290 KnownZero.setBit(i);
8291 }
8292}
8293
8294// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
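// For example (illustrative): a v4i32 VSELECT whose condition is the constant
// vector <-1,0,-1,0> yields Mask = <0,5,2,7>, i.e. lanes 0 and 2 are taken
// from the "true" operand and lanes 1 and 3 from the "false" operand.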
8295static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8296 SDValue Cond, bool IsBLENDV = false) {
8297 EVT CondVT = Cond.getValueType();
8298 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8299 unsigned NumElts = CondVT.getVectorNumElements();
8300
8301 APInt UndefElts;
8302 SmallVector<APInt, 32> EltBits;
8303 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8304 true, false))
8305 return false;
8306
8307 Mask.resize(NumElts, SM_SentinelUndef);
8308
8309 for (int i = 0; i != (int)NumElts; ++i) {
8310 Mask[i] = i;
8311 // Arbitrarily choose from the 2nd operand if the select condition element
8312 // is undef.
8313 // TODO: Can we do better by matching patterns such as even/odd?
8314 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8315 (IsBLENDV && EltBits[i].isNonNegative()))
8316 Mask[i] += NumElts;
8317 }
8318
8319 return true;
8320}
8321
8322// Forward declaration (for getFauxShuffleMask recursive check).
8323static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8324 SmallVectorImpl<SDValue> &Inputs,
8325 SmallVectorImpl<int> &Mask,
8326 const SelectionDAG &DAG, unsigned Depth,
8327 bool ResolveKnownElts);
8328
8329// Attempt to decode ops that could be represented as a shuffle mask.
8330// The decoded shuffle mask may contain a different number of elements than the
8331// destination value type.
8332// TODO: Merge into getTargetShuffleInputs()
8333static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8334 SmallVectorImpl<int> &Mask,
8335 SmallVectorImpl<SDValue> &Ops,
8336 const SelectionDAG &DAG, unsigned Depth,
8337 bool ResolveKnownElts) {
8338 Mask.clear();
8339 Ops.clear();
8340
8341 MVT VT = N.getSimpleValueType();
8342 unsigned NumElts = VT.getVectorNumElements();
8343 unsigned NumSizeInBits = VT.getSizeInBits();
8344 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8345 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8346 return false;
8347  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8348 unsigned NumSizeInBytes = NumSizeInBits / 8;
8349 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8350
8351 unsigned Opcode = N.getOpcode();
8352 switch (Opcode) {
8353 case ISD::VECTOR_SHUFFLE: {
8354 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8355 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8356 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8357 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8358 Ops.push_back(N.getOperand(0));
8359 Ops.push_back(N.getOperand(1));
8360 return true;
8361 }
8362 return false;
8363 }
8364 case ISD::AND:
8365 case X86ISD::ANDNP: {
8366 // Attempt to decode as a per-byte mask.
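    // For example (a sketch, assuming little-endian byte order within each
    // lane): AND of a v4i32 value with a splatted 0x000000FF constant decodes
    // at byte granularity to <0,Z,Z,Z,4,Z,Z,Z,8,Z,Z,Z,12,Z,Z,Z> with
    // Z = SM_SentinelZero, i.e. only the low byte of each lane survives.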
8367 APInt UndefElts;
8368 SmallVector<APInt, 32> EltBits;
8369 SDValue N0 = N.getOperand(0);
8370 SDValue N1 = N.getOperand(1);
8371 bool IsAndN = (X86ISD::ANDNP == Opcode);
8372 uint64_t ZeroMask = IsAndN ? 255 : 0;
8373 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8374 return false;
8375 // We can't assume an undef src element gives an undef dst - the other src
8376 // might be zero.
8377 if (!UndefElts.isZero())
8378 return false;
8379 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8380 const APInt &ByteBits = EltBits[i];
8381 if (ByteBits != 0 && ByteBits != 255)
8382 return false;
8383 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8384 }
8385 Ops.push_back(IsAndN ? N1 : N0);
8386 return true;
8387 }
8388 case ISD::OR: {
8389 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8390 // is a valid shuffle index.
8391 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8392 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8393 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8394 return false;
8395
8396 SmallVector<int, 64> SrcMask0, SrcMask1;
8397 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8398 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8399 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8400 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8401 Depth + 1, true) ||
8402 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8403 Depth + 1, true))
8404 return false;
8405
8406 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8407 SmallVector<int, 64> Mask0, Mask1;
8408 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8409 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8410 for (int i = 0; i != (int)MaskSize; ++i) {
8411 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8412 // loops converting between OR and BLEND shuffles due to
8413 // canWidenShuffleElements merging away undef elements, meaning we
8414 // fail to recognise the OR as the undef element isn't known zero.
8415 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8416 Mask.push_back(SM_SentinelZero);
8417 else if (Mask1[i] == SM_SentinelZero)
8418 Mask.push_back(i);
8419 else if (Mask0[i] == SM_SentinelZero)
8420 Mask.push_back(i + MaskSize);
8421 else
8422 return false;
8423 }
8424 Ops.push_back(N0);
8425 Ops.push_back(N1);
8426 return true;
8427 }
8428 case ISD::INSERT_SUBVECTOR: {
8429 SDValue Src = N.getOperand(0);
8430 SDValue Sub = N.getOperand(1);
8431 EVT SubVT = Sub.getValueType();
8432 unsigned NumSubElts = SubVT.getVectorNumElements();
8433 if (!N->isOnlyUserOf(Sub.getNode()))
8434 return false;
8435 uint64_t InsertIdx = N.getConstantOperandVal(2);
8436 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8437 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8438 Sub.getOperand(0).getValueType() == VT) {
8439 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8440 for (int i = 0; i != (int)NumElts; ++i)
8441 Mask.push_back(i);
8442 for (int i = 0; i != (int)NumSubElts; ++i)
8443 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8444 Ops.push_back(Src);
8445 Ops.push_back(Sub.getOperand(0));
8446 return true;
8447 }
8448 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8449 SmallVector<int, 64> SubMask;
8450 SmallVector<SDValue, 2> SubInputs;
8451 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8452 EVT SubSrcVT = SubSrc.getValueType();
8453 if (!SubSrcVT.isVector())
8454 return false;
8455
8456 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8457 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8458 Depth + 1, ResolveKnownElts))
8459 return false;
8460
8461 // Subvector shuffle inputs must not be larger than the subvector.
8462 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8463 return SubVT.getFixedSizeInBits() <
8464 SubInput.getValueSizeInBits().getFixedValue();
8465 }))
8466 return false;
8467
8468 if (SubMask.size() != NumSubElts) {
8469      assert(((SubMask.size() % NumSubElts) == 0 ||
8470              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8471 if ((NumSubElts % SubMask.size()) == 0) {
8472 int Scale = NumSubElts / SubMask.size();
8473 SmallVector<int,64> ScaledSubMask;
8474 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8475 SubMask = ScaledSubMask;
8476 } else {
8477 int Scale = SubMask.size() / NumSubElts;
8478 NumSubElts = SubMask.size();
8479 NumElts *= Scale;
8480 InsertIdx *= Scale;
8481 }
8482 }
8483 Ops.push_back(Src);
8484 Ops.append(SubInputs.begin(), SubInputs.end());
8485 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8486 Mask.append(NumElts, SM_SentinelZero);
8487 else
8488 for (int i = 0; i != (int)NumElts; ++i)
8489 Mask.push_back(i);
8490 for (int i = 0; i != (int)NumSubElts; ++i) {
8491 int M = SubMask[i];
8492 if (0 <= M) {
8493 int InputIdx = M / NumSubElts;
8494 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8495 }
8496 Mask[i + InsertIdx] = M;
8497 }
8498 return true;
8499 }
8500 case X86ISD::PINSRB:
8501 case X86ISD::PINSRW:
8502 case ISD::SCALAR_TO_VECTOR:
8503 case ISD::INSERT_VECTOR_ELT: {
8504    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8505    // vector, for matching src/dst vector types.
8506 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8507
8508 unsigned DstIdx = 0;
8509 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8510 // Check we have an in-range constant insertion index.
8511 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8512 N.getConstantOperandAPInt(2).uge(NumElts))
8513 return false;
8514 DstIdx = N.getConstantOperandVal(2);
8515
8516 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8517 if (X86::isZeroNode(Scl)) {
8518 Ops.push_back(N.getOperand(0));
8519 for (unsigned i = 0; i != NumElts; ++i)
8520 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8521 return true;
8522 }
8523 }
8524
8525 // Peek through trunc/aext/zext.
8526 // TODO: aext shouldn't require SM_SentinelZero padding.
8527 // TODO: handle shift of scalars.
8528 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8529 while (Scl.getOpcode() == ISD::TRUNCATE ||
8530 Scl.getOpcode() == ISD::ANY_EXTEND ||
8531 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8532 Scl = Scl.getOperand(0);
8533 MinBitsPerElt =
8534 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8535 }
8536 if ((MinBitsPerElt % 8) != 0)
8537 return false;
8538
8539 // Attempt to find the source vector the scalar was extracted from.
8540 SDValue SrcExtract;
8541 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8542 Scl.getOpcode() == X86ISD::PEXTRW ||
8543 Scl.getOpcode() == X86ISD::PEXTRB) &&
8544 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8545 SrcExtract = Scl;
8546 }
8547 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8548 return false;
8549
8550 SDValue SrcVec = SrcExtract.getOperand(0);
8551 EVT SrcVT = SrcVec.getValueType();
8552 if (!SrcVT.getScalarType().isByteSized())
8553 return false;
8554 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8555 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8556 unsigned DstByte = DstIdx * NumBytesPerElt;
8557 MinBitsPerElt =
8558 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8559
8560 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8561 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8562 Ops.push_back(SrcVec);
8563 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8564 } else {
8565 Ops.push_back(SrcVec);
8566 Ops.push_back(N.getOperand(0));
8567 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8568 Mask.push_back(NumSizeInBytes + i);
8569 }
8570
8571 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8572 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8573 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8574 Mask[DstByte + i] = SrcByte + i;
8575 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8576 Mask[DstByte + i] = SM_SentinelZero;
8577 return true;
8578 }
8579 case X86ISD::PACKSS:
8580 case X86ISD::PACKUS: {
8581 SDValue N0 = N.getOperand(0);
8582 SDValue N1 = N.getOperand(1);
8583    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8584           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8585           "Unexpected input value type");
8586
8587 APInt EltsLHS, EltsRHS;
8588 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8589
8590    // If we know input saturation won't happen (or we don't care about
8591    // particular lanes), we can treat this as a truncation shuffle.
8592 bool Offset0 = false, Offset1 = false;
8593 if (Opcode == X86ISD::PACKSS) {
8594 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8595 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8596 (!(N1.isUndef() || EltsRHS.isZero()) &&
8597 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8598 return false;
8599 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8600 // PACKSS then it was likely being used for sign-extension for a
8601 // truncation, so just peek through and adjust the mask accordingly.
8602 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8603 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8604 Offset0 = true;
8605 N0 = N0.getOperand(0);
8606 }
8607 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8608 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8609 Offset1 = true;
8610 N1 = N1.getOperand(0);
8611 }
8612 } else {
8613 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8614 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8615 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8616 (!(N1.isUndef() || EltsRHS.isZero()) &&
8617 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8618 return false;
8619 }
8620
8621 bool IsUnary = (N0 == N1);
8622
8623 Ops.push_back(N0);
8624 if (!IsUnary)
8625 Ops.push_back(N1);
8626
8627 createPackShuffleMask(VT, Mask, IsUnary);
8628
8629 if (Offset0 || Offset1) {
8630 for (int &M : Mask)
8631 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8632 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8633 ++M;
8634 }
8635 return true;
8636 }
8637 case ISD::VSELECT:
8638 case X86ISD::BLENDV: {
8639 SDValue Cond = N.getOperand(0);
8640 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8641 Ops.push_back(N.getOperand(1));
8642 Ops.push_back(N.getOperand(2));
8643 return true;
8644 }
8645 return false;
8646 }
8647 case X86ISD::VTRUNC: {
8648 SDValue Src = N.getOperand(0);
8649 EVT SrcVT = Src.getValueType();
8650 // Truncated source must be a simple vector.
8651 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8652 (SrcVT.getScalarSizeInBits() % 8) != 0)
8653 return false;
8654 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8655 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8656 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8657    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8658 for (unsigned i = 0; i != NumSrcElts; ++i)
8659 Mask.push_back(i * Scale);
8660 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8661 Ops.push_back(Src);
8662 return true;
8663 }
8664 case X86ISD::VSHLI:
8665 case X86ISD::VSRLI: {
8666 uint64_t ShiftVal = N.getConstantOperandVal(1);
8667 // Out of range bit shifts are guaranteed to be zero.
8668 if (NumBitsPerElt <= ShiftVal) {
8669 Mask.append(NumElts, SM_SentinelZero);
8670 return true;
8671 }
8672
8673 // We can only decode 'whole byte' bit shifts as shuffles.
8674 if ((ShiftVal % 8) != 0)
8675 break;
8676
8677 uint64_t ByteShift = ShiftVal / 8;
8678 Ops.push_back(N.getOperand(0));
8679
8680 // Clear mask to all zeros and insert the shifted byte indices.
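    // For example (illustrative): a v2i64 VSHLI by 8 bits shifts by one byte
    // per lane, giving the byte mask <Z,0,1,2,3,4,5,6,Z,8,9,10,11,12,13,14>
    // with Z = SM_SentinelZero.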
8681 Mask.append(NumSizeInBytes, SM_SentinelZero);
8682
8683 if (X86ISD::VSHLI == Opcode) {
8684 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8685 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8686 Mask[i + j] = i + j - ByteShift;
8687 } else {
8688 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8689 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8690 Mask[i + j - ByteShift] = i + j;
8691 }
8692 return true;
8693 }
8694 case X86ISD::VROTLI:
8695 case X86ISD::VROTRI: {
8696 // We can only decode 'whole byte' bit rotates as shuffles.
8697 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8698 if ((RotateVal % 8) != 0)
8699 return false;
8700 Ops.push_back(N.getOperand(0));
8701 int Offset = RotateVal / 8;
8702 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8703 for (int i = 0; i != (int)NumElts; ++i) {
8704 int BaseIdx = i * NumBytesPerElt;
8705 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8706 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8707 }
8708 }
8709 return true;
8710 }
8711 case X86ISD::VBROADCAST: {
8712 SDValue Src = N.getOperand(0);
8713 if (!Src.getSimpleValueType().isVector()) {
8714 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8715 !isNullConstant(Src.getOperand(1)) ||
8716 Src.getOperand(0).getValueType().getScalarType() !=
8717 VT.getScalarType())
8718 return false;
8719 Src = Src.getOperand(0);
8720 }
8721 Ops.push_back(Src);
8722 Mask.append(NumElts, 0);
8723 return true;
8724 }
8725 case ISD::ZERO_EXTEND:
8726 case ISD::ANY_EXTEND:
8727 case ISD::ZERO_EXTEND_VECTOR_INREG:
8728 case ISD::ANY_EXTEND_VECTOR_INREG: {
8729 SDValue Src = N.getOperand(0);
8730 EVT SrcVT = Src.getValueType();
8731
8732 // Extended source must be a simple vector.
8733 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8734 (SrcVT.getScalarSizeInBits() % 8) != 0)
8735 return false;
8736
8737 bool IsAnyExtend =
8738 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8739 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8740 IsAnyExtend, Mask);
8741 Ops.push_back(Src);
8742 return true;
8743 }
8744 }
8745
8746 return false;
8747}
8748
8749/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
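// For example (illustrative): with Inputs = {A, A} and the 4-element mask
// <0,5,2,7>, the repeated input is dropped and the mask is rewritten to
// <0,1,2,3> so every entry indexes the single remaining input.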
8750static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8751 SmallVectorImpl<int> &Mask) {
8752 int MaskWidth = Mask.size();
8753 SmallVector<SDValue, 16> UsedInputs;
8754 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8755 int lo = UsedInputs.size() * MaskWidth;
8756 int hi = lo + MaskWidth;
8757
8758 // Strip UNDEF input usage.
8759 if (Inputs[i].isUndef())
8760 for (int &M : Mask)
8761 if ((lo <= M) && (M < hi))
8762 M = SM_SentinelUndef;
8763
8764 // Check for unused inputs.
8765 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8766 for (int &M : Mask)
8767 if (lo <= M)
8768 M -= MaskWidth;
8769 continue;
8770 }
8771
8772 // Check for repeated inputs.
8773 bool IsRepeat = false;
8774 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8775 if (UsedInputs[j] != Inputs[i])
8776 continue;
8777 for (int &M : Mask)
8778 if (lo <= M)
8779 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8780 IsRepeat = true;
8781 break;
8782 }
8783 if (IsRepeat)
8784 continue;
8785
8786 UsedInputs.push_back(Inputs[i]);
8787 }
8788 Inputs = UsedInputs;
8789}
8790
8791/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8792/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8793/// Returns true if the target shuffle mask was decoded.
8794static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8795 SmallVectorImpl<SDValue> &Inputs,
8796 SmallVectorImpl<int> &Mask,
8797 APInt &KnownUndef, APInt &KnownZero,
8798 const SelectionDAG &DAG, unsigned Depth,
8799 bool ResolveKnownElts) {
8800 if (Depth >= SelectionDAG::MaxRecursionDepth)
8801 return false; // Limit search depth.
8802
8803 EVT VT = Op.getValueType();
8804 if (!VT.isSimple() || !VT.isVector())
8805 return false;
8806
8807 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8808 if (ResolveKnownElts)
8809 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8810 return true;
8811 }
8812 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8813 ResolveKnownElts)) {
8814 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8815 return true;
8816 }
8817 return false;
8818}
8819
8820static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8821 SmallVectorImpl<SDValue> &Inputs,
8822 SmallVectorImpl<int> &Mask,
8823 const SelectionDAG &DAG, unsigned Depth,
8824 bool ResolveKnownElts) {
8825 APInt KnownUndef, KnownZero;
8826 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8827 KnownZero, DAG, Depth, ResolveKnownElts);
8828}
8829
8830static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8831 SmallVectorImpl<int> &Mask,
8832 const SelectionDAG &DAG, unsigned Depth = 0,
8833 bool ResolveKnownElts = true) {
8834 EVT VT = Op.getValueType();
8835 if (!VT.isSimple() || !VT.isVector())
8836 return false;
8837
8838 unsigned NumElts = Op.getValueType().getVectorNumElements();
8839 APInt DemandedElts = APInt::getAllOnes(NumElts);
8840 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8841 ResolveKnownElts);
8842}
8843
8844// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
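// Illustrative usage (a sketch, not code from this file; Ld, DL and DAG are
// assumed to be in scope):
//   if (SDValue Bcst = getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, DL,
//                                        MVT::v4f32, MVT::f32,
//                                        cast<MemSDNode>(Ld), /*Offset=*/4,
//                                        DAG))
//     return Bcst; // f32 at byte offset 4 of Ld broadcast to all v4f32 lanes.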
8845static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8846 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8847 SelectionDAG &DAG) {
8848  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8849          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8850         "Unknown broadcast load type");
8851
8852  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8853 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8854 return SDValue();
8855
8856 SDValue Ptr =
8857 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8858 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8859 SDValue Ops[] = {Mem->getChain(), Ptr};
8860 SDValue BcstLd = DAG.getMemIntrinsicNode(
8861 Opcode, DL, Tys, Ops, MemVT,
8862 DAG.getMachineFunction().getMachineMemOperand(
8863 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8864 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8865 return BcstLd;
8866}
8867
8868/// Returns the scalar element that will make up the i'th
8869/// element of the result of the vector shuffle.
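// For example (illustrative): querying index 0 of
// shufflevector(<4 x i32> %a, <4 x i32> %b) whose mask element 0 is 6 recurses
// into %b and returns whatever scalar forms element 2 of %b (or an empty
// SDValue if it cannot be determined).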
8870static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8871 SelectionDAG &DAG, unsigned Depth) {
8872 if (Depth >= SelectionDAG::MaxRecursionDepth)
8873 return SDValue(); // Limit search depth.
8874
8875 EVT VT = Op.getValueType();
8876 unsigned Opcode = Op.getOpcode();
8877 unsigned NumElems = VT.getVectorNumElements();
8878
8879 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8880 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8881 int Elt = SV->getMaskElt(Index);
8882
8883 if (Elt < 0)
8884 return DAG.getUNDEF(VT.getVectorElementType());
8885
8886 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8887 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8888 }
8889
8890 // Recurse into target specific vector shuffles to find scalars.
8891 if (isTargetShuffle(Opcode)) {
8892 MVT ShufVT = VT.getSimpleVT();
8893 MVT ShufSVT = ShufVT.getVectorElementType();
8894 int NumElems = (int)ShufVT.getVectorNumElements();
8895 SmallVector<int, 16> ShuffleMask;
8896 SmallVector<SDValue, 16> ShuffleOps;
8897 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8898 ShuffleMask))
8899 return SDValue();
8900
8901 int Elt = ShuffleMask[Index];
8902 if (Elt == SM_SentinelZero)
8903 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8904 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8905 if (Elt == SM_SentinelUndef)
8906 return DAG.getUNDEF(ShufSVT);
8907
8908    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8909 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8910 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8911 }
8912
8913 // Recurse into insert_subvector base/sub vector to find scalars.
8914 if (Opcode == ISD::INSERT_SUBVECTOR) {
8915 SDValue Vec = Op.getOperand(0);
8916 SDValue Sub = Op.getOperand(1);
8917 uint64_t SubIdx = Op.getConstantOperandVal(2);
8918 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8919
8920 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8921 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8922 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8923 }
8924
8925 // Recurse into concat_vectors sub vector to find scalars.
8926 if (Opcode == ISD::CONCAT_VECTORS) {
8927 EVT SubVT = Op.getOperand(0).getValueType();
8928 unsigned NumSubElts = SubVT.getVectorNumElements();
8929 uint64_t SubIdx = Index / NumSubElts;
8930 uint64_t SubElt = Index % NumSubElts;
8931 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into extract_subvector src vector to find scalars.
8935 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8936 SDValue Src = Op.getOperand(0);
8937 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8938 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8939 }
8940
8941 // We only peek through bitcasts of the same vector width.
8942 if (Opcode == ISD::BITCAST) {
8943 SDValue Src = Op.getOperand(0);
8944 EVT SrcVT = Src.getValueType();
8945 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8946 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8947 return SDValue();
8948 }
8949
8950 // Actual nodes that may contain scalar elements
8951
8952 // For insert_vector_elt - either return the index matching scalar or recurse
8953 // into the base vector.
8954 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8955 isa<ConstantSDNode>(Op.getOperand(2))) {
8956 if (Op.getConstantOperandAPInt(2) == Index)
8957 return Op.getOperand(1);
8958 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8959 }
8960
8961 if (Opcode == ISD::SCALAR_TO_VECTOR)
8962 return (Index == 0) ? Op.getOperand(0)
8963 : DAG.getUNDEF(VT.getVectorElementType());
8964
8965 if (Opcode == ISD::BUILD_VECTOR)
8966 return Op.getOperand(Index);
8967
8968 return SDValue();
8969}
8970
8971// Use PINSRB/PINSRW/PINSRD to create a build vector.
8972static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8973 unsigned NumNonZero, unsigned NumZero,
8974 SelectionDAG &DAG,
8975 const X86Subtarget &Subtarget) {
8976 MVT VT = Op.getSimpleValueType();
8977 unsigned NumElts = VT.getVectorNumElements();
8978  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8979          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8980         "Illegal vector insertion");
8981
8982 SDLoc dl(Op);
8983 SDValue V;
8984 bool First = true;
8985
8986 for (unsigned i = 0; i < NumElts; ++i) {
8987 bool IsNonZero = NonZeroMask[i];
8988 if (!IsNonZero)
8989 continue;
8990
8991    // If the build vector contains zeros or our first insertion is not the
8992    // first index, then insert into a zero vector to break any register
8993    // dependency; else use SCALAR_TO_VECTOR.
8994 if (First) {
8995 First = false;
8996 if (NumZero || 0 != i)
8997 V = getZeroVector(VT, Subtarget, DAG, dl);
8998 else {
8999        assert(0 == i && "Expected insertion into zero-index");
9000 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9001 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9002 V = DAG.getBitcast(VT, V);
9003 continue;
9004 }
9005 }
9006 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9007 DAG.getIntPtrConstant(i, dl));
9008 }
9009
9010 return V;
9011}
9012
9013/// Custom lower build_vector of v16i8.
9014static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9015 unsigned NumNonZero, unsigned NumZero,
9016 SelectionDAG &DAG,
9017 const X86Subtarget &Subtarget) {
9018 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9019 return SDValue();
9020
9021 // SSE4.1 - use PINSRB to insert each byte directly.
9022 if (Subtarget.hasSSE41())
9023 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9024 Subtarget);
9025
9026 SDLoc dl(Op);
9027 SDValue V;
9028
9029 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9030 for (unsigned i = 0; i < 16; i += 2) {
9031 bool ThisIsNonZero = NonZeroMask[i];
9032 bool NextIsNonZero = NonZeroMask[i + 1];
9033 if (!ThisIsNonZero && !NextIsNonZero)
9034 continue;
9035
9036 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9037 SDValue Elt;
9038 if (ThisIsNonZero) {
9039 if (NumZero || NextIsNonZero)
9040 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9041 else
9042 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9043 }
9044
9045 if (NextIsNonZero) {
9046 SDValue NextElt = Op.getOperand(i + 1);
9047 if (i == 0 && NumZero)
9048 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9049 else
9050 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9051 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9052 DAG.getConstant(8, dl, MVT::i8));
9053 if (ThisIsNonZero)
9054 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9055 else
9056 Elt = NextElt;
9057 }
9058
9059 // If our first insertion is not the first index or zeros are needed, then
9060 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9061 // elements undefined).
9062 if (!V) {
9063 if (i != 0 || NumZero)
9064 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9065 else {
9066 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9067 V = DAG.getBitcast(MVT::v8i16, V);
9068 continue;
9069 }
9070 }
9071 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9072 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9073 DAG.getIntPtrConstant(i / 2, dl));
9074 }
9075
9076 return DAG.getBitcast(MVT::v16i8, V);
9077}
9078
9079/// Custom lower build_vector of v8i16.
9080static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9081 unsigned NumNonZero, unsigned NumZero,
9082 SelectionDAG &DAG,
9083 const X86Subtarget &Subtarget) {
9084 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9085 return SDValue();
9086
9087 // Use PINSRW to insert each byte directly.
9088 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9089 Subtarget);
9090}
9091
9092/// Custom lower build_vector of v4i32 or v4f32.
9093static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9094 const X86Subtarget &Subtarget) {
9095 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9096 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9097 // Because we're creating a less complicated build vector here, we may enable
9098 // further folding of the MOVDDUP via shuffle transforms.
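  // For example (illustrative): build_vector <a,b,a,b> with a != b becomes the
  // build vector <a,b,undef,undef>, bitcast to v2f64, duplicated with MOVDDUP,
  // and bitcast back to the original type.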
9099 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9100 Op.getOperand(0) == Op.getOperand(2) &&
9101 Op.getOperand(1) == Op.getOperand(3) &&
9102 Op.getOperand(0) != Op.getOperand(1)) {
9103 SDLoc DL(Op);
9104 MVT VT = Op.getSimpleValueType();
9105 MVT EltVT = VT.getVectorElementType();
9106 // Create a new build vector with the first 2 elements followed by undef
9107 // padding, bitcast to v2f64, duplicate, and bitcast back.
9108 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9109 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9110 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9111 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9112 return DAG.getBitcast(VT, Dup);
9113 }
9114
9115 // Find all zeroable elements.
9116 std::bitset<4> Zeroable, Undefs;
9117 for (int i = 0; i < 4; ++i) {
9118 SDValue Elt = Op.getOperand(i);
9119 Undefs[i] = Elt.isUndef();
9120 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9121 }
9122  assert(Zeroable.size() - Zeroable.count() > 1 &&
9123         "We expect at least two non-zero elements!");
9124
9125 // We only know how to deal with build_vector nodes where elements are either
9126 // zeroable or extract_vector_elt with constant index.
9127 SDValue FirstNonZero;
9128 unsigned FirstNonZeroIdx;
9129 for (unsigned i = 0; i < 4; ++i) {
9130 if (Zeroable[i])
9131 continue;
9132 SDValue Elt = Op.getOperand(i);
9133 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9134 !isa<ConstantSDNode>(Elt.getOperand(1)))
9135 return SDValue();
9136 // Make sure that this node is extracting from a 128-bit vector.
9137 MVT VT = Elt.getOperand(0).getSimpleValueType();
9138 if (!VT.is128BitVector())
9139 return SDValue();
9140 if (!FirstNonZero.getNode()) {
9141 FirstNonZero = Elt;
9142 FirstNonZeroIdx = i;
9143 }
9144 }
9145
9146  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9147 SDValue V1 = FirstNonZero.getOperand(0);
9148 MVT VT = V1.getSimpleValueType();
9149
9150 // See if this build_vector can be lowered as a blend with zero.
9151 SDValue Elt;
9152 unsigned EltMaskIdx, EltIdx;
9153 int Mask[4];
9154 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9155 if (Zeroable[EltIdx]) {
9156 // The zero vector will be on the right hand side.
9157 Mask[EltIdx] = EltIdx+4;
9158 continue;
9159 }
9160
9161 Elt = Op->getOperand(EltIdx);
9162 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9163 EltMaskIdx = Elt.getConstantOperandVal(1);
9164 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9165 break;
9166 Mask[EltIdx] = EltIdx;
9167 }
9168
9169 if (EltIdx == 4) {
9170 // Let the shuffle legalizer deal with blend operations.
9171 SDValue VZeroOrUndef = (Zeroable == Undefs)
9172 ? DAG.getUNDEF(VT)
9173 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9174 if (V1.getSimpleValueType() != VT)
9175 V1 = DAG.getBitcast(VT, V1);
9176 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9177 }
9178
9179 // See if we can lower this build_vector to a INSERTPS.
9180 if (!Subtarget.hasSSE41())
9181 return SDValue();
9182
9183 SDValue V2 = Elt.getOperand(0);
9184 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9185 V1 = SDValue();
9186
9187 bool CanFold = true;
9188 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9189 if (Zeroable[i])
9190 continue;
9191
9192 SDValue Current = Op->getOperand(i);
9193 SDValue SrcVector = Current->getOperand(0);
9194 if (!V1.getNode())
9195 V1 = SrcVector;
9196 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9197 }
9198
9199 if (!CanFold)
9200 return SDValue();
9201
9202  assert(V1.getNode() && "Expected at least two non-zero elements!");
9203 if (V1.getSimpleValueType() != MVT::v4f32)
9204 V1 = DAG.getBitcast(MVT::v4f32, V1);
9205 if (V2.getSimpleValueType() != MVT::v4f32)
9206 V2 = DAG.getBitcast(MVT::v4f32, V2);
9207
9208 // Ok, we can emit an INSERTPS instruction.
9209 unsigned ZMask = Zeroable.to_ulong();
9210
9211 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9212  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9213 SDLoc DL(Op);
9214 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9215 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9216 return DAG.getBitcast(VT, Result);
9217}
9218
9219/// Return a vector logical shift node.
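/// For example, shifting a v2i64 value left by 64 bits is emitted as a byte
/// shift: the source is bitcast to v16i8, an X86ISD::VSHLDQ node with an
/// immediate of 8 bytes is created, and the result is bitcast back to v2i64.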
9220static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9221 SelectionDAG &DAG, const TargetLowering &TLI,
9222 const SDLoc &dl) {
9223 assert(VT.is128BitVector() && "Unknown type for VShift");
9224 MVT ShVT = MVT::v16i8;
9225 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9226 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9227 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9228 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9229 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9230}
9231
9232static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9233 SelectionDAG &DAG) {
9234
9235 // Check if the scalar load can be widened into a vector load, and if
9236 // the address is "base + cst", see if the cst can be "absorbed" into
9237 // the shuffle mask.
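// For example, a 32-bit scalar load at byte offset 8 from a 16-byte aligned
// stack slot can be rewritten as a v4i32 load of the enclosing 16-byte chunk
// whose shuffle mask splats element 2 (offset 8 / 4).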
9238 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9239 SDValue Ptr = LD->getBasePtr();
9240 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9241 return SDValue();
9242 EVT PVT = LD->getValueType(0);
9243 if (PVT != MVT::i32 && PVT != MVT::f32)
9244 return SDValue();
9245
9246 int FI = -1;
9247 int64_t Offset = 0;
9248 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9249 FI = FINode->getIndex();
9250 Offset = 0;
9251 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9252 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9253 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9254 Offset = Ptr.getConstantOperandVal(1);
9255 Ptr = Ptr.getOperand(0);
9256 } else {
9257 return SDValue();
9258 }
9259
9260 // FIXME: 256-bit vector instructions don't require a strict alignment;
9261 // improve this code to support them better.
9262 Align RequiredAlign(VT.getSizeInBits() / 8);
9263 SDValue Chain = LD->getChain();
9264 // Make sure the stack object alignment is at least 16 or 32.
9265 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9266 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9267 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9268 if (MFI.isFixedObjectIndex(FI)) {
9269 // Can't change the alignment. FIXME: It's possible to compute the exact
9270 // stack offset and reference FI + adjusted offset instead; that's the way
9271 // to implement it if someone *really* cares about this.
9272 return SDValue();
9273 } else {
9274 MFI.setObjectAlignment(FI, RequiredAlign);
9275 }
9276 }
9277
9278 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9279 // Ptr + (Offset & ~15).
9280 if (Offset < 0)
9281 return SDValue();
9282 if ((Offset % RequiredAlign.value()) & 3)
9283 return SDValue();
9284 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9285 if (StartOffset) {
9286 SDLoc DL(Ptr);
9287 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9288 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9289 }
9290
9291 int EltNo = (Offset - StartOffset) >> 2;
9292 unsigned NumElems = VT.getVectorNumElements();
9293
9294 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9295 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9296 LD->getPointerInfo().getWithOffset(StartOffset));
9297
9298 SmallVector<int, 8> Mask(NumElems, EltNo);
9299
9300 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9301 }
9302
9303 return SDValue();
9304}
9305
9306 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
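// For example, (srl (i64 load), 32) resolves to that load with a ByteOffset
// of 4, and (extract_vector_elt (v4i32 load), 2) resolves to that load with a
// ByteOffset of 8.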
9307static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9308 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9309 auto *BaseLd = cast<LoadSDNode>(Elt);
9310 if (!BaseLd->isSimple())
9311 return false;
9312 Ld = BaseLd;
9313 ByteOffset = 0;
9314 return true;
9315 }
9316
9317 switch (Elt.getOpcode()) {
9318 case ISD::BITCAST:
9319 case ISD::TRUNCATE:
9320 case ISD::SCALAR_TO_VECTOR:
9321 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9322 case ISD::SRL:
9323 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9324 uint64_t Amt = AmtC->getZExtValue();
9325 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9326 ByteOffset += Amt / 8;
9327 return true;
9328 }
9329 }
9330 break;
9331 case ISD::EXTRACT_VECTOR_ELT:
9332 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9333 SDValue Src = Elt.getOperand(0);
9334 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9335 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9336 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9337 findEltLoadSrc(Src, Ld, ByteOffset)) {
9338 uint64_t Idx = IdxC->getZExtValue();
9339 ByteOffset += Idx * (SrcSizeInBits / 8);
9340 return true;
9341 }
9342 }
9343 break;
9344 }
9345
9346 return false;
9347}
9348
9349/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9350/// elements can be replaced by a single large load which has the same value as
9351/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9352///
9353/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9354static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9355 const SDLoc &DL, SelectionDAG &DAG,
9356 const X86Subtarget &Subtarget,
9357 bool IsAfterLegalize) {
9358 if ((VT.getScalarSizeInBits() % 8) != 0)
9359 return SDValue();
9360
9361 unsigned NumElems = Elts.size();
9362
9363 int LastLoadedElt = -1;
9364 APInt LoadMask = APInt::getZero(NumElems);
9365 APInt ZeroMask = APInt::getZero(NumElems);
9366 APInt UndefMask = APInt::getZero(NumElems);
9367
9368 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9369 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9370
9371 // For each element in the initializer, see if we've found a load, zero or an
9372 // undef.
9373 for (unsigned i = 0; i < NumElems; ++i) {
9374 SDValue Elt = peekThroughBitcasts(Elts[i]);
9375 if (!Elt.getNode())
9376 return SDValue();
9377 if (Elt.isUndef()) {
9378 UndefMask.setBit(i);
9379 continue;
9380 }
9381 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9382 ZeroMask.setBit(i);
9383 continue;
9384 }
9385
9386 // Each loaded element must be the correct fractional portion of the
9387 // requested vector load.
9388 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9389 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9390 return SDValue();
9391
9392 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9393 return SDValue();
9394 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9395 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9396 return SDValue();
9397
9398 LoadMask.setBit(i);
9399 LastLoadedElt = i;
9400 }
9401 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9402 NumElems &&
9403 "Incomplete element masks");
9404
9405 // Handle Special Cases - all undef or undef/zero.
9406 if (UndefMask.popcount() == NumElems)
9407 return DAG.getUNDEF(VT);
9408 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9409 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9410 : DAG.getConstantFP(0.0, DL, VT);
9411
9412 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9413 int FirstLoadedElt = LoadMask.countr_zero();
9414 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9415 EVT EltBaseVT = EltBase.getValueType();
9416 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9417 "Register/Memory size mismatch");
9418 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9419 assert(LDBase && "Did not find base load for merging consecutive loads");
9420 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9421 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9422 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9423 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9424 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9425
9426 // TODO: Support offsetting the base load.
9427 if (ByteOffsets[FirstLoadedElt] != 0)
9428 return SDValue();
9429
9430 // Check to see if the element's load is consecutive to the base load
9431 // or offset from a previous (already checked) load.
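// For example, with 32-bit base elements, an element at index 2 that comes
// from the same load as element 0 but at byte offset 8 maps back to base
// index 0 and is accepted without a separate consecutiveness check.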
9432 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9433 LoadSDNode *Ld = Loads[EltIdx];
9434 int64_t ByteOffset = ByteOffsets[EltIdx];
9435 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9436 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9437 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9438 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9439 }
9440 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9441 EltIdx - FirstLoadedElt);
9442 };
9443
9444 // Consecutive loads can contain UNDEFs but not ZERO elements.
9445 // Consecutive loads with UNDEF and ZERO elements require an
9446 // additional shuffle stage to clear the ZERO elements.
9447 bool IsConsecutiveLoad = true;
9448 bool IsConsecutiveLoadWithZeros = true;
9449 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9450 if (LoadMask[i]) {
9451 if (!CheckConsecutiveLoad(LDBase, i)) {
9452 IsConsecutiveLoad = false;
9453 IsConsecutiveLoadWithZeros = false;
9454 break;
9455 }
9456 } else if (ZeroMask[i]) {
9457 IsConsecutiveLoad = false;
9458 }
9459 }
9460
9461 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9462 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9463 assert(LDBase->isSimple() &&
9464 "Cannot merge volatile or atomic loads.");
9465 SDValue NewLd =
9466 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9467 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9468 MMOFlags);
9469 for (auto *LD : Loads)
9470 if (LD)
9471 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9472 return NewLd;
9473 };
9474
9475 // Check if the base load is entirely dereferenceable.
9476 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9477 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9478
9479 // LOAD - all consecutive load/undefs (must start/end with a load or be
9480 // entirely dereferenceable). If we have found an entire vector of loads and
9481 // undefs, then return a large load of the entire vector width starting at the
9482 // base pointer. If the vector contains zeros, then attempt to shuffle those
9483 // elements.
9484 if (FirstLoadedElt == 0 &&
9485 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9486 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9487 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9488 return SDValue();
9489
9490 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9491 // will lower to regular temporal loads and use the cache.
9492 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9493 VT.is256BitVector() && !Subtarget.hasInt256())
9494 return SDValue();
9495
9496 if (NumElems == 1)
9497 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9498
9499 if (!ZeroMask)
9500 return CreateLoad(VT, LDBase);
9501
9502 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9503 // vector and a zero vector to clear out the zero elements.
9504 if (!IsAfterLegalize && VT.isVector()) {
9505 unsigned NumMaskElts = VT.getVectorNumElements();
9506 if ((NumMaskElts % NumElems) == 0) {
9507 unsigned Scale = NumMaskElts / NumElems;
9508 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9509 for (unsigned i = 0; i < NumElems; ++i) {
9510 if (UndefMask[i])
9511 continue;
9512 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9513 for (unsigned j = 0; j != Scale; ++j)
9514 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9515 }
9516 SDValue V = CreateLoad(VT, LDBase);
9517 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9518 : DAG.getConstantFP(0.0, DL, VT);
9519 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9520 }
9521 }
9522 }
9523
9524 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9525 if (VT.is256BitVector() || VT.is512BitVector()) {
9526 unsigned HalfNumElems = NumElems / 2;
9527 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9528 EVT HalfVT =
9529 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9530 SDValue HalfLD =
9531 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9532 DAG, Subtarget, IsAfterLegalize);
9533 if (HalfLD)
9534 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9535 HalfLD, DAG.getIntPtrConstant(0, DL));
9536 }
9537 }
9538
9539 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9540 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9541 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9542 LoadSizeInBits == 64) &&
9543 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9544 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9545 : MVT::getIntegerVT(LoadSizeInBits);
9546 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9547 // Allow v4f32 on SSE1 only targets.
9548 // FIXME: Add more isel patterns so we can just use VT directly.
9549 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9550 VecVT = MVT::v4f32;
9551 if (TLI.isTypeLegal(VecVT)) {
9552 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9553 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9554 SDValue ResNode = DAG.getMemIntrinsicNode(
9555 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9556 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9557 for (auto *LD : Loads)
9558 if (LD)
9559 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9560 return DAG.getBitcast(VT, ResNode);
9561 }
9562 }
9563
9564 // BROADCAST - match the smallest possible repetition pattern, load that
9565 // scalar/subvector element and then broadcast to the entire vector.
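// For example, a v8i32 build vector whose loaded elements repeat every two
// elements can be lowered as a single 64-bit load that is broadcast to all
// four 64-bit lanes and then bitcast back to v8i32.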
9566 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9567 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9568 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9569 unsigned RepeatSize = SubElems * BaseSizeInBits;
9570 unsigned ScalarSize = std::min(RepeatSize, 64u);
9571 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9572 continue;
9573
9574 // Don't attempt a 1:N subvector broadcast - it should be caught by
9575 // combineConcatVectorOps, otherwise it will cause infinite loops.
9576 if (RepeatSize > ScalarSize && SubElems == 1)
9577 continue;
9578
9579 bool Match = true;
9580 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9581 for (unsigned i = 0; i != NumElems && Match; ++i) {
9582 if (!LoadMask[i])
9583 continue;
9584 SDValue Elt = peekThroughBitcasts(Elts[i]);
9585 if (RepeatedLoads[i % SubElems].isUndef())
9586 RepeatedLoads[i % SubElems] = Elt;
9587 else
9588 Match &= (RepeatedLoads[i % SubElems] == Elt);
9589 }
9590
9591 // We must have loads at both ends of the repetition.
9592 Match &= !RepeatedLoads.front().isUndef();
9593 Match &= !RepeatedLoads.back().isUndef();
9594 if (!Match)
9595 continue;
9596
9597 EVT RepeatVT =
9598 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9599 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9600 : EVT::getFloatingPointVT(ScalarSize);
9601 if (RepeatSize > ScalarSize)
9602 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9603 RepeatSize / ScalarSize);
9604 EVT BroadcastVT =
9605 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9606 VT.getSizeInBits() / ScalarSize);
9607 if (TLI.isTypeLegal(BroadcastVT)) {
9608 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9609 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9610 SDValue Broadcast = RepeatLoad;
9611 if (RepeatSize > ScalarSize) {
9612 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9613 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9614 } else {
9615 if (!Subtarget.hasAVX2() &&
9616 !X86::mayFoldLoadIntoBroadcastFromMem(
9617 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9618 Subtarget,
9619 /*AssumeSingleUse=*/true))
9620 return SDValue();
9621 Broadcast =
9622 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9623 }
9624 return DAG.getBitcast(VT, Broadcast);
9625 }
9626 }
9627 }
9628 }
9629
9630 return SDValue();
9631}
9632
9633 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9634 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9635 // are consecutive, non-overlapping, and in the right order.
9636static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9637 SelectionDAG &DAG,
9638 const X86Subtarget &Subtarget,
9639 bool IsAfterLegalize) {
9640 SmallVector<SDValue, 64> Elts;
9641 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9642 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9643 Elts.push_back(Elt);
9644 continue;
9645 }
9646 return SDValue();
9647 }
9648 assert(Elts.size() == VT.getVectorNumElements());
9649 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9650 IsAfterLegalize);
9651}
9652
9653static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9654 unsigned SplatBitSize, LLVMContext &C) {
9655 unsigned ScalarSize = VT.getScalarSizeInBits();
9656 unsigned NumElm = SplatBitSize / ScalarSize;
9657
9658 SmallVector<Constant *, 32> ConstantVec;
9659 for (unsigned i = 0; i < NumElm; i++) {
9660 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9661 Constant *Const;
9662 if (VT.isFloatingPoint()) {
9663 if (ScalarSize == 16) {
9664 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9665 } else if (ScalarSize == 32) {
9666 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9667 } else {
9668 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9669 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9670 }
9671 } else
9672 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9673 ConstantVec.push_back(Const);
9674 }
9675 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9676}
9677
9678static bool isFoldableUseOfShuffle(SDNode *N) {
9679 for (auto *U : N->uses()) {
9680 unsigned Opc = U->getOpcode();
9681 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9682 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9683 return false;
9684 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9685 return false;
9686 if (isTargetShuffle(Opc))
9687 return true;
9688 if (Opc == ISD::BITCAST) // Ignore bitcasts
9689 return isFoldableUseOfShuffle(U);
9690 if (N->hasOneUse()) {
9691 // TODO: there may be some general way to know if an SDNode can
9692 // be folded. For now we only know whether an MI is foldable.
9693 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9694 return false;
9695 return true;
9696 }
9697 }
9698 return false;
9699}
9700
9701/// Attempt to use the vbroadcast instruction to generate a splat value
9702/// from a splat BUILD_VECTOR which uses:
9703/// a. A single scalar load, or a constant.
9704/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9705///
9706/// The VBROADCAST node is returned when a pattern is found,
9707/// or SDValue() otherwise.
9708static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9709 const X86Subtarget &Subtarget,
9710 SelectionDAG &DAG) {
9711 // VBROADCAST requires AVX.
9712 // TODO: Splats could be generated for non-AVX CPUs using SSE
9713 // instructions, but there's less potential gain for only 128-bit vectors.
9714 if (!Subtarget.hasAVX())
9715 return SDValue();
9716
9717 MVT VT = BVOp->getSimpleValueType(0);
9718 unsigned NumElts = VT.getVectorNumElements();
9719 SDLoc dl(BVOp);
9720
9721 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9722 "Unsupported vector type for broadcast.");
9723
9724 // See if the build vector is a repeating sequence of scalars (inc. splat).
9725 SDValue Ld;
9726 BitVector UndefElements;
9727 SmallVector<SDValue, 16> Sequence;
9728 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9729 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9730 if (Sequence.size() == 1)
9731 Ld = Sequence[0];
9732 }
9733
9734 // Attempt to use VBROADCASTM
9735 // From this pattern:
9736 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9737 // b. t1 = (build_vector t0 t0)
9738 //
9739 // Create (VBROADCASTM v2i1 X)
9740 if (!Sequence.empty() && Subtarget.hasCDI()) {
9741 // If not a splat, are the upper sequence values zeroable?
9742 unsigned SeqLen = Sequence.size();
9743 bool UpperZeroOrUndef =
9744 SeqLen == 1 ||
9745 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9746 return !V || V.isUndef() || isNullConstant(V);
9747 });
9748 SDValue Op0 = Sequence[0];
9749 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9750 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9751 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9752 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9753 ? Op0.getOperand(0)
9754 : Op0.getOperand(0).getOperand(0);
9755 MVT MaskVT = BOperand.getSimpleValueType();
9756 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9757 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9758 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9759 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9760 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9761 unsigned Scale = 512 / VT.getSizeInBits();
9762 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9763 }
9764 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9765 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9766 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9767 return DAG.getBitcast(VT, Bcst);
9768 }
9769 }
9770 }
9771
9772 unsigned NumUndefElts = UndefElements.count();
9773 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9774 APInt SplatValue, Undef;
9775 unsigned SplatBitSize;
9776 bool HasUndef;
9777 // Check if this is a repeated constant pattern suitable for broadcasting.
9778 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9779 SplatBitSize > VT.getScalarSizeInBits() &&
9780 SplatBitSize < VT.getSizeInBits()) {
9781 // Avoid replacing with broadcast when it's a use of a shuffle
9782 // instruction to preserve the present custom lowering of shuffles.
9783 if (isFoldableUseOfShuffle(BVOp))
9784 return SDValue();
9785 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
9786 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9787 LLVMContext *Ctx = DAG.getContext();
9788 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9789 if (Subtarget.hasAVX()) {
9790 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9791 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9792 // Splatted value can fit in one INTEGER constant in constant pool.
9793 // Load the constant and broadcast it.
9794 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9795 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9796 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9797 SDValue CP = DAG.getConstantPool(C, PVT);
9798 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9799
9800 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9801 SDVTList Tys =
9802 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9803 SDValue Ops[] = {DAG.getEntryNode(), CP};
9804 MachinePointerInfo MPI =
9805 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9806 SDValue Brdcst = DAG.getMemIntrinsicNode(
9807 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9808 MachineMemOperand::MOLoad);
9809 return DAG.getBitcast(VT, Brdcst);
9810 }
9811 if (SplatBitSize > 64) {
9812 // Load the vector of constants and broadcast it.
9813 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9814 *Ctx);
9815 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9816 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9817 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9818 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9819 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9820 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9821 MachinePointerInfo MPI =
9822 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9823 return DAG.getMemIntrinsicNode(
9824 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9825 MachineMemOperand::MOLoad);
9826 }
9827 }
9828 }
9829
9830 // If we are moving a scalar into a vector (Ld must be set and all elements
9831 // but 1 are undef) and that operation is not obviously supported by
9832 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9833 // That's better than general shuffling and may eliminate a load to a GPR
9834 // and a move from a scalar to a vector register.
9835 if (!Ld || NumElts - NumUndefElts != 1)
9836 return SDValue();
9837 unsigned ScalarSize = Ld.getValueSizeInBits();
9838 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9839 return SDValue();
9840 }
9841
9842 bool ConstSplatVal =
9843 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9844 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9845
9846 // TODO: Handle broadcasts of non-constant sequences.
9847
9848 // Make sure that all of the users of a non-constant load are from the
9849 // BUILD_VECTOR node.
9850 // FIXME: Is the use count needed for the non-constant, non-load case?
9851 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9852 return SDValue();
9853
9854 unsigned ScalarSize = Ld.getValueSizeInBits();
9855 bool IsGE256 = (VT.getSizeInBits() >= 256);
9856
9857 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9858 // instruction to save 8 or more bytes of constant pool data.
9859 // TODO: If multiple splats are generated to load the same constant,
9860 // it may be detrimental to overall size. There needs to be a way to detect
9861 // that condition to know if this is truly a size win.
9862 bool OptForSize = DAG.shouldOptForSize();
9863
9864 // Handle broadcasting a single constant scalar from the constant pool
9865 // into a vector.
9866 // On Sandybridge (no AVX2), it is still better to load a constant vector
9867 // from the constant pool and not to broadcast it from a scalar.
9868 // But override that restriction when optimizing for size.
9869 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9870 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9871 EVT CVT = Ld.getValueType();
9872 assert(!CVT.isVector() && "Must not broadcast a vector type");
9873
9874 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9875 // For size optimization, also splat v2f64 and v2i64, and for size opt
9876 // with AVX2, also splat i8 and i16.
9877 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9878 if (ScalarSize == 32 ||
9879 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9880 CVT == MVT::f16 ||
9881 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9882 const Constant *C = nullptr;
9883 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9884 C = CI->getConstantIntValue();
9885 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9886 C = CF->getConstantFPValue();
9887
9888 assert(C && "Invalid constant type");
9889
9890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9891 SDValue CP =
9892 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9893 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9894
9895 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9896 SDValue Ops[] = {DAG.getEntryNode(), CP};
9897 MachinePointerInfo MPI =
9898 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9899 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9900 MPI, Alignment, MachineMemOperand::MOLoad);
9901 }
9902 }
9903
9904 // Handle AVX2 in-register broadcasts.
9905 if (!IsLoad && Subtarget.hasInt256() &&
9906 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9907 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9908
9909 // The scalar source must be a normal load.
9910 if (!IsLoad)
9911 return SDValue();
9912
9913 // Make sure the non-chain result is only used by this build vector.
9914 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9915 return SDValue();
9916
9917 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9918 (Subtarget.hasVLX() && ScalarSize == 64)) {
9919 auto *LN = cast<LoadSDNode>(Ld);
9920 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9921 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9922 SDValue BCast =
9923 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9924 LN->getMemoryVT(), LN->getMemOperand());
9925 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9926 return BCast;
9927 }
9928
9929 // The integer check is needed for the 64-bit into 128-bit broadcast, so it
9930 // doesn't match double since there is no vbroadcastsd xmm instruction.
9931 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9932 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9933 auto *LN = cast<LoadSDNode>(Ld);
9934 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9935 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9936 SDValue BCast =
9937 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9938 LN->getMemoryVT(), LN->getMemOperand());
9939 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9940 return BCast;
9941 }
9942
9943 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9944 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9945
9946 // Unsupported broadcast.
9947 return SDValue();
9948}
9949
9950/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9951/// underlying vector and index.
9952///
9953/// Modifies \p ExtractedFromVec to the real vector and returns the real
9954/// index.
9955static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9956 SDValue ExtIdx) {
9957 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9958 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9959 return Idx;
9960
9961 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9962 // lowered this:
9963 // (extract_vector_elt (v8f32 %1), Constant<6>)
9964 // to:
9965 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9966 // (extract_subvector (v8f32 %0), Constant<4>),
9967 // undef)
9968 // Constant<0>)
9969 // In this case the vector is the extract_subvector expression and the index
9970 // is 2, as specified by the shuffle.
9971 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9972 SDValue ShuffleVec = SVOp->getOperand(0);
9973 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9974 assert(ShuffleVecVT.getVectorElementType() ==
9975 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9976
9977 int ShuffleIdx = SVOp->getMaskElt(Idx);
9978 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9979 ExtractedFromVec = ShuffleVec;
9980 return ShuffleIdx;
9981 }
9982 return Idx;
9983}
9984
9985static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9986 MVT VT = Op.getSimpleValueType();
9987
9988 // Skip if insert_vec_elt is not supported.
9989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9990 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9991 return SDValue();
9992
9993 SDLoc DL(Op);
9994 unsigned NumElems = Op.getNumOperands();
9995
9996 SDValue VecIn1;
9997 SDValue VecIn2;
9998 SmallVector<unsigned, 4> InsertIndices;
9999 SmallVector<int, 8> Mask(NumElems, -1);
10000
10001 for (unsigned i = 0; i != NumElems; ++i) {
10002 unsigned Opc = Op.getOperand(i).getOpcode();
10003
10004 if (Opc == ISD::UNDEF)
10005 continue;
10006
10007 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10008 // Quit if more than 1 element needs inserting.
10009 if (InsertIndices.size() > 1)
10010 return SDValue();
10011
10012 InsertIndices.push_back(i);
10013 continue;
10014 }
10015
10016 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10017 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10018
10019 // Quit if non-constant index.
10020 if (!isa<ConstantSDNode>(ExtIdx))
10021 return SDValue();
10022 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10023
10024 // Quit if extracted from a vector of a different type.
10025 if (ExtractedFromVec.getValueType() != VT)
10026 return SDValue();
10027
10028 if (!VecIn1.getNode())
10029 VecIn1 = ExtractedFromVec;
10030 else if (VecIn1 != ExtractedFromVec) {
10031 if (!VecIn2.getNode())
10032 VecIn2 = ExtractedFromVec;
10033 else if (VecIn2 != ExtractedFromVec)
10034 // Quit if there are more than 2 vectors to shuffle.
10035 return SDValue();
10036 }
10037
10038 if (ExtractedFromVec == VecIn1)
10039 Mask[i] = Idx;
10040 else if (ExtractedFromVec == VecIn2)
10041 Mask[i] = Idx + NumElems;
10042 }
10043
10044 if (!VecIn1.getNode())
10045 return SDValue();
10046
10047 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10048 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10049
10050 for (unsigned Idx : InsertIndices)
10051 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10052 DAG.getIntPtrConstant(Idx, DL));
10053
10054 return NV;
10055}
10056
10057// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
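// This is done by bitcasting each bf16 operand to i16, building the
// equivalent integer vector, and bitcasting the result back to the original
// bf16 vector type.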
10058static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10059 const X86Subtarget &Subtarget) {
10060 MVT VT = Op.getSimpleValueType();
10061 MVT IVT = VT.changeVectorElementTypeToInteger();
10062 SmallVector<SDValue, 16> NewOps;
10063 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10064 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10065 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10066 return DAG.getBitcast(VT, Res);
10067}
10068
10069// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
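// Constant elements are packed into an immediate bitmask that is bitcast to
// the mask type, splat inputs are lowered as a scalar select, and any
// remaining non-constant elements are inserted one by one.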
10070static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10071 const X86Subtarget &Subtarget) {
10072
10073 MVT VT = Op.getSimpleValueType();
10074 assert((VT.getVectorElementType() == MVT::i1) &&
10075 "Unexpected type in LowerBUILD_VECTORvXi1!");
10076
10077 SDLoc dl(Op);
10078 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10079 ISD::isBuildVectorAllOnes(Op.getNode()))
10080 return Op;
10081
10082 uint64_t Immediate = 0;
10083 SmallVector<unsigned, 16> NonConstIdx;
10084 bool IsSplat = true;
10085 bool HasConstElts = false;
10086 int SplatIdx = -1;
10087 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10088 SDValue In = Op.getOperand(idx);
10089 if (In.isUndef())
10090 continue;
10091 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10092 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10093 HasConstElts = true;
10094 } else {
10095 NonConstIdx.push_back(idx);
10096 }
10097 if (SplatIdx < 0)
10098 SplatIdx = idx;
10099 else if (In != Op.getOperand(SplatIdx))
10100 IsSplat = false;
10101 }
10102
10103 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10104 if (IsSplat) {
10105 // The build_vector allows the scalar element to be larger than the vector
10106 // element type. We need to mask it to use as a condition unless we know
10107 // the upper bits are zero.
10108 // FIXME: Use computeKnownBits instead of checking specific opcode?
10109 SDValue Cond = Op.getOperand(SplatIdx);
10110 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10111 if (Cond.getOpcode() != ISD::SETCC)
10112 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10113 DAG.getConstant(1, dl, MVT::i8));
10114
10115 // Perform the select in the scalar domain so we can use cmov.
10116 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10117 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10118 DAG.getAllOnesConstant(dl, MVT::i32),
10119 DAG.getConstant(0, dl, MVT::i32));
10120 Select = DAG.getBitcast(MVT::v32i1, Select);
10121 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10122 } else {
10123 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10124 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10125 DAG.getAllOnesConstant(dl, ImmVT),
10126 DAG.getConstant(0, dl, ImmVT));
10127 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10128 Select = DAG.getBitcast(VecVT, Select);
10129 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10130 DAG.getIntPtrConstant(0, dl));
10131 }
10132 }
10133
10134 // insert elements one by one
10135 SDValue DstVec;
10136 if (HasConstElts) {
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10139 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10140 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10141 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10142 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10146 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10147 DstVec = DAG.getBitcast(VecVT, Imm);
10148 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10149 DAG.getIntPtrConstant(0, dl));
10150 }
10151 } else
10152 DstVec = DAG.getUNDEF(VT);
10153
10154 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10155 unsigned InsertIdx = NonConstIdx[i];
10156 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10157 Op.getOperand(InsertIdx),
10158 DAG.getIntPtrConstant(InsertIdx, dl));
10159 }
10160 return DstVec;
10161}
10162
10163 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10164 switch (Opcode) {
10165 case X86ISD::PACKSS:
10166 case X86ISD::PACKUS:
10167 case X86ISD::FHADD:
10168 case X86ISD::FHSUB:
10169 case X86ISD::HADD:
10170 case X86ISD::HSUB:
10171 return true;
10172 }
10173 return false;
10174}
10175
10176/// This is a helper function of LowerToHorizontalOp().
10177/// This function checks that the build_vector \p N in input implements a
10178/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10179/// may not match the layout of an x86 256-bit horizontal instruction.
10180/// In other words, if this returns true, then some extraction/insertion will
10181/// be required to produce a valid horizontal instruction.
10182///
10183/// Parameter \p Opcode defines the kind of horizontal operation to match.
10184/// For example, if \p Opcode is equal to ISD::ADD, then this function
10185/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10186/// is equal to ISD::SUB, then this function checks if this is a horizontal
10187/// arithmetic sub.
10188///
10189/// This function only analyzes elements of \p N whose indices are
10190/// in range [BaseIdx, LastIdx).
10191///
10192/// TODO: This function was originally used to match both real and fake partial
10193/// horizontal operations, but the index-matching logic is incorrect for that.
10194/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10195/// code because it is only used for partial h-op matching now?
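/// For example, with \p Opcode == ISD::ADD and the index range [0, 4), the
/// expected element pattern is:
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
/// with the extracts in each half taken from the same source vector.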
10196static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10197 SelectionDAG &DAG,
10198 unsigned BaseIdx, unsigned LastIdx,
10199 SDValue &V0, SDValue &V1) {
10200 EVT VT = N->getValueType(0);
10201 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10202 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10203 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10204 "Invalid Vector in input!");
10205
10206 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10207 bool CanFold = true;
10208 unsigned ExpectedVExtractIdx = BaseIdx;
10209 unsigned NumElts = LastIdx - BaseIdx;
10210 V0 = DAG.getUNDEF(VT);
10211 V1 = DAG.getUNDEF(VT);
10212
10213 // Check if N implements a horizontal binop.
10214 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10215 SDValue Op = N->getOperand(i + BaseIdx);
10216
10217 // Skip UNDEFs.
10218 if (Op->isUndef()) {
10219 // Update the expected vector extract index.
10220 if (i * 2 == NumElts)
10221 ExpectedVExtractIdx = BaseIdx;
10222 ExpectedVExtractIdx += 2;
10223 continue;
10224 }
10225
10226 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10227
10228 if (!CanFold)
10229 break;
10230
10231 SDValue Op0 = Op.getOperand(0);
10232 SDValue Op1 = Op.getOperand(1);
10233
10234 // Try to match the following pattern:
10235 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10236 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10237 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10238 Op0.getOperand(0) == Op1.getOperand(0) &&
10239 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10240 isa<ConstantSDNode>(Op1.getOperand(1)));
10241 if (!CanFold)
10242 break;
10243
10244 unsigned I0 = Op0.getConstantOperandVal(1);
10245 unsigned I1 = Op1.getConstantOperandVal(1);
10246
10247 if (i * 2 < NumElts) {
10248 if (V0.isUndef()) {
10249 V0 = Op0.getOperand(0);
10250 if (V0.getValueType() != VT)
10251 return false;
10252 }
10253 } else {
10254 if (V1.isUndef()) {
10255 V1 = Op0.getOperand(0);
10256 if (V1.getValueType() != VT)
10257 return false;
10258 }
10259 if (i * 2 == NumElts)
10260 ExpectedVExtractIdx = BaseIdx;
10261 }
10262
10263 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10264 if (I0 == ExpectedVExtractIdx)
10265 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10266 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10267 // Try to match the following dag sequence:
10268 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10269 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10270 } else
10271 CanFold = false;
10272
10273 ExpectedVExtractIdx += 2;
10274 }
10275
10276 return CanFold;
10277}
10278
10279/// Emit a sequence of two 128-bit horizontal add/sub followed by
10280/// a concat_vector.
10281///
10282/// This is a helper function of LowerToHorizontalOp().
10283/// This function expects two 256-bit vectors called V0 and V1.
10284/// At first, each vector is split into two separate 128-bit vectors.
10285/// Then, the resulting 128-bit vectors are used to implement two
10286/// horizontal binary operations.
10287///
10288/// The kind of horizontal binary operation is defined by \p X86Opcode.
10289///
10290 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
10291 /// the two new horizontal binops.
10292/// When Mode is set, the first horizontal binop dag node would take as input
10293/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10294/// horizontal binop dag node would take as input the lower 128-bit of V1
10295/// and the upper 128-bit of V1.
10296/// Example:
10297/// HADD V0_LO, V0_HI
10298/// HADD V1_LO, V1_HI
10299///
10300/// Otherwise, the first horizontal binop dag node takes as input the lower
10301/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10302/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10303/// Example:
10304/// HADD V0_LO, V1_LO
10305/// HADD V0_HI, V1_HI
10306///
10307/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10308/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10309/// the upper 128-bits of the result.
10310static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10311 const SDLoc &DL, SelectionDAG &DAG,
10312 unsigned X86Opcode, bool Mode,
10313 bool isUndefLO, bool isUndefHI) {
10314 MVT VT = V0.getSimpleValueType();
10315 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10316 "Invalid nodes in input!");
10317
10318 unsigned NumElts = VT.getVectorNumElements();
10319 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10320 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10321 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10322 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10323 MVT NewVT = V0_LO.getSimpleValueType();
10324
10325 SDValue LO = DAG.getUNDEF(NewVT);
10326 SDValue HI = DAG.getUNDEF(NewVT);
10327
10328 if (Mode) {
10329 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10330 if (!isUndefLO && !V0->isUndef())
10331 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10332 if (!isUndefHI && !V1->isUndef())
10333 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10334 } else {
10335 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10336 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10337 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10338
10339 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10340 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10341 }
10342
10343 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10344}
10345
10346 /// Returns true iff \p BV builds a vector with the result equivalent to
10347 /// the result of an ADDSUB/SUBADD operation.
10348/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10349/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10350/// \p Opnd0 and \p Opnd1.
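/// For example, a v4f32 build vector whose even elements are
/// (fsub (extract A, i), (extract B, i)) and whose odd elements are
/// (fadd (extract A, i), (extract B, i)) is recognized as an ADDSUB of A and
/// B; the variant with FADD in the even lanes and FSUB in the odd lanes is
/// recognized as a SUBADD.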
10351static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10352 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10353 SDValue &Opnd0, SDValue &Opnd1,
10354 unsigned &NumExtracts,
10355 bool &IsSubAdd) {
10356
10357 MVT VT = BV->getSimpleValueType(0);
10358 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10359 return false;
10360
10361 unsigned NumElts = VT.getVectorNumElements();
10362 SDValue InVec0 = DAG.getUNDEF(VT);
10363 SDValue InVec1 = DAG.getUNDEF(VT);
10364
10365 NumExtracts = 0;
10366
10367 // Odd-numbered elements in the input build vector are obtained from
10368 // adding/subtracting two integer/float elements.
10369 // Even-numbered elements in the input build vector are obtained from
10370 // subtracting/adding two integer/float elements.
10371 unsigned Opc[2] = {0, 0};
10372 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10373 SDValue Op = BV->getOperand(i);
10374
10375 // Skip 'undef' values.
10376 unsigned Opcode = Op.getOpcode();
10377 if (Opcode == ISD::UNDEF)
10378 continue;
10379
10380 // Early exit if we found an unexpected opcode.
10381 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10382 return false;
10383
10384 SDValue Op0 = Op.getOperand(0);
10385 SDValue Op1 = Op.getOperand(1);
10386
10387 // Try to match the following pattern:
10388 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10389 // Early exit if we cannot match that sequence.
10390 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10391 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10392 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10393 Op0.getOperand(1) != Op1.getOperand(1))
10394 return false;
10395
10396 unsigned I0 = Op0.getConstantOperandVal(1);
10397 if (I0 != i)
10398 return false;
10399
10400 // We found a valid add/sub node; make sure it's the same opcode as previous
10401 // elements for this parity.
10402 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10403 return false;
10404 Opc[i % 2] = Opcode;
10405
10406 // Update InVec0 and InVec1.
10407 if (InVec0.isUndef()) {
10408 InVec0 = Op0.getOperand(0);
10409 if (InVec0.getSimpleValueType() != VT)
10410 return false;
10411 }
10412 if (InVec1.isUndef()) {
10413 InVec1 = Op1.getOperand(0);
10414 if (InVec1.getSimpleValueType() != VT)
10415 return false;
10416 }
10417
10418 // Make sure that the operands of each add/sub node always
10419 // come from the same pair of vectors.
10420 if (InVec0 != Op0.getOperand(0)) {
10421 if (Opcode == ISD::FSUB)
10422 return false;
10423
10424 // FADD is commutable. Try to commute the operands
10425 // and then test again.
10426 std::swap(Op0, Op1);
10427 if (InVec0 != Op0.getOperand(0))
10428 return false;
10429 }
10430
10431 if (InVec1 != Op1.getOperand(0))
10432 return false;
10433
10434 // Increment the number of extractions done.
10435 ++NumExtracts;
10436 }
10437
10438 // Ensure we have found an opcode for both parities and that they are
10439 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10440 // inputs are undef.
10441 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10442 InVec0.isUndef() || InVec1.isUndef())
10443 return false;
10444
10445 IsSubAdd = Opc[0] == ISD::FADD;
10446
10447 Opnd0 = InVec0;
10448 Opnd1 = InVec1;
10449 return true;
10450}
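// A minimal standalone sketch (plain C++, no LLVM types) of the lane-parity
// rule enforced above: every even lane must use one of FADD/FSUB, every odd
// lane the other, and the two opcodes must differ. LaneOp and
// classifyAddSubLanes are illustrative names, not LLVM APIs.
#include <cstddef>

enum LaneOp { LaneUndef, LaneFAdd, LaneFSub };

// Returns true and sets IsSubAdd when the per-lane opcodes match the ADDSUB
// (sub on even lanes) or SUBADD (add on even lanes) pattern; undef lanes are
// ignored, exactly as in the matcher above.
static bool classifyAddSubLanes(const LaneOp *Lanes, size_t NumLanes,
                                bool &IsSubAdd) {
  LaneOp Opc[2] = {LaneUndef, LaneUndef};
  for (size_t i = 0; i != NumLanes; ++i) {
    if (Lanes[i] == LaneUndef)
      continue;
    if (Opc[i % 2] != LaneUndef && Opc[i % 2] != Lanes[i])
      return false; // mixed opcodes within one parity
    Opc[i % 2] = Lanes[i];
  }
  if (Opc[0] == LaneUndef || Opc[1] == LaneUndef || Opc[0] == Opc[1])
    return false;   // need both parities, and they must differ
  IsSubAdd = Opc[0] == LaneFAdd; // add on even lanes => SUBADD
  return true;
}
// Example: {LaneFSub, LaneFAdd, LaneFSub, LaneFAdd} classifies as ADDSUB
// (returns true with IsSubAdd == false).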
10451
10452/// Returns true if it is possible to fold MUL and an idiom that has already been
10453/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10454/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10455/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10456///
10457/// Prior to calling this function it should be known that there is some
10458/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10459/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10460/// before replacement of such SDNode with ADDSUB operation. Thus the number
10461/// of \p Opnd0 uses is expected to be equal to 2.
10462/// For example, this function may be called for the following IR:
10463/// %AB = fmul fast <2 x double> %A, %B
10464/// %Sub = fsub fast <2 x double> %AB, %C
10465/// %Add = fadd fast <2 x double> %AB, %C
10466/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10467/// <2 x i32> <i32 0, i32 3>
10468/// There is a def for %Addsub here, which potentially can be replaced by
10469/// X86ISD::ADDSUB operation:
10470/// %Addsub = X86ISD::ADDSUB %AB, %C
10471/// and such ADDSUB can further be replaced with FMADDSUB:
10472/// %Addsub = FMADDSUB %A, %B, %C.
10473///
10474/// The main reason why this method is called before the replacement of the
10475/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10476/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10477/// FMADDSUB is.
10478static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10479 SelectionDAG &DAG,
10480 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10481 unsigned ExpectedUses) {
10482 if (Opnd0.getOpcode() != ISD::FMUL ||
10483 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10484 return false;
10485
10486 // FIXME: These checks must match the similar ones in
10487 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10488 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10489 // or MUL + ADDSUB to FMADDSUB.
10490 const TargetOptions &Options = DAG.getTarget().Options;
10491 bool AllowFusion =
10492 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10493 if (!AllowFusion)
10494 return false;
10495
10496 Opnd2 = Opnd1;
10497 Opnd1 = Opnd0.getOperand(1);
10498 Opnd0 = Opnd0.getOperand(0);
10499
10500 return true;
10501}
10502
10503/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10504/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10505/// X86ISD::FMSUBADD node.
10506static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10507 const X86Subtarget &Subtarget,
10508 SelectionDAG &DAG) {
10509 SDValue Opnd0, Opnd1;
10510 unsigned NumExtracts;
10511 bool IsSubAdd;
10512 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10513 IsSubAdd))
10514 return SDValue();
10515
10516 MVT VT = BV->getSimpleValueType(0);
10517 SDLoc DL(BV);
10518
10519 // Try to generate X86ISD::FMADDSUB node here.
10520 SDValue Opnd2;
10521 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10522 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10523 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10524 }
10525
10526 // We only support ADDSUB.
10527 if (IsSubAdd)
10528 return SDValue();
10529
10530 // There are no known X86 targets with 512-bit ADDSUB instructions!
10531 // Convert to blend(fsub,fadd).
10532 if (VT.is512BitVector()) {
10533 SmallVector<int> Mask;
10534 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10535 Mask.push_back(I);
10536 Mask.push_back(I + E + 1);
10537 }
10538 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10539 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10540 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10541 }
10542
10543 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10544}
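// A minimal standalone check (plain C++, no LLVM types) of the 512-bit
// fallback above: the shuffle mask picks even lanes from the FSUB result and
// odd lanes from the FADD result, which is the ADDSUB semantics.
// buildAddSubBlendMask is an illustrative name, not an LLVM API.
#include <vector>

static std::vector<int> buildAddSubBlendMask(int NumElts) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; I += 2) {
    Mask.push_back(I);               // even lane <- Sub (shuffle operand 0)
    Mask.push_back(I + NumElts + 1); // odd lane  <- Add (operand 1, offset E)
  }
  return Mask;
}
// For NumElts == 8 (e.g. v8f64) this yields {0, 9, 2, 11, 4, 13, 6, 15}.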
10545
10546static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10547 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10548 // Initialize outputs to known values.
10549 MVT VT = BV->getSimpleValueType(0);
10550 HOpcode = ISD::DELETED_NODE;
10551 V0 = DAG.getUNDEF(VT);
10552 V1 = DAG.getUNDEF(VT);
10553
10554 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10555 // half of the result is calculated independently from the 128-bit halves of
10556 // the inputs, so that makes the index-checking logic below more complicated.
10557 unsigned NumElts = VT.getVectorNumElements();
10558 unsigned GenericOpcode = ISD::DELETED_NODE;
10559 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10560 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10561 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10562 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10563 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10564 // Ignore undef elements.
10565 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10566 if (Op.isUndef())
10567 continue;
10568
10569 // If there's an opcode mismatch, we're done.
10570 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10571 return false;
10572
10573 // Initialize horizontal opcode.
10574 if (HOpcode == ISD::DELETED_NODE) {
10575 GenericOpcode = Op.getOpcode();
10576 switch (GenericOpcode) {
10577 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10578 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10579 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10580 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10581 default: return false;
10582 }
10583 }
10584
10585 SDValue Op0 = Op.getOperand(0);
10586 SDValue Op1 = Op.getOperand(1);
10587 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10588 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10589 Op0.getOperand(0) != Op1.getOperand(0) ||
10590 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10591 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10592 return false;
10593
10594 // The source vector is chosen based on which 64-bit half of the
10595 // destination vector is being calculated.
10596 if (j < NumEltsIn64Bits) {
10597 if (V0.isUndef())
10598 V0 = Op0.getOperand(0);
10599 } else {
10600 if (V1.isUndef())
10601 V1 = Op0.getOperand(0);
10602 }
10603
10604 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10605 if (SourceVec != Op0.getOperand(0))
10606 return false;
10607
10608 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10609 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10610 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10611 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10612 (j % NumEltsIn64Bits) * 2;
10613 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10614 continue;
10615
10616 // If this is not a commutative op, this does not match.
10617 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10618 return false;
10619
10620 // Addition is commutative, so try swapping the extract indexes.
10621 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10622 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10623 continue;
10624
10625 // Extract indexes do not match horizontal requirement.
10626 return false;
10627 }
10628 }
10629 // We matched. Opcode and operands are returned by reference as arguments.
10630 return true;
10631}
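// A minimal standalone sketch (plain C++, no LLVM types) of the index check
// above, listing which source vector and which extract-index pair each result
// lane of a 256-bit v8i32 horizontal add must use. printHopLaneSources is an
// illustrative name, not an LLVM API.
#include <cstdio>

static void printHopLaneSources() {
  const unsigned NumElts = 8;                                  // v8i32
  const unsigned Num128BitChunks = 2;                          // 256-bit
  const unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; // 4
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;       // 2
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned Lane = i * NumEltsIn128Bits + j;
      unsigned Expected = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = j < NumEltsIn64Bits ? "V0" : "V1";
      // e.g. "lane 2: V1[0] op V1[1]" -- the upper half of each 64-bit piece
      // of the result is fed from V1, the lower half from V0.
      std::printf("lane %u: %s[%u] op %s[%u]\n", Lane, Src, Expected, Src,
                  Expected + 1);
    }
}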
10632
10633static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10634 SelectionDAG &DAG, unsigned HOpcode,
10635 SDValue V0, SDValue V1) {
10636 // If either input vector is not the same size as the build vector,
10637 // extract/insert the low bits to the correct size.
10638 // This is free (examples: zmm --> xmm, xmm --> ymm).
10639 MVT VT = BV->getSimpleValueType(0);
10640 unsigned Width = VT.getSizeInBits();
10641 if (V0.getValueSizeInBits() > Width)
10642 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10643 else if (V0.getValueSizeInBits() < Width)
10644 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10645
10646 if (V1.getValueSizeInBits() > Width)
10647 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10648 else if (V1.getValueSizeInBits() < Width)
10649 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10650
10651 unsigned NumElts = VT.getVectorNumElements();
10652 APInt DemandedElts = APInt::getAllOnes(NumElts);
10653 for (unsigned i = 0; i != NumElts; ++i)
10654 if (BV->getOperand(i).isUndef())
10655 DemandedElts.clearBit(i);
10656
10657 // If we don't need the upper xmm, then perform as a xmm hop.
10658 unsigned HalfNumElts = NumElts / 2;
10659 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10660 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10661 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10662 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10663 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10664 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10665 }
10666
10667 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10668}
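// A minimal standalone sketch (plain C++, a uint32_t mask standing in for
// APInt) of the demanded-elements test above: if no upper-half lane of a
// 256-bit hop result is demanded, the op can be performed at 128 bits.
// lowerHalfHopSuffices is an illustrative name, not an LLVM API.
#include <cstdint>

static bool lowerHalfHopSuffices(uint32_t DemandedElts, unsigned NumElts) {
  unsigned HalfNumElts = NumElts / 2;
  // Mirrors the DemandedElts.lshr(HalfNumElts) == 0 check in the code above.
  return (DemandedElts >> HalfNumElts) == 0;
}
// Example: for a v8i32 build_vector that only uses lanes 1 and 2,
// lowerHalfHopSuffices(0b00000110, 8) returns true, so an xmm hop is enough.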
10669
10670/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10671static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10672 const X86Subtarget &Subtarget,
10673 SelectionDAG &DAG) {
10674 // We need at least 2 non-undef elements to make this worthwhile by default.
10675 unsigned NumNonUndefs =
10676 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10677 if (NumNonUndefs < 2)
10678 return SDValue();
10679
10680 // There are 4 sets of horizontal math operations distinguished by type:
10681 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10682 // subtarget feature. Try to match those "native" patterns first.
10683 MVT VT = BV->getSimpleValueType(0);
10684 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10685 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10686 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10687 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10688 unsigned HOpcode;
10689 SDValue V0, V1;
10690 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10691 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10692 }
10693
10694 // Try harder to match 256-bit ops by using extract/concat.
10695 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10696 return SDValue();
10697
10698  // Count the number of UNDEF operands in the input build_vector.
10699 unsigned NumElts = VT.getVectorNumElements();
10700 unsigned Half = NumElts / 2;
10701 unsigned NumUndefsLO = 0;
10702 unsigned NumUndefsHI = 0;
10703 for (unsigned i = 0, e = Half; i != e; ++i)
10704 if (BV->getOperand(i)->isUndef())
10705 NumUndefsLO++;
10706
10707 for (unsigned i = Half, e = NumElts; i != e; ++i)
10708 if (BV->getOperand(i)->isUndef())
10709 NumUndefsHI++;
10710
10711 SDLoc DL(BV);
10712 SDValue InVec0, InVec1;
10713 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10714 SDValue InVec2, InVec3;
10715 unsigned X86Opcode;
10716 bool CanFold = true;
10717
10718 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10719 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10720 InVec3) &&
10721 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10722 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10723 X86Opcode = X86ISD::HADD;
10724 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10725 InVec1) &&
10726 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10727 InVec3) &&
10728 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10729 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10730 X86Opcode = X86ISD::HSUB;
10731 else
10732 CanFold = false;
10733
10734 if (CanFold) {
10735 // Do not try to expand this build_vector into a pair of horizontal
10736 // add/sub if we can emit a pair of scalar add/sub.
10737 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10738 return SDValue();
10739
10740 // Convert this build_vector into a pair of horizontal binops followed by
10741 // a concat vector. We must adjust the outputs from the partial horizontal
10742 // matching calls above to account for undefined vector halves.
10743 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10744 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10745      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10746 bool isUndefLO = NumUndefsLO == Half;
10747 bool isUndefHI = NumUndefsHI == Half;
10748 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10749 isUndefHI);
10750 }
10751 }
10752
10753 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10754 VT == MVT::v16i16) {
10755 unsigned X86Opcode;
10756 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10757 X86Opcode = X86ISD::HADD;
10758 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10759 InVec1))
10760 X86Opcode = X86ISD::HSUB;
10761 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10762 InVec1))
10763 X86Opcode = X86ISD::FHADD;
10764 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10765 InVec1))
10766 X86Opcode = X86ISD::FHSUB;
10767 else
10768 return SDValue();
10769
10770 // Don't try to expand this build_vector into a pair of horizontal add/sub
10771 // if we can simply emit a pair of scalar add/sub.
10772 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10773 return SDValue();
10774
10775 // Convert this build_vector into two horizontal add/sub followed by
10776 // a concat vector.
10777 bool isUndefLO = NumUndefsLO == Half;
10778 bool isUndefHI = NumUndefsHI == Half;
10779 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10780 isUndefLO, isUndefHI);
10781 }
10782
10783 return SDValue();
10784}
10785
10786static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10787 SelectionDAG &DAG);
10788
10789/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10790/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10791/// just apply the bit operation to the vectors.
10792/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10793/// from this, but enough scalar bit operations are created from the later
10794/// legalization + scalarization stages to need basic support.
10795static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10796 const X86Subtarget &Subtarget,
10797 SelectionDAG &DAG) {
10798 SDLoc DL(Op);
10799 MVT VT = Op->getSimpleValueType(0);
10800 unsigned NumElems = VT.getVectorNumElements();
10801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10802
10803 // Check that all elements have the same opcode.
10804 // TODO: Should we allow UNDEFS and if so how many?
10805 unsigned Opcode = Op->getOperand(0).getOpcode();
10806 for (unsigned i = 1; i < NumElems; ++i)
10807 if (Opcode != Op->getOperand(i).getOpcode())
10808 return SDValue();
10809
10810 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10811 bool IsShift = false;
10812 switch (Opcode) {
10813 default:
10814 return SDValue();
10815 case ISD::SHL:
10816 case ISD::SRL:
10817 case ISD::SRA:
10818 IsShift = true;
10819 break;
10820 case ISD::AND:
10821 case ISD::XOR:
10822 case ISD::OR:
10823 // Don't do this if the buildvector is a splat - we'd replace one
10824 // constant with an entire vector.
10825 if (Op->getSplatValue())
10826 return SDValue();
10827 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10828 return SDValue();
10829 break;
10830 }
10831
10832 SmallVector<SDValue, 4> LHSElts, RHSElts;
10833 for (SDValue Elt : Op->ops()) {
10834 SDValue LHS = Elt.getOperand(0);
10835 SDValue RHS = Elt.getOperand(1);
10836
10837 // We expect the canonicalized RHS operand to be the constant.
10838 if (!isa<ConstantSDNode>(RHS))
10839 return SDValue();
10840
10841 // Extend shift amounts.
10842 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10843 if (!IsShift)
10844 return SDValue();
10845 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10846 }
10847
10848 LHSElts.push_back(LHS);
10849 RHSElts.push_back(RHS);
10850 }
10851
10852 // Limit to shifts by uniform immediates.
10853 // TODO: Only accept vXi8/vXi64 special cases?
10854 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10855 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10856 return SDValue();
10857
10858 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10859 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10860 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10861
10862 if (!IsShift)
10863 return Res;
10864
10865 // Immediately lower the shift to ensure the constant build vector doesn't
10866 // get converted to a constant pool before the shift is lowered.
10867 return LowerShift(Res, Subtarget, DAG);
10868}
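// A minimal scalar analogue (plain C++, arrays standing in for vectors) of
// the rewrite above for the shift case: the per-lane operands are split into
// an LHS "build_vector" and an RHS "build_vector", the RHS must be a uniform
// immediate, and a single vector-style shift replaces N scalar shifts.
// buildVectorShiftLanes is an illustrative name, not an LLVM API.
#include <cstdint>

static bool buildVectorShiftLanes(const uint32_t *LHSElts,
                                  const uint32_t *RHSElts, unsigned NumElts,
                                  uint32_t *Res) {
  // Limit to shifts by uniform immediates, as the lowering above does.
  for (unsigned i = 1; i != NumElts; ++i)
    if (RHSElts[i] != RHSElts[0])
      return false;
  // One lane-wise op instead of NumElts scalar ops folded into inserts.
  for (unsigned i = 0; i != NumElts; ++i)
    Res[i] = LHSElts[i] << RHSElts[i];
  return true;
}
// Example: LHS {1, 2, 3, 4} shifted by a uniform {3, 3, 3, 3} yields
// {8, 16, 24, 32} from a single vector shift.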
10869
10870/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10871/// functionality to do this, so it's all zeros, all ones, or some derivation
10872/// that is cheap to calculate.
10873static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10874 const X86Subtarget &Subtarget) {
10875 SDLoc DL(Op);
10876 MVT VT = Op.getSimpleValueType();
10877
10878 // Vectors containing all zeros can be matched by pxor and xorps.
10879 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10880 return Op;
10881
10882 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10883 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10884 // vpcmpeqd on 256-bit vectors.
10885 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10886 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10887 return Op;
10888
10889 return getOnesVector(VT, DAG, DL);
10890 }
10891
10892 return SDValue();
10893}
10894
10895/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10896/// from a vector of source values and a vector of extraction indices.
10897/// The vectors might be manipulated to match the type of the permute op.
10898static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10899 SDLoc &DL, SelectionDAG &DAG,
10900 const X86Subtarget &Subtarget) {
10901 MVT ShuffleVT = VT;
10902 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10903 unsigned NumElts = VT.getVectorNumElements();
10904 unsigned SizeInBits = VT.getSizeInBits();
10905
10906 // Adjust IndicesVec to match VT size.
10907  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10908         "Illegal variable permute mask size");
10909 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10910 // Narrow/widen the indices vector to the correct size.
10911 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10912 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10913 NumElts * VT.getScalarSizeInBits());
10914 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10915 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10916 SDLoc(IndicesVec), SizeInBits);
10917 // Zero-extend the index elements within the vector.
10918 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10919 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10920 IndicesVT, IndicesVec);
10921 }
10922 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10923
10924  // Handle a SrcVec whose type doesn't match VT.
10925 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10926 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10927 // Handle larger SrcVec by treating it as a larger permute.
10928 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10929 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10930 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10931 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10932 Subtarget, DAG, SDLoc(IndicesVec));
10933 SDValue NewSrcVec =
10934 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10935 if (NewSrcVec)
10936 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10937 return SDValue();
10938 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10939 // Widen smaller SrcVec to match VT.
10940 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10941 } else
10942 return SDValue();
10943 }
10944
10945 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10946    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10947 EVT SrcVT = Idx.getValueType();
10948 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10949 uint64_t IndexScale = 0;
10950 uint64_t IndexOffset = 0;
10951
10952 // If we're scaling a smaller permute op, then we need to repeat the
10953 // indices, scaling and offsetting them as well.
10954 // e.g. v4i32 -> v16i8 (Scale = 4)
10955 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10956 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10957 for (uint64_t i = 0; i != Scale; ++i) {
10958 IndexScale |= Scale << (i * NumDstBits);
10959 IndexOffset |= i << (i * NumDstBits);
10960 }
10961
10962 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10963 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10964 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10965 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10966 return Idx;
10967 };
10968
10969 unsigned Opcode = 0;
10970 switch (VT.SimpleTy) {
10971 default:
10972 break;
10973 case MVT::v16i8:
10974 if (Subtarget.hasSSSE3())
10975 Opcode = X86ISD::PSHUFB;
10976 break;
10977 case MVT::v8i16:
10978 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10979 Opcode = X86ISD::VPERMV;
10980 else if (Subtarget.hasSSSE3()) {
10981 Opcode = X86ISD::PSHUFB;
10982 ShuffleVT = MVT::v16i8;
10983 }
10984 break;
10985 case MVT::v4f32:
10986 case MVT::v4i32:
10987 if (Subtarget.hasAVX()) {
10988 Opcode = X86ISD::VPERMILPV;
10989 ShuffleVT = MVT::v4f32;
10990 } else if (Subtarget.hasSSSE3()) {
10991 Opcode = X86ISD::PSHUFB;
10992 ShuffleVT = MVT::v16i8;
10993 }
10994 break;
10995 case MVT::v2f64:
10996 case MVT::v2i64:
10997 if (Subtarget.hasAVX()) {
10998 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10999 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11000 Opcode = X86ISD::VPERMILPV;
11001 ShuffleVT = MVT::v2f64;
11002 } else if (Subtarget.hasSSE41()) {
11003 // SSE41 can compare v2i64 - select between indices 0 and 1.
11004 return DAG.getSelectCC(
11005 DL, IndicesVec,
11006 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11007 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11008 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11009 ISD::CondCode::SETEQ);
11010 }
11011 break;
11012 case MVT::v32i8:
11013 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11014 Opcode = X86ISD::VPERMV;
11015 else if (Subtarget.hasXOP()) {
11016 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11017 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11018 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11019 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11020 return DAG.getNode(
11021 ISD::CONCAT_VECTORS, DL, VT,
11022 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11023 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11024 } else if (Subtarget.hasAVX()) {
11025 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11026 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11027 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11028 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11029 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11030 ArrayRef<SDValue> Ops) {
11031 // Permute Lo and Hi and then select based on index range.
11032 // This works as SHUFB uses bits[3:0] to permute elements and we don't
11033 // care about the bit[7] as its just an index vector.
11034 SDValue Idx = Ops[2];
11035 EVT VT = Idx.getValueType();
11036 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11037 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11038 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11039 ISD::CondCode::SETGT);
11040 };
11041 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11042 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11043 PSHUFBBuilder);
11044 }
11045 break;
11046 case MVT::v16i16:
11047 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11048 Opcode = X86ISD::VPERMV;
11049 else if (Subtarget.hasAVX()) {
11050 // Scale to v32i8 and perform as v32i8.
11051 IndicesVec = ScaleIndices(IndicesVec, 2);
11052 return DAG.getBitcast(
11053 VT, createVariablePermute(
11054 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11055 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11056 }
11057 break;
11058 case MVT::v8f32:
11059 case MVT::v8i32:
11060 if (Subtarget.hasAVX2())
11061 Opcode = X86ISD::VPERMV;
11062 else if (Subtarget.hasAVX()) {
11063 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11064 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11065 {0, 1, 2, 3, 0, 1, 2, 3});
11066 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11067 {4, 5, 6, 7, 4, 5, 6, 7});
11068 if (Subtarget.hasXOP())
11069 return DAG.getBitcast(
11070 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11071 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11072 // Permute Lo and Hi and then select based on index range.
11073 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11074 SDValue Res = DAG.getSelectCC(
11075 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11076 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11077 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11078 ISD::CondCode::SETGT);
11079 return DAG.getBitcast(VT, Res);
11080 }
11081 break;
11082 case MVT::v4i64:
11083 case MVT::v4f64:
11084 if (Subtarget.hasAVX512()) {
11085 if (!Subtarget.hasVLX()) {
11086 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11087 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11088 SDLoc(SrcVec));
11089 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11090 DAG, SDLoc(IndicesVec));
11091 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11092 DAG, Subtarget);
11093 return extract256BitVector(Res, 0, DAG, DL);
11094 }
11095 Opcode = X86ISD::VPERMV;
11096 } else if (Subtarget.hasAVX()) {
11097 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11098 SDValue LoLo =
11099 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11100 SDValue HiHi =
11101 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11102 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11103 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11104 if (Subtarget.hasXOP())
11105 return DAG.getBitcast(
11106 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11107 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11108 // Permute Lo and Hi and then select based on index range.
11109 // This works as VPERMILPD only uses index bit[1] to permute elements.
11110 SDValue Res = DAG.getSelectCC(
11111 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11112 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11113 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11114 ISD::CondCode::SETGT);
11115 return DAG.getBitcast(VT, Res);
11116 }
11117 break;
11118 case MVT::v64i8:
11119 if (Subtarget.hasVBMI())
11120 Opcode = X86ISD::VPERMV;
11121 break;
11122 case MVT::v32i16:
11123 if (Subtarget.hasBWI())
11124 Opcode = X86ISD::VPERMV;
11125 break;
11126 case MVT::v16f32:
11127 case MVT::v16i32:
11128 case MVT::v8f64:
11129 case MVT::v8i64:
11130 if (Subtarget.hasAVX512())
11131 Opcode = X86ISD::VPERMV;
11132 break;
11133 }
11134 if (!Opcode)
11135 return SDValue();
11136
11137  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11138         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11139         "Illegal variable permute shuffle type");
11140
11141 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11142 if (Scale > 1)
11143 IndicesVec = ScaleIndices(IndicesVec, Scale);
11144
11145 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11146 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11147
11148 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11149 SDValue Res = Opcode == X86ISD::VPERMV
11150 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11151 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11152 return DAG.getBitcast(VT, Res);
11153}
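// A minimal standalone check (plain C++, no LLVM types) of the ScaleIndices
// arithmetic above for the v4i32 -> v16i8 case (Scale = 4): multiplying a
// 32-bit index lane by 0x04040404 and adding 0x03020100 produces the four
// byte indices of that element, ready for PSHUFB. scaleOneIndexLane is an
// illustrative name, not an LLVM API.
#include <cstdint>

static uint32_t scaleOneIndexLane(uint32_t Lane) {
  const uint64_t Scale = 4;               // 32-bit index driving a byte shuffle
  const unsigned NumDstBits = 32 / Scale; // 8
  uint32_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= (uint32_t)(Scale << (i * NumDstBits)); // 0x04040404
    IndexOffset |= (uint32_t)(i << (i * NumDstBits));    // 0x03020100
  }
  return Lane * IndexScale + IndexOffset; // the per-lane MUL + ADD above
}
// Example: scaleOneIndexLane(2) == 0x0B0A0908, i.e. byte indices 8..11, which
// are exactly the four bytes of 32-bit source element 2.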
11154
11155// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11156// reasoned to be a permutation of a vector by indices in a non-constant vector.
11157// (build_vector (extract_elt V, (extract_elt I, 0)),
11158// (extract_elt V, (extract_elt I, 1)),
11159// ...
11160// ->
11161// (vpermv I, V)
11162//
11163// TODO: Handle undefs
11164// TODO: Utilize pshufb and zero mask blending to support more efficient
11165// construction of vectors with constant-0 elements.
11166static SDValue
11167LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11168 const X86Subtarget &Subtarget) {
11169 SDValue SrcVec, IndicesVec;
11170 // Check for a match of the permute source vector and permute index elements.
11171 // This is done by checking that the i-th build_vector operand is of the form:
11172 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11173 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11174 SDValue Op = V.getOperand(Idx);
11175 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11176 return SDValue();
11177
11178 // If this is the first extract encountered in V, set the source vector,
11179 // otherwise verify the extract is from the previously defined source
11180 // vector.
11181 if (!SrcVec)
11182 SrcVec = Op.getOperand(0);
11183 else if (SrcVec != Op.getOperand(0))
11184 return SDValue();
11185 SDValue ExtractedIndex = Op->getOperand(1);
11186 // Peek through extends.
11187 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11188 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11189 ExtractedIndex = ExtractedIndex.getOperand(0);
11190 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11191 return SDValue();
11192
11193 // If this is the first extract from the index vector candidate, set the
11194 // indices vector, otherwise verify the extract is from the previously
11195 // defined indices vector.
11196 if (!IndicesVec)
11197 IndicesVec = ExtractedIndex.getOperand(0);
11198 else if (IndicesVec != ExtractedIndex.getOperand(0))
11199 return SDValue();
11200
11201 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11202 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11203 return SDValue();
11204 }
11205
11206 SDLoc DL(V);
11207 MVT VT = V.getSimpleValueType();
11208 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11209}
11210
11211SDValue
11212X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11213 SDLoc dl(Op);
11214
11215 MVT VT = Op.getSimpleValueType();
11216 MVT EltVT = VT.getVectorElementType();
11217 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11218 unsigned NumElems = Op.getNumOperands();
11219
11220 // Generate vectors for predicate vectors.
11221 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11222 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11223
11224 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11225 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11226
11227 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11228 return VectorConstant;
11229
11230 unsigned EVTBits = EltVT.getSizeInBits();
11231 APInt UndefMask = APInt::getZero(NumElems);
11232 APInt FrozenUndefMask = APInt::getZero(NumElems);
11233 APInt ZeroMask = APInt::getZero(NumElems);
11234 APInt NonZeroMask = APInt::getZero(NumElems);
11235 bool IsAllConstants = true;
11236 SmallSet<SDValue, 8> Values;
11237 unsigned NumConstants = NumElems;
11238 for (unsigned i = 0; i < NumElems; ++i) {
11239 SDValue Elt = Op.getOperand(i);
11240 if (Elt.isUndef()) {
11241 UndefMask.setBit(i);
11242 continue;
11243 }
11244 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11245 FrozenUndefMask.setBit(i);
11246 continue;
11247 }
11248 Values.insert(Elt);
11249 if (!isIntOrFPConstant(Elt)) {
11250 IsAllConstants = false;
11251 NumConstants--;
11252 }
11253 if (X86::isZeroNode(Elt)) {
11254 ZeroMask.setBit(i);
11255 } else {
11256 NonZeroMask.setBit(i);
11257 }
11258 }
11259
11260 // All undef vector. Return an UNDEF.
11261 if (UndefMask.isAllOnes())
11262 return DAG.getUNDEF(VT);
11263
11264 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11265 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11266 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11267 // and blend the FREEZE-UNDEF operands back in.
11268 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11269 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11270 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11271 SmallVector<int, 16> BlendMask(NumElems, -1);
11272 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11273 for (unsigned i = 0; i < NumElems; ++i) {
11274 if (UndefMask[i]) {
11275 BlendMask[i] = -1;
11276 continue;
11277 }
11278 BlendMask[i] = i;
11279 if (!FrozenUndefMask[i])
11280 Elts[i] = Op.getOperand(i);
11281 else
11282 BlendMask[i] += NumElems;
11283 }
11284 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11285 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11286 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11287 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11288 }
11289
11290 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11291
11292 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11293 // lowering to a smaller build vector and padding with undef/zero.
11294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11295 !isFoldableUseOfShuffle(BV)) {
11296 unsigned UpperElems = NumElems / 2;
11297 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11299 if (NumUpperUndefsOrZeros >= UpperElems) {
11300 if (VT.is512BitVector() &&
11301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11302 UpperElems = NumElems - (NumElems / 4);
11303 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11304 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11305 SDValue NewBV =
11306 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11307 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11308 }
11309 }
11310
11311 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11312 return AddSub;
11313 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11314 return HorizontalOp;
11315 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11316 return Broadcast;
11317 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11318 return BitOp;
11319
11320 unsigned NumZero = ZeroMask.popcount();
11321 unsigned NumNonZero = NonZeroMask.popcount();
11322
11323 // If we are inserting one variable into a vector of non-zero constants, try
11324 // to avoid loading each constant element as a scalar. Load the constants as a
11325 // vector and then insert the variable scalar element. If insertion is not
11326 // supported, fall back to a shuffle to get the scalar blended with the
11327 // constants. Insertion into a zero vector is handled as a special-case
11328 // somewhere below here.
11329 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11330 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11331 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11332 // Create an all-constant vector. The variable element in the old
11333 // build vector is replaced by undef in the constant vector. Save the
11334 // variable scalar element and its index for use in the insertelement.
11335 LLVMContext &Context = *DAG.getContext();
11336 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11337 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11338 SDValue VarElt;
11339 SDValue InsIndex;
11340 for (unsigned i = 0; i != NumElems; ++i) {
11341 SDValue Elt = Op.getOperand(i);
11342 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11343 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11344 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11345 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11346 else if (!Elt.isUndef()) {
11347        assert(!VarElt.getNode() && !InsIndex.getNode() &&
11348               "Expected one variable element in this vector");
11349 VarElt = Elt;
11350 InsIndex = DAG.getVectorIdxConstant(i, dl);
11351 }
11352 }
11353 Constant *CV = ConstantVector::get(ConstVecOps);
11354 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11355
11356    // The constants we just created may not be legal (e.g., floating point). We
11357    // must lower the vector right here because we cannot guarantee that we'll
11358 // legalize it before loading it. This is also why we could not just create
11359 // a new build vector here. If the build vector contains illegal constants,
11360 // it could get split back up into a series of insert elements.
11361 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11362 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11363 MachineFunction &MF = DAG.getMachineFunction();
11364 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11365 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11366 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11367 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11368 if (InsertC < NumEltsInLow128Bits)
11369 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11370
11371 // There's no good way to insert into the high elements of a >128-bit
11372 // vector, so use shuffles to avoid an extract/insert sequence.
11373    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11374    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11375 SmallVector<int, 8> ShuffleMask;
11376 unsigned NumElts = VT.getVectorNumElements();
11377 for (unsigned i = 0; i != NumElts; ++i)
11378 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11379 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11380 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11381 }
11382
11383 // Special case for single non-zero, non-undef, element.
11384 if (NumNonZero == 1) {
11385 unsigned Idx = NonZeroMask.countr_zero();
11386 SDValue Item = Op.getOperand(Idx);
11387
11388 // If we have a constant or non-constant insertion into the low element of
11389 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11390 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11391 // depending on what the source datatype is.
11392 if (Idx == 0) {
11393 if (NumZero == 0)
11394 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11395
11396 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11397 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11398 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11399        assert((VT.is128BitVector() || VT.is256BitVector() ||
11400                VT.is512BitVector()) &&
11401               "Expected an SSE value type!");
11402 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11403 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11404 // zero vector.
11405 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11406 }
11407
11408 // We can't directly insert an i8 or i16 into a vector, so zero extend
11409 // it to i32 first.
11410 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11411 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11412 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11413 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11414 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11415 return DAG.getBitcast(VT, Item);
11416 }
11417 }
11418
11419 // Is it a vector logical left shift?
11420 if (NumElems == 2 && Idx == 1 &&
11421 X86::isZeroNode(Op.getOperand(0)) &&
11422 !X86::isZeroNode(Op.getOperand(1))) {
11423 unsigned NumBits = VT.getSizeInBits();
11424 return getVShift(true, VT,
11425 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11426 VT, Op.getOperand(1)),
11427 NumBits/2, DAG, *this, dl);
11428 }
11429
11430 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11431 return SDValue();
11432
11433 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11434 // is a non-constant being inserted into an element other than the low one,
11435 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11436 // movd/movss) to move this into the low element, then shuffle it into
11437 // place.
11438 if (EVTBits == 32) {
11439 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11440 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11441 }
11442 }
11443
11444 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11445 if (Values.size() == 1) {
11446 if (EVTBits == 32) {
11447 // Instead of a shuffle like this:
11448 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11449 // Check if it's possible to issue this instead.
11450 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
11451 unsigned Idx = NonZeroMask.countr_zero();
11452 SDValue Item = Op.getOperand(Idx);
11453 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11454 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11455 }
11456 return SDValue();
11457 }
11458
11459 // A vector full of immediates; various special cases are already
11460 // handled, so this is best done with a single constant-pool load.
11461 if (IsAllConstants)
11462 return SDValue();
11463
11464 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11465 return V;
11466
11467 // See if we can use a vector load to get all of the elements.
11468 {
11469 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11470 if (SDValue LD =
11471 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11472 return LD;
11473 }
11474
11475 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11476 // build_vector and broadcast it.
11477 // TODO: We could probably generalize this more.
11478 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11479 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11480 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11481 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11482 // Make sure all the even/odd operands match.
11483 for (unsigned i = 2; i != NumElems; ++i)
11484 if (Ops[i % 2] != Op.getOperand(i))
11485 return false;
11486 return true;
11487 };
11488 if (CanSplat(Op, NumElems, Ops)) {
11489 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11490 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11491 // Create a new build vector and cast to v2i64/v2f64.
11492 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11493 DAG.getBuildVector(NarrowVT, dl, Ops));
11494 // Broadcast from v2i64/v2f64 and cast to final VT.
11495 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11496 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11497 NewBV));
11498 }
11499 }
11500
11501 // For AVX-length vectors, build the individual 128-bit pieces and use
11502 // shuffles to put them in place.
11503 if (VT.getSizeInBits() > 128) {
11504 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11505
11506 // Build both the lower and upper subvector.
11507 SDValue Lower =
11508 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11509 SDValue Upper = DAG.getBuildVector(
11510 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11511
11512 // Recreate the wider vector with the lower and upper part.
11513 return concatSubVectors(Lower, Upper, DAG, dl);
11514 }
11515
11516 // Let legalizer expand 2-wide build_vectors.
11517 if (EVTBits == 64) {
11518 if (NumNonZero == 1) {
11519 // One half is zero or undef.
11520 unsigned Idx = NonZeroMask.countr_zero();
11521 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11522 Op.getOperand(Idx));
11523 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11524 }
11525 return SDValue();
11526 }
11527
11528 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11529 if (EVTBits == 8 && NumElems == 16)
11530 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11531 DAG, Subtarget))
11532 return V;
11533
11534 if (EltVT == MVT::i16 && NumElems == 8)
11535 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11536 DAG, Subtarget))
11537 return V;
11538
11539 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11540 if (EVTBits == 32 && NumElems == 4)
11541 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11542 return V;
11543
11544 // If element VT is == 32 bits, turn it into a number of shuffles.
11545 if (NumElems == 4 && NumZero > 0) {
11546 SmallVector<SDValue, 8> Ops(NumElems);
11547 for (unsigned i = 0; i < 4; ++i) {
11548 bool isZero = !NonZeroMask[i];
11549 if (isZero)
11550 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11551 else
11552 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11553 }
11554
11555 for (unsigned i = 0; i < 2; ++i) {
11556 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11557      default: llvm_unreachable("Unexpected NonZero count");
11558 case 0:
11559 Ops[i] = Ops[i*2]; // Must be a zero vector.
11560 break;
11561 case 1:
11562 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11563 break;
11564 case 2:
11565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11566 break;
11567 case 3:
11568 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11569 break;
11570 }
11571 }
11572
11573 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11574 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11575 int MaskVec[] = {
11576 Reverse1 ? 1 : 0,
11577 Reverse1 ? 0 : 1,
11578 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11579 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11580 };
11581 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11582 }
11583
11584  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11585
11586 // Check for a build vector from mostly shuffle plus few inserting.
11587 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11588 return Sh;
11589
11590 // For SSE 4.1, use insertps to put the high elements into the low element.
11591 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11592 SDValue Result;
11593 if (!Op.getOperand(0).isUndef())
11594 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11595 else
11596 Result = DAG.getUNDEF(VT);
11597
11598 for (unsigned i = 1; i < NumElems; ++i) {
11599 if (Op.getOperand(i).isUndef()) continue;
11600 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11601 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11602 }
11603 return Result;
11604 }
11605
11606 // Otherwise, expand into a number of unpckl*, start by extending each of
11607 // our (non-undef) elements to the full vector width with the element in the
11608 // bottom slot of the vector (which generates no code for SSE).
11609 SmallVector<SDValue, 8> Ops(NumElems);
11610 for (unsigned i = 0; i < NumElems; ++i) {
11611 if (!Op.getOperand(i).isUndef())
11612 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11613 else
11614 Ops[i] = DAG.getUNDEF(VT);
11615 }
11616
11617 // Next, we iteratively mix elements, e.g. for v4f32:
11618 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11619 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11620 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11621 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11622 // Generate scaled UNPCKL shuffle mask.
11623 SmallVector<int, 16> Mask;
11624 for(unsigned i = 0; i != Scale; ++i)
11625 Mask.push_back(i);
11626 for (unsigned i = 0; i != Scale; ++i)
11627 Mask.push_back(NumElems+i);
11628 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11629
11630 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11631 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11632 }
11633 return Ops[0];
11634}
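// A minimal standalone sketch (plain C++, no LLVM types) of the final unpckl
// expansion above, printing the scaled shuffle mask used at each step for a
// v4f32 build_vector (-1 marks an undef lane). Step 1 interleaves pairs of
// scalars, step 2 interleaves the two halves. printUnpcklExpansionMasks is an
// illustrative name, not an LLVM API.
#include <cstdio>
#include <vector>

static void printUnpcklExpansionMasks() {
  const unsigned NumElems = 4; // v4f32
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    std::vector<int> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);            // low lanes of the first operand
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i); // low lanes of the second operand
    Mask.resize(NumElems, -1);      // remaining lanes are undef
    std::printf("Scale %u:", Scale);
    for (int M : Mask)
      std::printf(" %d", M);
    std::printf("\n");
  }
}
// Prints "Scale 1: 0 4 -1 -1" (unpcklps) and "Scale 2: 0 1 4 5" (unpcklpd).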
11635
11636// 256-bit AVX can use the vinsertf128 instruction
11637// to create 256-bit vectors from two other 128-bit ones.
11638// TODO: Detect subvector broadcast here instead of DAG combine?
11639static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11640 const X86Subtarget &Subtarget) {
11641 SDLoc dl(Op);
11642 MVT ResVT = Op.getSimpleValueType();
11643
11644  assert((ResVT.is256BitVector() ||
11645          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11646
11647 unsigned NumOperands = Op.getNumOperands();
11648 unsigned NumFreezeUndef = 0;
11649 unsigned NumZero = 0;
11650 unsigned NumNonZero = 0;
11651 unsigned NonZeros = 0;
11652 for (unsigned i = 0; i != NumOperands; ++i) {
11653 SDValue SubVec = Op.getOperand(i);
11654 if (SubVec.isUndef())
11655 continue;
11656 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11657 ++NumFreezeUndef;
11658 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11659 ++NumZero;
11660 else {
11661      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11662 NonZeros |= 1 << i;
11663 ++NumNonZero;
11664 }
11665 }
11666
11667 // If we have more than 2 non-zeros, build each half separately.
11668 if (NumNonZero > 2) {
11669 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11670 ArrayRef<SDUse> Ops = Op->ops();
11671 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11672 Ops.slice(0, NumOperands/2));
11673 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11674 Ops.slice(NumOperands/2));
11675 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11676 }
11677
11678 // Otherwise, build it up through insert_subvectors.
11679 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11680 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11681 : DAG.getUNDEF(ResVT));
11682
11683 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11684 unsigned NumSubElems = SubVT.getVectorNumElements();
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 if ((NonZeros & (1 << i)) == 0)
11687 continue;
11688
11689 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11690 Op.getOperand(i),
11691 DAG.getIntPtrConstant(i * NumSubElems, dl));
11692 }
11693
11694 return Vec;
11695}
11696
11697// Returns true if the given node is a type promotion (by concatenating i1
11698// zeros) of the result of a node that already zeros all upper bits of
11699// k-register.
11700// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11701static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11702 const X86Subtarget &Subtarget,
11703 SelectionDAG & DAG) {
11704 SDLoc dl(Op);
11705 MVT ResVT = Op.getSimpleValueType();
11706 unsigned NumOperands = Op.getNumOperands();
11707
11708 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11709 "Unexpected number of operands in CONCAT_VECTORS");
11710
11711 uint64_t Zeros = 0;
11712 uint64_t NonZeros = 0;
11713 for (unsigned i = 0; i != NumOperands; ++i) {
11714 SDValue SubVec = Op.getOperand(i);
11715 if (SubVec.isUndef())
11716 continue;
11717 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11718 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11719 Zeros |= (uint64_t)1 << i;
11720 else
11721 NonZeros |= (uint64_t)1 << i;
11722 }
11723
11724 unsigned NumElems = ResVT.getVectorNumElements();
11725
11726 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11727 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11728 // insert_subvector will give us two kshifts.
11729 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11730 Log2_64(NonZeros) != NumOperands - 1) {
11731 MVT ShiftVT = ResVT;
11732 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11733 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11734 unsigned Idx = Log2_64(NonZeros);
11735 SDValue SubVec = Op.getOperand(Idx);
11736 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11737 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11738 DAG.getUNDEF(ShiftVT), SubVec,
11739 DAG.getIntPtrConstant(0, dl));
11740 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11741 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11742 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11743 DAG.getIntPtrConstant(0, dl));
11744 }
11745
11746 // If there are zero or one non-zeros we can handle this very simply.
11747 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11748 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11749 if (!NonZeros)
11750 return Vec;
11751 unsigned Idx = Log2_64(NonZeros);
11752 SDValue SubVec = Op.getOperand(Idx);
11753 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11754 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11755 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11756 }
11757
11758 if (NumOperands > 2) {
11759 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11760 ArrayRef<SDUse> Ops = Op->ops();
11761 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11762 Ops.slice(0, NumOperands/2));
11763 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11764 Ops.slice(NumOperands/2));
11765 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11766 }
11767
11768 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11769
11770 if (ResVT.getVectorNumElements() >= 16)
11771 return Op; // The operation is legal with KUNPCK
11772
11773 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11774 DAG.getUNDEF(ResVT), Op.getOperand(0),
11775 DAG.getIntPtrConstant(0, dl));
11776 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11777 DAG.getIntPtrConstant(NumElems/2, dl));
11778}
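// Illustrative trace of the KSHIFTL path above (operand values assumed):
// concatenating four v2i1 subvectors { zeros, zeros, X, undef } into v8i1
// gives Zeros == 0b0011 and NonZeros == 0b0100. NonZeros is a power of two,
// NonZeros > Zeros, and Log2_64(NonZeros) == 2 is not the last operand index,
// so X is inserted at index 0 of ShiftVT and shifted left by
// Idx * SubVecNumElts == 2 * 2 == 4 positions with a single KSHIFTL.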
11779
11780static SDValue LowerCONCAT_VECTORS(SDValue Op,
11781 const X86Subtarget &Subtarget,
11782 SelectionDAG &DAG) {
11783 MVT VT = Op.getSimpleValueType();
11784 if (VT.getVectorElementType() == MVT::i1)
11785 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11786
11787 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11788 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11789 Op.getNumOperands() == 4)));
11790
11791 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11792 // from two other 128-bit ones.
11793
11794 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11795 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11796}
11797
11798//===----------------------------------------------------------------------===//
11799// Vector shuffle lowering
11800//
11801// This is an experimental code path for lowering vector shuffles on x86. It is
11802// designed to handle arbitrary vector shuffles and blends, gracefully
11803// degrading performance as necessary. It works hard to recognize idiomatic
11804// shuffles and lower them to optimal instruction patterns without leaving
11805// a framework that allows reasonably efficient handling of all vector shuffle
11806// patterns.
11807//===----------------------------------------------------------------------===//
11808
11809/// Tiny helper function to identify a no-op mask.
11810///
11811/// This is a somewhat boring predicate function. It checks whether the mask
11812/// array input, which is assumed to be a single-input shuffle mask of the kind
11813/// used by the X86 shuffle instructions (not a fully general
11814 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
11815/// in-place shuffle are 'no-op's.
11816static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11817 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11818 assert(Mask[i] >= -1 && "Out of bound mask element!");
11819 if (Mask[i] >= 0 && Mask[i] != i)
11820 return false;
11821 }
11822 return true;
11823}
11824
11825/// Test whether there are elements crossing LaneSizeInBits lanes in this
11826/// shuffle mask.
11827///
11828/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11829/// and we routinely test for these.
11830static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11831 unsigned ScalarSizeInBits,
11832 ArrayRef<int> Mask) {
11833 assert(LaneSizeInBits && ScalarSizeInBits &&
11834 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11835 "Illegal shuffle lane size");
11836 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11837 int Size = Mask.size();
11838 for (int i = 0; i < Size; ++i)
11839 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11840 return true;
11841 return false;
11842}
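// Example (hypothetical masks): with 128-bit lanes and 32-bit scalars,
// LaneSize == 4, so for a v8f32 mask
//   {0, 1, 2, 3, 4, 5, 6, 7} every element stays in its own lane (returns false),
//   {4, 5, 6, 7, 0, 1, 2, 3} pulls e.g. Mask[0] == 4 from lane 1 into lane 0,
// so the second mask is reported as lane-crossing (returns true).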
11843
11844/// Test whether there are elements crossing 128-bit lanes in this
11845/// shuffle mask.
11846static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11847 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11848}
11849
11850/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11851/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11852/// better support 'repeated mask + lane permute' style shuffles.
11853static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11854 unsigned ScalarSizeInBits,
11855 ArrayRef<int> Mask) {
11856 assert(LaneSizeInBits && ScalarSizeInBits &&
11857 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11858 "Illegal shuffle lane size");
11859 int NumElts = Mask.size();
11860 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11861 int NumLanes = NumElts / NumEltsPerLane;
11862 if (NumLanes > 1) {
11863 for (int i = 0; i != NumLanes; ++i) {
11864 int SrcLane = -1;
11865 for (int j = 0; j != NumEltsPerLane; ++j) {
11866 int M = Mask[(i * NumEltsPerLane) + j];
11867 if (M < 0)
11868 continue;
11869 int Lane = (M % NumElts) / NumEltsPerLane;
11870 if (SrcLane >= 0 && SrcLane != Lane)
11871 return true;
11872 SrcLane = Lane;
11873 }
11874 }
11875 }
11876 return false;
11877}
11878
11879/// Test whether a shuffle mask is equivalent within each sub-lane.
11880///
11881/// This checks a shuffle mask to see if it is performing the same
11882/// lane-relative shuffle in each sub-lane. This trivially implies
11883/// that it is also not lane-crossing. It may however involve a blend from the
11884/// same lane of a second vector.
11885///
11886/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11887/// non-trivial to compute in the face of undef lanes. The representation is
11888/// suitable for use with existing 128-bit shuffles as entries from the second
11889/// vector have been remapped to [LaneSize, 2*LaneSize).
11890static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11891 ArrayRef<int> Mask,
11892 SmallVectorImpl<int> &RepeatedMask) {
11893 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11894 RepeatedMask.assign(LaneSize, -1);
11895 int Size = Mask.size();
11896 for (int i = 0; i < Size; ++i) {
11897 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11898 if (Mask[i] < 0)
11899 continue;
11900 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11901 // This entry crosses lanes, so there is no way to model this shuffle.
11902 return false;
11903
11904 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11905 // Adjust second vector indices to start at LaneSize instead of Size.
11906 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11907 : Mask[i] % LaneSize + LaneSize;
11908 if (RepeatedMask[i % LaneSize] < 0)
11909 // This is the first non-undef entry in this slot of a 128-bit lane.
11910 RepeatedMask[i % LaneSize] = LocalM;
11911 else if (RepeatedMask[i % LaneSize] != LocalM)
11912 // Found a mismatch with the repeated mask.
11913 return false;
11914 }
11915 return true;
11916}
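// Example (hypothetical v8f32 masks, 128-bit lanes, LaneSize == 4): the
// two-input mask {0, 8, 1, 9, 4, 12, 5, 13} performs the same lane-relative
// shuffle in both lanes, so this returns true with RepeatedMask == {0, 4, 1, 5}
// (second-vector entries remapped into [LaneSize, 2*LaneSize)). The mask
// {0, 8, 1, 9, 6, 14, 7, 15} fails because the upper lane selects different
// in-lane positions than the lower lane.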
11917
11918/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11919static bool
11920is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11921 SmallVectorImpl<int> &RepeatedMask) {
11922 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11923}
11924
11925static bool
11926is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11927 SmallVector<int, 32> RepeatedMask;
11928 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11929}
11930
11931/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11932static bool
11933is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11934 SmallVectorImpl<int> &RepeatedMask) {
11935 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11936}
11937
11938/// Test whether a target shuffle mask is equivalent within each sub-lane.
11939/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11940static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11941 unsigned EltSizeInBits,
11942 ArrayRef<int> Mask,
11943 SmallVectorImpl<int> &RepeatedMask) {
11944 int LaneSize = LaneSizeInBits / EltSizeInBits;
11945 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11946 int Size = Mask.size();
11947 for (int i = 0; i < Size; ++i) {
11948 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11949 if (Mask[i] == SM_SentinelUndef)
11950 continue;
11951 if (Mask[i] == SM_SentinelZero) {
11952 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11953 return false;
11954 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11955 continue;
11956 }
11957 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11958 // This entry crosses lanes, so there is no way to model this shuffle.
11959 return false;
11960
11961 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11962 // later vector indices to start at multiples of LaneSize instead of Size.
11963 int LaneM = Mask[i] / Size;
11964 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11965 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11966 // This is the first non-undef entry in this slot of a 128-bit lane.
11967 RepeatedMask[i % LaneSize] = LocalM;
11968 else if (RepeatedMask[i % LaneSize] != LocalM)
11969 // Found a mismatch with the repeated mask.
11970 return false;
11971 }
11972 return true;
11973}
11974
11975/// Test whether a target shuffle mask is equivalent within each sub-lane.
11976/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11977static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11978 ArrayRef<int> Mask,
11979 SmallVectorImpl<int> &RepeatedMask) {
11980 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11981 Mask, RepeatedMask);
11982}
11983
11984/// Checks whether the vector elements referenced by two shuffle masks are
11985/// equivalent.
11986static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11987 int Idx, int ExpectedIdx) {
11988 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11989 ExpectedIdx < MaskSize && "Out of range element index");
11990 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11991 return false;
11992
11993 switch (Op.getOpcode()) {
11994 case ISD::BUILD_VECTOR:
11995 // If the values are build vectors, we can look through them to find
11996 // equivalent inputs that make the shuffles equivalent.
11997 // TODO: Handle MaskSize != Op.getNumOperands()?
11998 if (MaskSize == (int)Op.getNumOperands() &&
11999 MaskSize == (int)ExpectedOp.getNumOperands())
12000 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12001 break;
12002 case X86ISD::VBROADCAST:
12003 case X86ISD::VBROADCAST_LOAD:
12004 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12005 return (Op == ExpectedOp &&
12006 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12007 case X86ISD::HADD:
12008 case X86ISD::HSUB:
12009 case X86ISD::FHADD:
12010 case X86ISD::FHSUB:
12011 case X86ISD::PACKSS:
12012 case X86ISD::PACKUS:
12013 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12014 // TODO: Handle MaskSize != NumElts?
12015 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12016 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12017 MVT VT = Op.getSimpleValueType();
12018 int NumElts = VT.getVectorNumElements();
12019 if (MaskSize == NumElts) {
12020 int NumLanes = VT.getSizeInBits() / 128;
12021 int NumEltsPerLane = NumElts / NumLanes;
12022 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12023 bool SameLane =
12024 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12025 bool SameElt =
12026 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12027 return SameLane && SameElt;
12028 }
12029 }
12030 break;
12031 }
12032
12033 return false;
12034}
12035
12036/// Checks whether a shuffle mask is equivalent to an explicit list of
12037/// arguments.
12038///
12039/// This is a fast way to test a shuffle mask against a fixed pattern:
12040///
12041 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12042///
12043/// It returns true if the mask is exactly as wide as the argument list, and
12044/// each element of the mask is either -1 (signifying undef) or the value given
12045/// in the argument.
12046static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12047 SDValue V1 = SDValue(),
12048 SDValue V2 = SDValue()) {
12049 int Size = Mask.size();
12050 if (Size != (int)ExpectedMask.size())
12051 return false;
12052
12053 for (int i = 0; i < Size; ++i) {
12054 assert(Mask[i] >= -1 && "Out of bound mask element!");
12055 int MaskIdx = Mask[i];
12056 int ExpectedIdx = ExpectedMask[i];
12057 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12058 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12059 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12060 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12061 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12062 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12063 return false;
12064 }
12065 }
12066 return true;
12067}
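// Example usage (hypothetical): checking a v4f32 mask against the "duplicate
// the low 64 bits" pattern:
//   if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1))
//     ...; // Mask is {0, 1, 0, 1} up to undef (-1) entries, or the elements
//          // it references are provably identical (e.g. equal BUILD_VECTOR
//          // operands), so the two shuffles are interchangeable.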
12068
12069/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12070///
12071/// The masks must be exactly the same width.
12072///
12073/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12074/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12075///
12076/// SM_SentinelZero is accepted as a valid negative index but must match in
12077/// both, or via a known bits test.
12078static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12079 ArrayRef<int> ExpectedMask,
12080 const SelectionDAG &DAG,
12081 SDValue V1 = SDValue(),
12082 SDValue V2 = SDValue()) {
12083 int Size = Mask.size();
12084 if (Size != (int)ExpectedMask.size())
12085 return false;
12086 assert(llvm::all_of(ExpectedMask,
12087 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12088 "Illegal target shuffle mask");
12089
12090 // Check for out-of-range target shuffle mask indices.
12091 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12092 return false;
12093
12094 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12095 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12096 V1 = SDValue();
12097 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12098 V2 = SDValue();
12099
12100 APInt ZeroV1 = APInt::getZero(Size);
12101 APInt ZeroV2 = APInt::getZero(Size);
12102
12103 for (int i = 0; i < Size; ++i) {
12104 int MaskIdx = Mask[i];
12105 int ExpectedIdx = ExpectedMask[i];
12106 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12107 continue;
12108 if (MaskIdx == SM_SentinelZero) {
12109 // If we need this expected index to be a zero element, then update the
12110 // relevant zero mask and perform the known bits at the end to minimize
12111 // repeated computes.
12112 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12113 if (ExpectedV &&
12114 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12115 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12116 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12117 ZeroMask.setBit(BitIdx);
12118 continue;
12119 }
12120 }
12121 if (MaskIdx >= 0) {
12122 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12123 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12124 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12125 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12126 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12127 continue;
12128 }
12129 return false;
12130 }
12131 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12132 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12133}
12134
12135// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12136// instructions.
12137static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12138 const SelectionDAG &DAG) {
12139 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12140 return false;
12141
12142 SmallVector<int, 8> Unpcklwd;
12143 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12144 /* Unary = */ false);
12145 SmallVector<int, 8> Unpckhwd;
12146 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12147 /* Unary = */ false);
12148 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12149 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12150 return IsUnpackwdMask;
12151}
12152
12153static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12154 const SelectionDAG &DAG) {
12155 // Create 128-bit vector type based on mask size.
12156 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12157 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12158
12159 // We can't assume a canonical shuffle mask, so try the commuted version too.
12160 SmallVector<int, 4> CommutedMask(Mask);
12161 ShuffleVectorSDNode::commuteMask(CommutedMask);
12162
12163 // Match any of unary/binary or low/high.
12164 for (unsigned i = 0; i != 4; ++i) {
12165 SmallVector<int, 16> UnpackMask;
12166 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12167 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12168 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12169 return true;
12170 }
12171 return false;
12172}
12173
12174/// Return true if a shuffle mask chooses elements identically in its top and
12175/// bottom halves. For example, any splat mask has the same top and bottom
12176/// halves. If an element is undefined in only one half of the mask, the halves
12177/// are not considered identical.
12178static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12179 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12180 unsigned HalfSize = Mask.size() / 2;
12181 for (unsigned i = 0; i != HalfSize; ++i) {
12182 if (Mask[i] != Mask[i + HalfSize])
12183 return false;
12184 }
12185 return true;
12186}
12187
12188/// Get a 4-lane 8-bit shuffle immediate for a mask.
12189///
12190/// This helper function produces an 8-bit shuffle immediate corresponding to
12191/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12192/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12193/// example.
12194///
12195/// NB: We rely heavily on "undef" masks preserving the input lane.
12196static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12197 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12198 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12199 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12200 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12201 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12202
12203 // If the mask only uses one non-undef element, then fully 'splat' it to
12204 // improve later broadcast matching.
12205 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12206 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12207
12208 int FirstElt = Mask[FirstIndex];
12209 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12210 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12211
12212 unsigned Imm = 0;
12213 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12214 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12215 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12216 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12217 return Imm;
12218}
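// Worked example (mask values chosen for illustration): Mask = {2, 1, 3, 0}
// packs two bits per lane, lowest lane first:
//   Imm = 2 | (1 << 2) | (3 << 4) | (0 << 6) == 0x36
// while a mostly-undef mask such as {-1, 2, -1, -1} is widened to the full
// splat immediate (2 << 6) | (2 << 4) | (2 << 2) | 2 == 0xAA to help later
// broadcast matching.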
12219
12220static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12221 SelectionDAG &DAG) {
12222 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12223}
12224
12225 // The shuffle result has the form:
12226 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements are in ascending order.
12227 // Each element of Zeroable corresponds to a particular element of Mask,
12228 // as described in the computeZeroableShuffleElements function.
12229 //
12230 // The function looks for a sub-mask whose nonzero elements are in
12231 // increasing order. If such a sub-mask exists, the function returns true.
12232static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12233 ArrayRef<int> Mask, const EVT &VectorType,
12234 bool &IsZeroSideLeft) {
12235 int NextElement = -1;
12236 // Check if the Mask's nonzero elements are in increasing order.
12237 for (int i = 0, e = Mask.size(); i < e; i++) {
12238 // Checks if the mask's zeros elements are built from only zeros.
12239 assert(Mask[i] >= -1 && "Out of bound mask element!");
12240 if (Mask[i] < 0)
12241 return false;
12242 if (Zeroable[i])
12243 continue;
12244 // Find the lowest non zero element
12245 if (NextElement < 0) {
12246 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12247 IsZeroSideLeft = NextElement != 0;
12248 }
12249 // Exit if the mask's non zero elements are not in increasing order.
12250 if (NextElement != Mask[i])
12251 return false;
12252 NextElement++;
12253 }
12254 return true;
12255}
12256
12257/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12258static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12259 ArrayRef<int> Mask, SDValue V1,
12260 SDValue V2, const APInt &Zeroable,
12261 const X86Subtarget &Subtarget,
12262 SelectionDAG &DAG) {
12263 int Size = Mask.size();
12264 int LaneSize = 128 / VT.getScalarSizeInBits();
12265 const int NumBytes = VT.getSizeInBits() / 8;
12266 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12267
12268 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12269 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12270 (Subtarget.hasBWI() && VT.is512BitVector()));
12271
12272 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12273 // Sign bit set in i8 mask means zero element.
12274 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12275
12276 SDValue V;
12277 for (int i = 0; i < NumBytes; ++i) {
12278 int M = Mask[i / NumEltBytes];
12279 if (M < 0) {
12280 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12281 continue;
12282 }
12283 if (Zeroable[i / NumEltBytes]) {
12284 PSHUFBMask[i] = ZeroMask;
12285 continue;
12286 }
12287
12288 // We can only use a single input of V1 or V2.
12289 SDValue SrcV = (M >= Size ? V2 : V1);
12290 if (V && V != SrcV)
12291 return SDValue();
12292 V = SrcV;
12293 M %= Size;
12294
12295 // PSHUFB can't cross lanes, ensure this doesn't happen.
12296 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12297 return SDValue();
12298
12299 M = M % LaneSize;
12300 M = M * NumEltBytes + (i % NumEltBytes);
12301 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12302 }
12303 assert(V && "Failed to find a source input");
12304
12305 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12306 return DAG.getBitcast(
12307 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12308 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12309}
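// Illustrative example (assumed inputs): a single-source v8i16 shuffle with
// Mask = {4, 5, 6, 7, 0, 1, 2, 3} on SSSE3. NumEltBytes == 2, so each mask
// element expands into the two byte selectors M*2 and M*2+1, producing the
// PSHUFB byte mask {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7}; zeroable elements
// would instead receive the 0x80 "zero" byte and undef elements an undef byte.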
12310
12311static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12312 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12313 const SDLoc &dl);
12314
12315 // X86 has a dedicated shuffle that can be lowered to VEXPAND
12316static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12317 const APInt &Zeroable,
12318 ArrayRef<int> Mask, SDValue &V1,
12319 SDValue &V2, SelectionDAG &DAG,
12320 const X86Subtarget &Subtarget) {
12321 bool IsLeftZeroSide = true;
12322 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12323 IsLeftZeroSide))
12324 return SDValue();
12325 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12326 MVT IntegerType =
12327 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12328 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12329 unsigned NumElts = VT.getVectorNumElements();
12330 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12331 "Unexpected number of vector elements");
12332 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12333 Subtarget, DAG, DL);
12334 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12335 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12336 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12337}
12338
12339static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12340 unsigned &UnpackOpcode, bool IsUnary,
12341 ArrayRef<int> TargetMask, const SDLoc &DL,
12342 SelectionDAG &DAG,
12343 const X86Subtarget &Subtarget) {
12344 int NumElts = VT.getVectorNumElements();
12345
12346 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12347 for (int i = 0; i != NumElts; i += 2) {
12348 int M1 = TargetMask[i + 0];
12349 int M2 = TargetMask[i + 1];
12350 Undef1 &= (SM_SentinelUndef == M1);
12351 Undef2 &= (SM_SentinelUndef == M2);
12352 Zero1 &= isUndefOrZero(M1);
12353 Zero2 &= isUndefOrZero(M2);
12354 }
12355 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12356 "Zeroable shuffle detected");
12357
12358 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12359 SmallVector<int, 64> Unpckl, Unpckh;
12360 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12361 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12362 (IsUnary ? V1 : V2))) {
12363 UnpackOpcode = X86ISD::UNPCKL;
12364 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12365 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12366 return true;
12367 }
12368
12369 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12370 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12371 (IsUnary ? V1 : V2))) {
12372 UnpackOpcode = X86ISD::UNPCKH;
12373 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12374 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12375 return true;
12376 }
12377
12378 // If this is a unary shuffle, attempt to match as an unpack lo/hi with zero.
12379 if (IsUnary && (Zero1 || Zero2)) {
12380 // Don't bother if we can blend instead.
12381 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12382 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12383 return false;
12384
12385 bool MatchLo = true, MatchHi = true;
12386 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12387 int M = TargetMask[i];
12388
12389 // Ignore if the input is known to be zero or the index is undef.
12390 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12391 (M == SM_SentinelUndef))
12392 continue;
12393
12394 MatchLo &= (M == Unpckl[i]);
12395 MatchHi &= (M == Unpckh[i]);
12396 }
12397
12398 if (MatchLo || MatchHi) {
12399 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12400 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12401 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12402 return true;
12403 }
12404 }
12405
12406 // If a binary shuffle, commute and try again.
12407 if (!IsUnary) {
12408 ShuffleVectorSDNode::commuteMask(Unpckl);
12409 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12410 UnpackOpcode = X86ISD::UNPCKL;
12411 std::swap(V1, V2);
12412 return true;
12413 }
12414
12415 ShuffleVectorSDNode::commuteMask(Unpckh);
12416 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12417 UnpackOpcode = X86ISD::UNPCKH;
12418 std::swap(V1, V2);
12419 return true;
12420 }
12421 }
12422
12423 return false;
12424}
12425
12426// X86 has dedicated unpack instructions that can handle specific blend
12427// operations: UNPCKH and UNPCKL.
12428static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12429 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12430 SelectionDAG &DAG) {
12431 SmallVector<int, 8> Unpckl;
12432 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12433 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12434 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12435
12436 SmallVector<int, 8> Unpckh;
12437 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12438 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12439 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12440
12441 // Commute and try again.
12442 ShuffleVectorSDNode::commuteMask(Unpckl);
12443 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12444 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12445
12446 ShuffleVectorSDNode::commuteMask(Unpckh);
12447 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12448 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12449
12450 return SDValue();
12451}
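// Example (hypothetical v4i32 mask): {0, 4, 1, 5} matches the binary unpack-lo
// pattern produced by createUnpackShuffleMask and lowers to UNPCKL(V1, V2);
// the commuted mask {4, 0, 5, 1} is caught by the retry with the commuted
// pattern and lowers to UNPCKL(V2, V1).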
12452
12453/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12454/// followed by unpack 256-bit.
12455static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12456 ArrayRef<int> Mask, SDValue V1,
12457 SDValue V2, SelectionDAG &DAG) {
12458 SmallVector<int, 32> Unpckl, Unpckh;
12459 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12460 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12461
12462 unsigned UnpackOpcode;
12463 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12464 UnpackOpcode = X86ISD::UNPCKL;
12465 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12466 UnpackOpcode = X86ISD::UNPCKH;
12467 else
12468 return SDValue();
12469
12470 // This is a "natural" unpack operation (rather than the 128-bit sectored
12471 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12472 // input in order to use the x86 instruction.
12473 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12474 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12475 V1 = DAG.getBitcast(VT, V1);
12476 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12477}
12478
12479// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12480// source into the lower elements and zeroing the upper elements.
12481static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12482 ArrayRef<int> Mask, const APInt &Zeroable,
12483 const X86Subtarget &Subtarget) {
12484 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12485 return false;
12486
12487 unsigned NumElts = Mask.size();
12488 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12489 unsigned MaxScale = 64 / EltSizeInBits;
12490
12491 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12492 unsigned SrcEltBits = EltSizeInBits * Scale;
12493 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12494 continue;
12495 unsigned NumSrcElts = NumElts / Scale;
12496 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12497 continue;
12498 unsigned UpperElts = NumElts - NumSrcElts;
12499 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12500 continue;
12501 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12502 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12503 DstVT = MVT::getIntegerVT(EltSizeInBits);
12504 if ((NumSrcElts * EltSizeInBits) >= 128) {
12505 // ISD::TRUNCATE
12506 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12507 } else {
12508 // X86ISD::VTRUNC
12509 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12510 }
12511 return true;
12512 }
12513
12514 return false;
12515}
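// Illustrative match (mask and zeroable bits assumed): a v16i8 shuffle (VLX is
// required for this 128-bit type) with Mask = {0, 2, 4, 6, 8, 10, 12, 14, ...}
// whose upper eight elements are all zeroable succeeds at Scale == 2 (BWI is
// needed since SrcEltBits == 16): SrcVT becomes v8i16 and, because the 8
// result elements only cover 64 bits, DstVT is widened to v16i8 for the
// X86ISD::VTRUNC form, i.e. truncate into the low half and zero the rest.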
12516
12517// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12518// element padding to the final DstVT.
12519static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12520 const X86Subtarget &Subtarget,
12521 SelectionDAG &DAG, bool ZeroUppers) {
12522 MVT SrcVT = Src.getSimpleValueType();
12523 MVT DstSVT = DstVT.getScalarType();
12524 unsigned NumDstElts = DstVT.getVectorNumElements();
12525 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12526 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12527
12528 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12529 return SDValue();
12530
12531 // Perform a direct ISD::TRUNCATE if possible.
12532 if (NumSrcElts == NumDstElts)
12533 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12534
12535 if (NumSrcElts > NumDstElts) {
12536 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12537 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12538 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12539 }
12540
12541 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12542 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12543 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12544 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12545 DstVT.getSizeInBits());
12546 }
12547
12548 // Non-VLX targets must truncate from a 512-bit type, so we need to
12549 // widen, truncate and then possibly extract the original subvector.
12550 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12551 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12552 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12553 }
12554
12555 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12556 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12557 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12558 if (DstVT != TruncVT)
12559 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12560 DstVT.getSizeInBits());
12561 return Trunc;
12562}
12563
12564// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12565//
12566// An example is the following:
12567//
12568// t0: ch = EntryToken
12569// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12570// t25: v4i32 = truncate t2
12571// t41: v8i16 = bitcast t25
12572// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12573// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12574// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12575// t18: v2i64 = bitcast t51
12576//
12577 // A single vpmovdw instruction suffices; without avx512vl we need to use the
12578 // zmm variant and extract the lower subvector, padding with zeroes.
12579// TODO: Merge with lowerShuffleAsVTRUNC.
12580static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12581 SDValue V2, ArrayRef<int> Mask,
12582 const APInt &Zeroable,
12583 const X86Subtarget &Subtarget,
12584 SelectionDAG &DAG) {
12585 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12586 if (!Subtarget.hasAVX512())
12587 return SDValue();
12588
12589 unsigned NumElts = VT.getVectorNumElements();
12590 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12591 unsigned MaxScale = 64 / EltSizeInBits;
12592 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12593 unsigned SrcEltBits = EltSizeInBits * Scale;
12594 unsigned NumSrcElts = NumElts / Scale;
12595 unsigned UpperElts = NumElts - NumSrcElts;
12596 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12597 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12598 continue;
12599
12600 // Attempt to find a matching source truncation, but as a fallback VLX
12601 // targets can use the VPMOV directly.
12602 SDValue Src = peekThroughBitcasts(V1);
12603 if (Src.getOpcode() == ISD::TRUNCATE &&
12604 Src.getScalarValueSizeInBits() == SrcEltBits) {
12605 Src = Src.getOperand(0);
12606 } else if (Subtarget.hasVLX()) {
12607 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12608 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12609 Src = DAG.getBitcast(SrcVT, Src);
12610 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12611 if (Scale == 2 &&
12612 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12613 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12614 return SDValue();
12615 } else
12616 return SDValue();
12617
12618 // VPMOVWB is only available with avx512bw.
12619 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12620 return SDValue();
12621
12622 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12623 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12624 }
12625
12626 return SDValue();
12627}
12628
12629// Attempt to match binary shuffle patterns as a truncate.
12630static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12631 SDValue V2, ArrayRef<int> Mask,
12632 const APInt &Zeroable,
12633 const X86Subtarget &Subtarget,
12634 SelectionDAG &DAG) {
12635 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12636 "Unexpected VTRUNC type");
12637 if (!Subtarget.hasAVX512())
12638 return SDValue();
12639
12640 unsigned NumElts = VT.getVectorNumElements();
12641 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12642 unsigned MaxScale = 64 / EltSizeInBits;
12643 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12644 // TODO: Support non-BWI VPMOVWB truncations?
12645 unsigned SrcEltBits = EltSizeInBits * Scale;
12646 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12647 continue;
12648
12649 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12650 // Bail if the V2 elements are undef.
12651 unsigned NumHalfSrcElts = NumElts / Scale;
12652 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12653 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12654 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12655 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12656 continue;
12657
12658 // The elements beyond the truncation must be undef/zero.
12659 unsigned UpperElts = NumElts - NumSrcElts;
12660 if (UpperElts > 0 &&
12661 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12662 continue;
12663 bool UndefUppers =
12664 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12665
12666 // For offset truncations, ensure that the concat is cheap.
12667 if (Offset) {
12668 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12669 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12670 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12671 return Lo.getOperand(0) == Hi.getOperand(0);
12672 if (ISD::isNormalLoad(Lo.getNode()) &&
12673 ISD::isNormalLoad(Hi.getNode())) {
12674 auto *LDLo = cast<LoadSDNode>(Lo);
12675 auto *LDHi = cast<LoadSDNode>(Hi);
12676 return DAG.areNonVolatileConsecutiveLoads(
12677 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12678 }
12679 return false;
12680 };
12681 if (!IsCheapConcat(V1, V2))
12682 continue;
12683 }
12684
12685 // As we're using both sources, we need to concat them together and
12686 // truncate from the double-sized src.
12687 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12688 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12689
12690 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12691 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12692 Src = DAG.getBitcast(SrcVT, Src);
12693
12694 // Shift the offset'd elements into place for the truncation.
12695 // TODO: Use getTargetVShiftByConstNode.
12696 if (Offset)
12697 Src = DAG.getNode(
12698 X86ISD::VSRLI, DL, SrcVT, Src,
12699 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12700
12701 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12702 }
12703 }
12704
12705 return SDValue();
12706}
12707
12708/// Check whether a compaction lowering can be done by dropping even/odd
12709/// elements and compute how many times even/odd elements must be dropped.
12710///
12711/// This handles shuffles which take every Nth element where N is a power of
12712/// two. Example shuffle masks:
12713///
12714/// (even)
12715/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12716/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12717/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12718/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12719/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12720/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12721///
12722/// (odd)
12723/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12724/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12725///
12726/// Any of these lanes can of course be undef.
12727///
12728/// This routine only supports N <= 3.
12729/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12730/// for larger N.
12731///
12732/// \returns N above, or the number of times even/odd elements must be dropped
12733/// if there is such a number. Otherwise returns zero.
12734static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12735 bool IsSingleInput) {
12736 // The modulus for the shuffle vector entries is based on whether this is
12737 // a single input or not.
12738 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12739 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12740 "We should only be called with masks with a power-of-2 size!");
12741
12742 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12743 int Offset = MatchEven ? 0 : 1;
12744
12745 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12746 // and 2^3 simultaneously. This is because we may have ambiguity with
12747 // partially undef inputs.
12748 bool ViableForN[3] = {true, true, true};
12749
12750 for (int i = 0, e = Mask.size(); i < e; ++i) {
12751 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12752 // want.
12753 if (Mask[i] < 0)
12754 continue;
12755
12756 bool IsAnyViable = false;
12757 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12758 if (ViableForN[j]) {
12759 uint64_t N = j + 1;
12760
12761 // The shuffle mask must be equal to (i * 2^N) % M.
12762 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12763 IsAnyViable = true;
12764 else
12765 ViableForN[j] = false;
12766 }
12767 // Early exit if we exhaust the possible powers of two.
12768 if (!IsAnyViable)
12769 break;
12770 }
12771
12772 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12773 if (ViableForN[j])
12774 return j + 1;
12775
12776 // Return 0 as there is no viable power of two.
12777 return 0;
12778}
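For illustration, a minimal standalone sketch of the same viability test (the helper name isDropCompactionMask is invented and is not part of this file; it also drops the simultaneous tracking of all N that the code above keeps for partially-undef masks). It returns the smallest N in [1,3] for which every defined mask entry i equals ((i << N) + Offset) modulo the shuffle modulus, or 0 if none fits.

#include <cstdint>
#include <vector>

// Sketch only: mirrors the check above without the simultaneous-N tracking.
static int isDropCompactionMask(const std::vector<int> &Mask, bool MatchEven,
                                bool IsSingleInput) {
  uint64_t Modulus = Mask.size() * (IsSingleInput ? 1 : 2);
  uint64_t ModMask = Modulus - 1; // Mask size is assumed to be a power of 2.
  int Offset = MatchEven ? 0 : 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0, e = (int)Mask.size(); i != e && Viable; ++i)
      if (Mask[i] >= 0)
        Viable = (uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask);
    if (Viable)
      return N;
  }
  return 0;
}
// e.g. {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30} with MatchEven=true and
// IsSingleInput=false yields N = 1.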
12779
12780// X86 has dedicated pack instructions that can handle specific truncation
12781// operations: PACKSS and PACKUS.
12782// Checks for compaction shuffle masks if MaxStages > 1.
12783// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12784static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12785 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12786 const SelectionDAG &DAG,
12787 const X86Subtarget &Subtarget,
12788 unsigned MaxStages = 1) {
12789 unsigned NumElts = VT.getVectorNumElements();
12790 unsigned BitSize = VT.getScalarSizeInBits();
12791 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12792 "Illegal maximum compaction");
12793
12794 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12795 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12796 unsigned NumPackedBits = NumSrcBits - BitSize;
12797 N1 = peekThroughBitcasts(N1);
12798 N2 = peekThroughBitcasts(N2);
12799 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12800 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12801 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12802 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12803 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12804 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12805 return false;
12806 if (Subtarget.hasSSE41() || BitSize == 8) {
12807 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12808 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12809 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12810 V1 = N1;
12811 V2 = N2;
12812 SrcVT = PackVT;
12813 PackOpcode = X86ISD::PACKUS;
12814 return true;
12815 }
12816 }
12817 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12818 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12819 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12820 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12821 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12822 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12823 V1 = N1;
12824 V2 = N2;
12825 SrcVT = PackVT;
12826 PackOpcode = X86ISD::PACKSS;
12827 return true;
12828 }
12829 return false;
12830 };
12831
12832 // Attempt to match against wider and wider compaction patterns.
12833 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12834 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12835 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12836
12837 // Try binary shuffle.
12838 SmallVector<int, 32> BinaryMask;
12839 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12840 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12841 if (MatchPACK(V1, V2, PackVT))
12842 return true;
12843
12844 // Try unary shuffle.
12845 SmallVector<int, 32> UnaryMask;
12846 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12847 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12848 if (MatchPACK(V1, V1, PackVT))
12849 return true;
12850 }
12851
12852 return false;
12853}
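As a worked illustration (the helper name makeBinaryPackMask is invented; the in-tree createPackShuffleMask lives elsewhere in this file, and the element order below assumes the usual PACKSSDW/PACKUSDW semantics), the single-stage binary pack mask compared against above for a v8i16 result keeps the low word of every dword of the two inputs:

#include <vector>

// Sketch only: one-stage binary pack mask, with V2 elements numbered from
// NumElts upwards, as in the target shuffle masks above.
static std::vector<int> makeBinaryPackMask(int NumElts) {
  std::vector<int> Mask(NumElts);
  int Half = NumElts / 2;
  for (int j = 0; j != NumElts; ++j)
    Mask[j] = (j < Half) ? (2 * j)                     // low words of V1
                         : (NumElts + 2 * (j - Half)); // low words of V2
  return Mask;
}
// makeBinaryPackMask(8) == {0, 2, 4, 6, 8, 10, 12, 14}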
12854
12855static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12856 SDValue V1, SDValue V2, SelectionDAG &DAG,
12857 const X86Subtarget &Subtarget) {
12858 MVT PackVT;
12859 unsigned PackOpcode;
12860 unsigned SizeBits = VT.getSizeInBits();
12861 unsigned EltBits = VT.getScalarSizeInBits();
12862 unsigned MaxStages = Log2_32(64 / EltBits);
12863 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12864 Subtarget, MaxStages))
12865 return SDValue();
12866
12867 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12868 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12869
12870 // Don't lower multi-stage packs on AVX512, truncation is better.
12871 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12872 return SDValue();
12873
12874 // Pack to the largest type possible:
12875 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12876 unsigned MaxPackBits = 16;
12877 if (CurrentEltBits > 16 &&
12878 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12879 MaxPackBits = 32;
12880
12881 // Repeatedly pack down to the target size.
12882 SDValue Res;
12883 for (unsigned i = 0; i != NumStages; ++i) {
12884 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12885 unsigned NumSrcElts = SizeBits / SrcEltBits;
12886 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12887 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12888 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12889 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12890 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12891 DAG.getBitcast(SrcVT, V2));
12892 V1 = V2 = Res;
12893 CurrentEltBits /= 2;
12894 }
12895 assert(Res && Res.getValueType() == VT &&
12896 "Failed to lower compaction shuffle");
12897 return Res;
12898}
12899
12900/// Try to emit a bitmask instruction for a shuffle.
12901///
12902/// This handles cases where we can model a blend exactly as a bitmask due to
12903/// one of the inputs being zeroable.
12904static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12905 SDValue V2, ArrayRef<int> Mask,
12906 const APInt &Zeroable,
12907 const X86Subtarget &Subtarget,
12908 SelectionDAG &DAG) {
12909 MVT MaskVT = VT;
12910 MVT EltVT = VT.getVectorElementType();
12911 SDValue Zero, AllOnes;
12912 // Use f64 if i64 isn't legal.
12913 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12914 EltVT = MVT::f64;
12915 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12916 }
12917
12918 MVT LogicVT = VT;
12919 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12920 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12921 APFloat AllOnesValue =
12922 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12923 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12924 LogicVT =
12925 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12926 } else {
12927 Zero = DAG.getConstant(0, DL, EltVT);
12928 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12929 }
12930
12931 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12932 SDValue V;
12933 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12934 if (Zeroable[i])
12935 continue;
12936 if (Mask[i] % Size != i)
12937 return SDValue(); // Not a blend.
12938 if (!V)
12939 V = Mask[i] < Size ? V1 : V2;
12940 else if (V != (Mask[i] < Size ? V1 : V2))
12941 return SDValue(); // Can only let one input through the mask.
12942
12943 VMaskOps[i] = AllOnes;
12944 }
12945 if (!V)
12946 return SDValue(); // No non-zeroable elements!
12947
12948 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12949 VMask = DAG.getBitcast(LogicVT, VMask);
12950 V = DAG.getBitcast(LogicVT, V);
12951 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12952 return DAG.getBitcast(VT, And);
12953}
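A hedged scalar sketch of the net effect (not part of the original source): once the mask proves that every non-zeroable lane is an in-place pick from a single input V, the shuffle reduces to ANDing V with a constant that is all-ones where an element survives and all-zeros where it is zeroable.

#include <array>
#include <cstdint>

// Sketch only: v4i32 case with one surviving input V.
static std::array<uint32_t, 4> bitmaskBlend(const std::array<uint32_t, 4> &V,
                                            const std::array<bool, 4> &Zeroable) {
  std::array<uint32_t, 4> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = V[i] & (Zeroable[i] ? 0u : ~0u); // AND with the built mask vector.
  return R;
}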
12954
12955/// Try to emit a blend instruction for a shuffle using bit math.
12956///
12957/// This is used as a fallback approach when first class blend instructions are
12958/// unavailable. Currently it is only suitable for integer vectors, but could
12959/// be generalized for floating point vectors if desirable.
12960static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12961 SDValue V2, ArrayRef<int> Mask,
12962 SelectionDAG &DAG) {
12963 assert(VT.isInteger() && "Only supports integer vector types!");
12964 MVT EltVT = VT.getVectorElementType();
12965 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12966 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 SmallVector<SDValue, 16> MaskOps;
12968 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12969 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12970 return SDValue(); // Shuffled input!
12971 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12972 }
12973
12974 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12975 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12976 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12977 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12978}
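The per-element arithmetic behind the AND/ANDNP/OR sequence above, as a small scalar sketch (illustrative only):

#include <cstdint>

// Sketch only: picks V1 where V1Mask is all-ones and V2 where it is all-zeros.
static uint32_t bitBlend(uint32_t V1, uint32_t V2, uint32_t V1Mask) {
  return (V1 & V1Mask) | (~V1Mask & V2); // AND, ANDNP, OR
}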
12979
12980static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12981 SDValue PreservedSrc,
12982 const X86Subtarget &Subtarget,
12983 SelectionDAG &DAG);
12984
12985static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
12986 MutableArrayRef<int> Mask,
12987 const APInt &Zeroable, bool &ForceV1Zero,
12988 bool &ForceV2Zero, uint64_t &BlendMask) {
12989 bool V1IsZeroOrUndef =
12990 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12991 bool V2IsZeroOrUndef =
12992 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12993
12994 BlendMask = 0;
12995 ForceV1Zero = false, ForceV2Zero = false;
12996 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12997
12998 int NumElts = Mask.size();
12999 int NumLanes = VT.getSizeInBits() / 128;
13000 int NumEltsPerLane = NumElts / NumLanes;
13001 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13002
13003 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13004 // then ensure the blend mask part for that lane just references that input.
13005 bool ForceWholeLaneMasks =
13006 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13007
13008 // Attempt to generate the binary blend mask. If an input is zero then
13009 // we can use any lane.
13010 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13011 // Keep track of the inputs used per lane.
13012 bool LaneV1InUse = false;
13013 bool LaneV2InUse = false;
13014 uint64_t LaneBlendMask = 0;
13015 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13016 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13017 int M = Mask[Elt];
13018 if (M == SM_SentinelUndef)
13019 continue;
13020 if (M == Elt || (0 <= M && M < NumElts &&
13021 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13022 Mask[Elt] = Elt;
13023 LaneV1InUse = true;
13024 continue;
13025 }
13026 if (M == (Elt + NumElts) ||
13027 (NumElts <= M &&
13028 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13029 LaneBlendMask |= 1ull << LaneElt;
13030 Mask[Elt] = Elt + NumElts;
13031 LaneV2InUse = true;
13032 continue;
13033 }
13034 if (Zeroable[Elt]) {
13035 if (V1IsZeroOrUndef) {
13036 ForceV1Zero = true;
13037 Mask[Elt] = Elt;
13038 LaneV1InUse = true;
13039 continue;
13040 }
13041 if (V2IsZeroOrUndef) {
13042 ForceV2Zero = true;
13043 LaneBlendMask |= 1ull << LaneElt;
13044 Mask[Elt] = Elt + NumElts;
13045 LaneV2InUse = true;
13046 continue;
13047 }
13048 }
13049 return false;
13050 }
13051
13052 // If we only used V2 then splat the lane blend mask to avoid any demanded
13053 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13054 // blend mask bit).
13055 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13056 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13057
13058 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13059 }
13060 return true;
13061}
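A simplified standalone sketch of the blend-mask construction (the helper name simpleBlendMask is invented; it omits the zeroable handling, element-equivalence checks, and per-lane mask forcing that the code above performs): bit i of the result is set exactly when element i is taken in place from V2.

#include <cstdint>
#include <optional>
#include <vector>

// Sketch only.
static std::optional<uint64_t> simpleBlendMask(const std::vector<int> &Mask) {
  uint64_t BlendMask = 0;
  int NumElts = (int)Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;                 // undef, or taken in place from V1.
    if (M == i + NumElts)
      BlendMask |= 1ull << i;   // taken in place from V2.
    else
      return std::nullopt;      // not a blend.
  }
  return BlendMask;
}
// simpleBlendMask({0, 5, 2, 7}) == 0b1010 for a v4i32 shuffle.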
13062
13063static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13064 int Scale) {
13065 uint64_t ScaledMask = 0;
13066 for (int i = 0; i != Size; ++i)
13067 if (BlendMask & (1ull << i))
13068 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13069 return ScaledMask;
13070}
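For example, widening a per-i32 blend mask to a per-i16 blend mask doubles every bit run; a standalone copy of the helper above (illustrative only) makes the arithmetic concrete:

#include <cstdint>

// Sketch only: identical logic to scaleVectorShuffleBlendMask above.
static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
// scaleBlendMask(0b0110, /*Size=*/4, /*Scale=*/2) == 0b00111100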
13071
13072/// Try to emit a blend instruction for a shuffle.
13073///
13074/// This doesn't do any checks for the availability of instructions for blending
13075/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13076/// be matched in the backend with the type given. What it does check for is
13077/// that the shuffle mask is a blend, or convertible into a blend with zero.
13078static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13079 SDValue V2, ArrayRef<int> Original,
13080 const APInt &Zeroable,
13081 const X86Subtarget &Subtarget,
13082 SelectionDAG &DAG) {
13083 uint64_t BlendMask = 0;
13084 bool ForceV1Zero = false, ForceV2Zero = false;
13085 SmallVector<int, 64> Mask(Original);
13086 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13087 BlendMask))
13088 return SDValue();
13089
13090 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13091 if (ForceV1Zero)
13092 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13093 if (ForceV2Zero)
13094 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13095
13096 unsigned NumElts = VT.getVectorNumElements();
13097
13098 switch (VT.SimpleTy) {
13099 case MVT::v4i64:
13100 case MVT::v8i32:
13101 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13102 [[fallthrough]];
13103 case MVT::v4f64:
13104 case MVT::v8f32:
13105 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13106 [[fallthrough]];
13107 case MVT::v2f64:
13108 case MVT::v2i64:
13109 case MVT::v4f32:
13110 case MVT::v4i32:
13111 case MVT::v8i16:
13112 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13113 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13114 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13115 case MVT::v16i16: {
13116 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13117 SmallVector<int, 8> RepeatedMask;
13118 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13119 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13120 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13121 BlendMask = 0;
13122 for (int i = 0; i < 8; ++i)
13123 if (RepeatedMask[i] >= 8)
13124 BlendMask |= 1ull << i;
13125 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13126 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13127 }
13128 // Use PBLENDW for lower/upper lanes and then blend lanes.
13129 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13130 // merge to VSELECT where useful.
13131 uint64_t LoMask = BlendMask & 0xFF;
13132 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13133 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13134 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13135 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13136 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13137 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13138 return DAG.getVectorShuffle(
13139 MVT::v16i16, DL, Lo, Hi,
13140 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13141 }
13142 [[fallthrough]];
13143 }
13144 case MVT::v32i8:
13145 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13146 [[fallthrough]];
13147 case MVT::v16i8: {
13148 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13149
13150 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13151 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13152 Subtarget, DAG))
13153 return Masked;
13154
13155 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13156 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13157 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13158 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13159 }
13160
13161 // If we have VPTERNLOG, we can use that as a bit blend.
13162 if (Subtarget.hasVLX())
13163 if (SDValue BitBlend =
13164 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13165 return BitBlend;
13166
13167 // Scale the blend by the number of bytes per element.
13168 int Scale = VT.getScalarSizeInBits() / 8;
13169
13170 // This form of blend is always done on bytes. Compute the byte vector
13171 // type.
13172 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13173
13174 // x86 allows load folding with blendvb from the 2nd source operand. But
13175 // we are still using LLVM select here (see comment below), so that's V1.
13176 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13177 // allow that load-folding possibility.
13178 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13179 ShuffleVectorSDNode::commuteMask(Mask);
13180 std::swap(V1, V2);
13181 }
13182
13183 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13184 // mix of LLVM's code generator and the x86 backend. We tell the code
13185 // generator that boolean values in the elements of an x86 vector register
13186 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13187 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13188 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13189 // of the element (the remaining are ignored) and 0 in that high bit would
13190 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13191 // the LLVM model for boolean values in vector elements gets the relevant
13192 // bit set, it is set backwards and over constrained relative to x86's
13193 // actual model.
13194 SmallVector<SDValue, 32> VSELECTMask;
13195 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13196 for (int j = 0; j < Scale; ++j)
13197 VSELECTMask.push_back(
13198 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13199 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13200 MVT::i8));
13201
13202 V1 = DAG.getBitcast(BlendVT, V1);
13203 V2 = DAG.getBitcast(BlendVT, V2);
13204 return DAG.getBitcast(
13205 VT,
13206 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13207 V1, V2));
13208 }
13209 case MVT::v16f32:
13210 case MVT::v8f64:
13211 case MVT::v8i64:
13212 case MVT::v16i32:
13213 case MVT::v32i16:
13214 case MVT::v64i8: {
13215 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13216 bool OptForSize = DAG.shouldOptForSize();
13217 if (!OptForSize) {
13218 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13219 Subtarget, DAG))
13220 return Masked;
13221 }
13222
13223 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13224 // masked move.
13225 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13226 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13227 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13228 }
13229 default:
13230 llvm_unreachable("Not a supported integer vector type!");
13231 }
13232}
13233
13234/// Try to lower as a blend of elements from two inputs followed by
13235/// a single-input permutation.
13236///
13237/// This matches the pattern where we can blend elements from two inputs and
13238/// then reduce the shuffle to a single-input permutation.
13239static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13240 SDValue V1, SDValue V2,
13241 ArrayRef<int> Mask,
13242 SelectionDAG &DAG,
13243 bool ImmBlends = false) {
13244 // We build up the blend mask while checking whether a blend is a viable way
13245 // to reduce the shuffle.
13246 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13247 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13248
13249 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13250 if (Mask[i] < 0)
13251 continue;
13252
13253 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13254
13255 if (BlendMask[Mask[i] % Size] < 0)
13256 BlendMask[Mask[i] % Size] = Mask[i];
13257 else if (BlendMask[Mask[i] % Size] != Mask[i])
13258 return SDValue(); // Can't blend in the needed input!
13259
13260 PermuteMask[i] = Mask[i] % Size;
13261 }
13262
13263 // If only immediate blends, then bail if the blend mask can't be widened to
13264 // i16.
13265 unsigned EltSize = VT.getScalarSizeInBits();
13266 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13267 return SDValue();
13268
13269 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13270 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13271}
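A worked example of the decomposition above (the helper name splitBlendPermute is invented for illustration): the v4 mask {6, 1, 4, 3} becomes a blend with mask {4, 1, 6, 3} followed by the single-input permute {2, 1, 0, 3}.

#include <optional>
#include <utility>
#include <vector>

// Sketch only: mirrors the blend/permute mask construction above.
static std::optional<std::pair<std::vector<int>, std::vector<int>>>
splitBlendPermute(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  std::vector<int> Blend(Size, -1), Permute(Size, -1);
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (Blend[Slot] >= 0 && Blend[Slot] != Mask[i])
      return std::nullopt;      // conflicting demands on one blend slot.
    Blend[Slot] = Mask[i];
    Permute[i] = Slot;
  }
  return std::make_pair(Blend, Permute);
}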
13272
13273/// Try to lower as an unpack of elements from two inputs followed by
13274/// a single-input permutation.
13275///
13276/// This matches the pattern where we can unpack elements from two inputs and
13277/// then reduce the shuffle to a single-input (wider) permutation.
13278static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13279 SDValue V1, SDValue V2,
13280 ArrayRef<int> Mask,
13281 SelectionDAG &DAG) {
13282 int NumElts = Mask.size();
13283 int NumLanes = VT.getSizeInBits() / 128;
13284 int NumLaneElts = NumElts / NumLanes;
13285 int NumHalfLaneElts = NumLaneElts / 2;
13286
13287 bool MatchLo = true, MatchHi = true;
13288 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13289
13290 // Determine UNPCKL/UNPCKH type and operand order.
13291 for (int Elt = 0; Elt != NumElts; ++Elt) {
13292 int M = Mask[Elt];
13293 if (M < 0)
13294 continue;
13295
13296 // Normalize the mask value depending on whether it's V1 or V2.
13297 int NormM = M;
13298 SDValue &Op = Ops[Elt & 1];
13299 if (M < NumElts && (Op.isUndef() || Op == V1))
13300 Op = V1;
13301 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
13302 Op = V2;
13303 NormM -= NumElts;
13304 } else
13305 return SDValue();
13306
13307 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
13308 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13309 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13310 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
13311 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
13312 if (MatchLoAnyLane || MatchHiAnyLane) {
13313 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
13314 "Failed to match UNPCKLO/UNPCKHI");
13315 break;
13316 }
13317 }
13318 MatchLo &= MatchLoAnyLane;
13319 MatchHi &= MatchHiAnyLane;
13320 if (!MatchLo && !MatchHi)
13321 return SDValue();
13322 }
13323 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13324
13325 // Element indices have changed after unpacking. Calculate permute mask
13326 // so that they will be put back to the position as dictated by the
13327 // original shuffle mask indices.
13328 SmallVector<int, 32> PermuteMask(NumElts, -1);
13329 for (int Elt = 0; Elt != NumElts; ++Elt) {
13330 int M = Mask[Elt];
13331 if (NumElts <= M)
13332 PermuteMask[Elt] = NumLaneElts * ((M - NumElts) / NumLaneElts) +
13333 (2 * (M % NumHalfLaneElts)) + 1;
13334 else if (0 <= M)
13335 PermuteMask[Elt] =
13336 NumLaneElts * (M / NumLaneElts) + (2 * (M % NumHalfLaneElts));
13337 }
13338
13339 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13340 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13341 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13342}
13343
13344/// Try to lower a shuffle as a permute of the inputs followed by an
13345/// UNPCK instruction.
13346///
13347/// This specifically targets cases where we end up with alternating between
13348/// the two inputs, and so can permute them into something that feeds a single
13349/// UNPCK instruction. Note that this routine only targets integer vectors
13350/// because for floating point vectors we have a generalized SHUFPS lowering
13351/// strategy that handles everything that doesn't *exactly* match an unpack,
13352/// making this clever lowering unnecessary.
13353static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13354 SDValue V1, SDValue V2,
13355 ArrayRef<int> Mask,
13356 const X86Subtarget &Subtarget,
13357 SelectionDAG &DAG) {
13358 int Size = Mask.size();
13359 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13360
13361 // This routine only supports 128-bit integer dual input vectors.
13362 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13363 return SDValue();
13364
13365 int NumLoInputs =
13366 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13367 int NumHiInputs =
13368 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13369
13370 bool UnpackLo = NumLoInputs >= NumHiInputs;
13371
13372 auto TryUnpack = [&](int ScalarSize, int Scale) {
13373 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13374 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13375
13376 for (int i = 0; i < Size; ++i) {
13377 if (Mask[i] < 0)
13378 continue;
13379
13380 // Each element of the unpack contains Scale elements from this mask.
13381 int UnpackIdx = i / Scale;
13382
13383 // We only handle the case where V1 feeds the first slots of the unpack.
13384 // We rely on canonicalization to ensure this is the case.
13385 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13386 return SDValue();
13387
13388 // Setup the mask for this input. The indexing is tricky as we have to
13389 // handle the unpack stride.
13390 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13391 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13392 Mask[i] % Size;
13393 }
13394
13395 // If we will have to shuffle both inputs to use the unpack, check whether
13396 // we can just unpack first and shuffle the result. If so, skip this unpack.
13397 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13398 !isNoopShuffleMask(V2Mask))
13399 return SDValue();
13400
13401 // Shuffle the inputs into place.
13402 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13403 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13404
13405 // Cast the inputs to the type we will use to unpack them.
13406 MVT UnpackVT =
13407 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13408 V1 = DAG.getBitcast(UnpackVT, V1);
13409 V2 = DAG.getBitcast(UnpackVT, V2);
13410
13411 // Unpack the inputs and cast the result back to the desired type.
13412 return DAG.getBitcast(
13413 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13414 UnpackVT, V1, V2));
13415 };
13416
13417 // We try each unpack from the largest to the smallest to try and find one
13418 // that fits this mask.
13419 int OrigScalarSize = VT.getScalarSizeInBits();
13420 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13421 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13422 return Unpack;
13423
13424 // If we're shuffling with a zero vector then we're better off not doing
13425 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13426 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13427 ISD::isBuildVectorAllZeros(V2.getNode()))
13428 return SDValue();
13429
13430 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13431 // initial unpack.
13432 if (NumLoInputs == 0 || NumHiInputs == 0) {
13433 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13434 "We have to have *some* inputs!");
13435 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13436
13437 // FIXME: We could consider the total complexity of the permute of each
13438 // possible unpacking. Or at the least we should consider how many
13439 // half-crossings are created.
13440 // FIXME: We could consider commuting the unpacks.
13441
13442 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13443 for (int i = 0; i < Size; ++i) {
13444 if (Mask[i] < 0)
13445 continue;
13446
13447 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13448
13449 PermMask[i] =
13450 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13451 }
13452 return DAG.getVectorShuffle(
13453 VT, DL,
13454 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13455 V1, V2),
13456 DAG.getUNDEF(VT), PermMask);
13457 }
13458
13459 return SDValue();
13460}
13461
13462/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13463/// permuting the elements of the result in place.
13464static SDValue lowerShuffleAsByteRotateAndPermute(
13465 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13466 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13467 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13468 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13469 (VT.is512BitVector() && !Subtarget.hasBWI()))
13470 return SDValue();
13471
13472 // We don't currently support lane crossing permutes.
13473 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13474 return SDValue();
13475
13476 int Scale = VT.getScalarSizeInBits() / 8;
13477 int NumLanes = VT.getSizeInBits() / 128;
13478 int NumElts = VT.getVectorNumElements();
13479 int NumEltsPerLane = NumElts / NumLanes;
13480
13481 // Determine range of mask elts.
13482 bool Blend1 = true;
13483 bool Blend2 = true;
13484 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13485 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13486 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13487 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13488 int M = Mask[Lane + Elt];
13489 if (M < 0)
13490 continue;
13491 if (M < NumElts) {
13492 Blend1 &= (M == (Lane + Elt));
13493 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13494 M = M % NumEltsPerLane;
13495 Range1.first = std::min(Range1.first, M);
13496 Range1.second = std::max(Range1.second, M);
13497 } else {
13498 M -= NumElts;
13499 Blend2 &= (M == (Lane + Elt));
13500 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13501 M = M % NumEltsPerLane;
13502 Range2.first = std::min(Range2.first, M);
13503 Range2.second = std::max(Range2.second, M);
13504 }
13505 }
13506 }
13507
13508 // Bail if we don't need both elements.
13509 // TODO - it might be worth doing this for unary shuffles if the permute
13510 // can be widened.
13511 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13512 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13513 return SDValue();
13514
13515 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13516 return SDValue();
13517
13518 // Rotate the 2 ops so we can access both ranges, then permute the result.
13519 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13520 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13521 SDValue Rotate = DAG.getBitcast(
13522 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13523 DAG.getBitcast(ByteVT, Lo),
13524 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13525 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13526 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13527 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13528 int M = Mask[Lane + Elt];
13529 if (M < 0)
13530 continue;
13531 if (M < NumElts)
13532 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13533 else
13534 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13535 }
13536 }
13537 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13538 };
13539
13540 // Check if the ranges are small enough to rotate from either direction.
13541 if (Range2.second < Range1.first)
13542 return RotateAndPermute(V1, V2, Range1.first, 0);
13543 if (Range1.second < Range2.first)
13544 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13545 return SDValue();
13546}
13547
13548static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13549 return isUndefOrEqual(Mask, 0);
13550}
13551
13552static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13553 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13554}
13555
13556/// Check if the Mask consists of the same element repeated multiple times.
13557static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
13558 size_t NumUndefs = 0;
13559 std::optional<int> UniqueElt;
13560 for (int Elt : Mask) {
13561 if (Elt == SM_SentinelUndef) {
13562 NumUndefs++;
13563 continue;
13564 }
13565 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
13566 return false;
13567 UniqueElt = Elt;
13568 }
13569 // Make sure the element is repeated enough times by checking the number of
13570 // undefs is small.
13571 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
13572}
13573
13574/// Generic routine to decompose a shuffle and blend into independent
13575/// blends and permutes.
13576///
13577/// This matches the extremely common pattern for handling combined
13578/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13579/// operations. It will try to pick the best arrangement of shuffles and
13580/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13581static SDValue lowerShuffleAsDecomposedShuffleMerge(
13582 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13583 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13584 int NumElts = Mask.size();
13585 int NumLanes = VT.getSizeInBits() / 128;
13586 int NumEltsPerLane = NumElts / NumLanes;
13587
13588 // Shuffle the input elements into the desired positions in V1 and V2 and
13589 // unpack/blend them together.
13590 bool IsAlternating = true;
13591 SmallVector<int, 32> V1Mask(NumElts, -1);
13592 SmallVector<int, 32> V2Mask(NumElts, -1);
13593 SmallVector<int, 32> FinalMask(NumElts, -1);
13594 for (int i = 0; i < NumElts; ++i) {
13595 int M = Mask[i];
13596 if (M >= 0 && M < NumElts) {
13597 V1Mask[i] = M;
13598 FinalMask[i] = i;
13599 IsAlternating &= (i & 1) == 0;
13600 } else if (M >= NumElts) {
13601 V2Mask[i] = M - NumElts;
13602 FinalMask[i] = i + NumElts;
13603 IsAlternating &= (i & 1) == 1;
13604 }
13605 }
13606
13607 // If we effectively only demand the 0'th element of \p Input (and not just
13608 // at the 0'th position), then broadcast said input,
13609 // and change \p InputMask to be a no-op (identity) mask.
13610 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13611 &DAG](SDValue &Input,
13612 MutableArrayRef<int> InputMask) {
13613 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13614 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13615 !X86::mayFoldLoad(Input, Subtarget)))
13616 return;
13617 if (isNoopShuffleMask(InputMask))
13618 return;
13619 assert(isBroadcastShuffleMask(InputMask) &&
13620 "Expected to demand only the 0'th element.");
13621 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13622 for (auto I : enumerate(InputMask)) {
13623 int &InputMaskElt = I.value();
13624 if (InputMaskElt >= 0)
13625 InputMaskElt = I.index();
13626 }
13627 };
13628
13629 // Currently, we may need to produce one shuffle per input, and blend results.
13630 // It is possible that the shuffle for one of the inputs is already a no-op.
13631 // See if we can simplify non-no-op shuffles into broadcasts,
13632 // which we consider to be strictly better than an arbitrary shuffle.
13633 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13634 isNoopOrBroadcastShuffleMask(V2Mask)) {
13635 canonicalizeBroadcastableInput(V1, V1Mask);
13636 canonicalizeBroadcastableInput(V2, V2Mask);
13637 }
13638
13639 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13640 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13641 // the shuffle may be able to fold with a load or other benefit. However, when
13642 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13643 // pre-shuffle first is a better strategy.
13644 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13645 // Only prefer immediate blends to unpack/rotate.
13646 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13647 DAG, true))
13648 return BlendPerm;
13649 // If either input vector provides only a single element which is repeated
13650 // multiple times, unpacking from both input vectors would generate worse
13651 // code. e.g. for
13652 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
13653 // it is better to process t4 first to create a vector of t4[0], then unpack
13654 // that vector with t2.
13655 if (!isSingleElementRepeatedMask(V1Mask) &&
13656 !isSingleElementRepeatedMask(V2Mask))
13657 if (SDValue UnpackPerm =
13658 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
13659 return UnpackPerm;
13660 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13661 DL, VT, V1, V2, Mask, Subtarget, DAG))
13662 return RotatePerm;
13663 // Unpack/rotate failed - try again with variable blends.
13664 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13665 DAG))
13666 return BlendPerm;
13667 if (VT.getScalarSizeInBits() >= 32)
13668 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13669 DL, VT, V1, V2, Mask, Subtarget, DAG))
13670 return PermUnpack;
13671 }
13672
13673 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13674 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13675 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13676 // than half the elements coming from each source.
13677 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13678 V1Mask.assign(NumElts, -1);
13679 V2Mask.assign(NumElts, -1);
13680 FinalMask.assign(NumElts, -1);
13681 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13682 for (int j = 0; j != NumEltsPerLane; ++j) {
13683 int M = Mask[i + j];
13684 if (M >= 0 && M < NumElts) {
13685 V1Mask[i + (j / 2)] = M;
13686 FinalMask[i + j] = i + (j / 2);
13687 } else if (M >= NumElts) {
13688 V2Mask[i + (j / 2)] = M - NumElts;
13689 FinalMask[i + j] = i + (j / 2) + NumElts;
13690 }
13691 }
13692 }
13693
13694 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13695 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13696 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13697}
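A standalone sketch of just the initial decomposition step above (the helper name decomposeShuffleMerge is invented): for the v4 mask {0, 7, 2, 5}, V1 is shuffled by {0, -1, 2, -1}, V2 by {-1, 3, -1, 1}, and the results are blended with the final mask {0, 5, 2, 7}.

#include <tuple>
#include <vector>

// Sketch only: per-input masks plus the final blend mask.
static std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
decomposeShuffleMerge(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1),
      FinalMask(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;
      FinalMask[i] = i;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
    }
  }
  return {V1Mask, V2Mask, FinalMask};
}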
13698
13699/// Try to lower a vector shuffle as a bit rotation.
13700///
13701/// Look for a repeated rotation pattern in each sub group.
13702/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13703static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13704 int NumElts = Mask.size();
13705 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13706
13707 int RotateAmt = -1;
13708 for (int i = 0; i != NumElts; i += NumSubElts) {
13709 for (int j = 0; j != NumSubElts; ++j) {
13710 int M = Mask[i + j];
13711 if (M < 0)
13712 continue;
13713 if (!isInRange(M, i, i + NumSubElts))
13714 return -1;
13715 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13716 if (0 <= RotateAmt && Offset != RotateAmt)
13717 return -1;
13718 RotateAmt = Offset;
13719 }
13720 }
13721 return RotateAmt;
13722}
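For intuition, the repeated byte pattern {3, 0, 1, 2} within every i32 sub-group is matched with a rotation amount of 1 element, i.e. an 8-bit ISD::ROTL of each 32-bit lane. A small standalone sketch (illustrative only, little-endian byte order assumed) shows the equivalence:

#include <cstdint>

// Sketch only: rotate left by R bits, 0 < R < 32.
static uint32_t rotl32(uint32_t X, unsigned R) {
  return (X << R) | (X >> (32 - R));
}

// Applies the byte shuffle {3, 0, 1, 2} to one 32-bit lane.
static uint32_t shuffleBytes3012(uint32_t X) {
  uint8_t B[4] = {uint8_t(X), uint8_t(X >> 8), uint8_t(X >> 16),
                  uint8_t(X >> 24)};
  uint8_t R[4] = {B[3], B[0], B[1], B[2]};
  return uint32_t(R[0]) | uint32_t(R[1]) << 8 | uint32_t(R[2]) << 16 |
         uint32_t(R[3]) << 24;
}
// For any X: shuffleBytes3012(X) == rotl32(X, 8).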
13723
13724static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13725 const X86Subtarget &Subtarget,
13726 ArrayRef<int> Mask) {
13727 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13728 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13729
13730 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13731 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13732 int MaxSubElts = 64 / EltSizeInBits;
13733 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13734 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13735 if (RotateAmt < 0)
13736 continue;
13737
13738 int NumElts = Mask.size();
13739 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13740 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13741 return RotateAmt * EltSizeInBits;
13742 }
13743
13744 return -1;
13745}
13746
13747/// Lower shuffle using X86ISD::VROTLI rotations.
13748static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13749 ArrayRef<int> Mask,
13750 const X86Subtarget &Subtarget,
13751 SelectionDAG &DAG) {
13752 // Only XOP + AVX512 targets have bit rotation instructions.
13753 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13754 bool IsLegal =
13755 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13756 if (!IsLegal && Subtarget.hasSSE3())
13757 return SDValue();
13758
13759 MVT RotateVT;
13760 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13761 Subtarget, Mask);
13762 if (RotateAmt < 0)
13763 return SDValue();
13764
13765 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13766 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13767 // widen to vXi16 or more, then the existing lowering will be better.
13768 if (!IsLegal) {
13769 if ((RotateAmt % 16) == 0)
13770 return SDValue();
13771 // TODO: Use getTargetVShiftByConstNode.
13772 unsigned ShlAmt = RotateAmt;
13773 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13774 V1 = DAG.getBitcast(RotateVT, V1);
13775 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13776 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13777 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13778 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13779 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13780 return DAG.getBitcast(VT, Rot);
13781 }
13782
13783 SDValue Rot =
13784 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13785 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13786 return DAG.getBitcast(VT, Rot);
13787}
13788
13789/// Try to match a vector shuffle as an element rotation.
13790///
13791/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13792static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13793 ArrayRef<int> Mask) {
13794 int NumElts = Mask.size();
13795
13796 // We need to detect various ways of spelling a rotation:
13797 // [11, 12, 13, 14, 15, 0, 1, 2]
13798 // [-1, 12, 13, 14, -1, -1, 1, -1]
13799 // [-1, -1, -1, -1, -1, -1, 1, 2]
13800 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13801 // [-1, 4, 5, 6, -1, -1, 9, -1]
13802 // [-1, 4, 5, 6, -1, -1, -1, -1]
13803 int Rotation = 0;
13804 SDValue Lo, Hi;
13805 for (int i = 0; i < NumElts; ++i) {
13806 int M = Mask[i];
13807 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13808 "Unexpected mask index.");
13809 if (M < 0)
13810 continue;
13811
13812 // Determine where a rotated vector would have started.
13813 int StartIdx = i - (M % NumElts);
13814 if (StartIdx == 0)
13815 // The identity rotation isn't interesting, stop.
13816 return -1;
13817
13818 // If we found the tail of a vector the rotation must be the missing
13819 // front. If we found the head of a vector, it must be how much of the
13820 // head.
13821 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13822
13823 if (Rotation == 0)
13824 Rotation = CandidateRotation;
13825 else if (Rotation != CandidateRotation)
13826 // The rotations don't match, so we can't match this mask.
13827 return -1;
13828
13829 // Compute which value this mask is pointing at.
13830 SDValue MaskV = M < NumElts ? V1 : V2;
13831
13832 // Compute which of the two target values this index should be assigned
13833 // to. This reflects whether the high elements are remaining or the low
13834 // elements are remaining.
13835 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13836
13837 // Either set up this value if we've not encountered it before, or check
13838 // that it remains consistent.
13839 if (!TargetV)
13840 TargetV = MaskV;
13841 else if (TargetV != MaskV)
13842 // This may be a rotation, but it pulls from the inputs in some
13843 // unsupported interleaving.
13844 return -1;
13845 }
13846
13847 // Check that we successfully analyzed the mask, and normalize the results.
13848 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13849 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13850 if (!Lo)
13851 Lo = Hi;
13852 else if (!Hi)
13853 Hi = Lo;
13854
13855 V1 = Lo;
13856 V2 = Hi;
13857
13858 return Rotation;
13859}
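A simplified standalone sketch of the rotation detection (the helper name detectRotation is invented; it drops the Lo/Hi input bookkeeping): for the v8 mask {11, 12, 13, 14, 15, 0, 1, 2} every defined element yields the same candidate rotation, 3.

#include <vector>

// Sketch only.
static int detectRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts);
    if (StartIdx == 0)
      return -1;                // identity rotation isn't interesting.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                // inconsistent rotation amounts.
  }
  return Rotation == 0 ? -1 : Rotation;
}
// detectRotation({11, 12, 13, 14, 15, 0, 1, 2}) == 3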
13860
13861/// Try to lower a vector shuffle as a byte rotation.
13862///
13863/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13864/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13865/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13866 /// try to generically lower a vector shuffle through such a pattern. It
13867/// does not check for the profitability of lowering either as PALIGNR or
13868/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13869/// This matches shuffle vectors that look like:
13870///
13871/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13872///
13873/// Essentially it concatenates V1 and V2, shifts right by some number of
13874/// elements, and takes the low elements as the result. Note that while this is
13875/// specified as a *right shift* because x86 is little-endian, it is a *left
13876/// rotate* of the vector lanes.
13877static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13878 ArrayRef<int> Mask) {
13879 // Don't accept any shuffles with zero elements.
13880 if (isAnyZero(Mask))
13881 return -1;
13882
13883 // PALIGNR works on 128-bit lanes.
13884 SmallVector<int, 16> RepeatedMask;
13885 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13886 return -1;
13887
13888 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13889 if (Rotation <= 0)
13890 return -1;
13891
13892 // PALIGNR rotates bytes, so we need to scale the
13893 // rotation based on how many bytes are in the vector lane.
13894 int NumElts = RepeatedMask.size();
13895 int Scale = 16 / NumElts;
13896 return Rotation * Scale;
13897}
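
// Illustrative sketch (not part of X86ISelLowering.cpp): the rotation
// arithmetic used above, restated on a plain constexpr mask. It assumes a
// single 128-bit lane and skips the Lo/Hi source bookkeeping; the names are
// ad hoc. Mask values in [0, 2*N) index the concatenation V1:V2, -1 is undef.
#include <array>
#include <cstddef>

template <std::size_t N>
constexpr int matchElementRotateSketch(const std::array<int, N> &Mask) {
  int Rotation = 0;
  for (std::size_t i = 0; i != N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                   // Undef matches anything.
    int StartIdx = static_cast<int>(i) - (M % static_cast<int>(N));
    if (StartIdx == 0)
      return -1;                                  // Identity rotation.
    int Candidate = StartIdx < 0 ? -StartIdx : static_cast<int>(N) - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;                       // First evidence of a rotation.
    else if (Rotation != Candidate)
      return -1;                                  // Conflicting rotations.
  }
  return Rotation;
}

// The v8i16 example from the comment above rotates by 3 elements; PALIGNR
// takes a byte amount, so the immediate is 3 * (16 / 8) = 6.
static_assert(matchElementRotateSketch(
                  std::array<int, 8>{11, 12, 13, 14, 15, 0, 1, 2}) == 3);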
13898
13899static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13900 SDValue V2, ArrayRef<int> Mask,
13901 const X86Subtarget &Subtarget,
13902 SelectionDAG &DAG) {
13903 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13904
13905 SDValue Lo = V1, Hi = V2;
13906 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13907 if (ByteRotation <= 0)
13908 return SDValue();
13909
13910 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13911 // PSLLDQ/PSRLDQ.
13912 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13913 Lo = DAG.getBitcast(ByteVT, Lo);
13914 Hi = DAG.getBitcast(ByteVT, Hi);
13915
13916 // SSSE3 targets can use the palignr instruction.
13917 if (Subtarget.hasSSSE3()) {
13918 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13919        "512-bit PALIGNR requires BWI instructions");
13920 return DAG.getBitcast(
13921 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13922 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13923 }
13924
13925 assert(VT.is128BitVector() &&
13926        "Rotate-based lowering only supports 128-bit lowering!");
13927 assert(Mask.size() <= 16 &&
13928        "Can shuffle at most 16 bytes in a 128-bit vector!");
13929 assert(ByteVT == MVT::v16i8 &&
13930        "SSE2 rotate lowering only needed for v16i8!");
13931
13932 // Default SSE2 implementation
13933 int LoByteShift = 16 - ByteRotation;
13934 int HiByteShift = ByteRotation;
13935
13936 SDValue LoShift =
13937 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13938 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13939 SDValue HiShift =
13940 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13941 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13942 return DAG.getBitcast(VT,
13943 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13944}
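
// Illustrative sketch (not part of X86ISelLowering.cpp): the pre-SSSE3
// fallback above expressed on plain byte arrays, names ad hoc. PSRLDQ/PSLLDQ
// shift a whole 128-bit value by bytes (index 0 = least significant byte),
// and the OR stitches the two shifted halves together, reproducing what the
// PALIGNR path computes.
#include <array>
#include <cassert>
#include <cstdint>

using Bytes16 = std::array<std::uint8_t, 16>;

// PSRLDQ: shift toward index 0, zero-filling the top bytes.
static Bytes16 psrldq(const Bytes16 &V, int N) {
  Bytes16 R{};
  for (int i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

// PSLLDQ: shift toward index 15, zero-filling the bottom bytes.
static Bytes16 pslldq(const Bytes16 &V, int N) {
  Bytes16 R{};
  for (int i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}

int main() {
  Bytes16 Lo, Hi;
  for (int i = 0; i < 16; ++i) {
    Lo[i] = static_cast<std::uint8_t>(i);      // "V1" bytes 0..15
    Hi[i] = static_cast<std::uint8_t>(16 + i); // "V2" bytes 16..31
  }
  const int ByteRotation = 6; // e.g. a 3-element rotation of v8i16
  Bytes16 LoShift = pslldq(Lo, 16 - ByteRotation);
  Bytes16 HiShift = psrldq(Hi, ByteRotation);
  Bytes16 Result{};
  for (int i = 0; i < 16; ++i)
    Result[i] = LoShift[i] | HiShift[i];
  // The low 10 bytes come from Hi starting at byte 6, the rest from Lo.
  assert(Result[0] == 16 + 6 && Result[9] == 16 + 15 && Result[10] == 0 &&
         Result[15] == 5);
  return 0;
}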
13945
13946/// Try to lower a vector shuffle as a dword/qword rotation.
13947///
13948/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13949/// rotation of the concatenation of two vectors; this routine will
13950/// try to generically lower a vector shuffle through such a pattern.
13951///
13952/// Essentially it concatenates V1 and V2, shifts right by some number of
13953/// elements, and takes the low elements as the result. Note that while this is
13954/// specified as a *right shift* because x86 is little-endian, it is a *left
13955/// rotate* of the vector lanes.
13956static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13957 SDValue V2, ArrayRef<int> Mask,
13958 const X86Subtarget &Subtarget,
13959 SelectionDAG &DAG) {
13960 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13961        "Only 32-bit and 64-bit elements are supported!");
13962
13963 // 128/256-bit vectors are only supported with VLX.
13964 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13965        && "VLX required for 128/256-bit vectors");
13966
13967 SDValue Lo = V1, Hi = V2;
13968 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13969 if (Rotation <= 0)
13970 return SDValue();
13971
13972 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13973 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13974}
13975
13976/// Try to lower a vector shuffle as a byte shift sequence.
13977static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13978 SDValue V2, ArrayRef<int> Mask,
13979 const APInt &Zeroable,
13980 const X86Subtarget &Subtarget,
13981 SelectionDAG &DAG) {
13982 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13983 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13984
13985 // We need a shuffle that has zeros at one/both ends and a sequential
13986 // shuffle from one source within.
13987 unsigned ZeroLo = Zeroable.countr_one();
13988 unsigned ZeroHi = Zeroable.countl_one();
13989 if (!ZeroLo && !ZeroHi)
13990 return SDValue();
13991
13992 unsigned NumElts = Mask.size();
13993 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13994 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13995 return SDValue();
13996
13997 unsigned Scale = VT.getScalarSizeInBits() / 8;
13998 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13999 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
14000 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
14001 return SDValue();
14002
14003 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
14004 Res = DAG.getBitcast(MVT::v16i8, Res);
14005
14006 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
14007 // inner sequential set of elements, possibly offset:
14008 // 01234567 --> zzzzzz01 --> 1zzzzzzz
14009 // 01234567 --> 4567zzzz --> zzzzz456
14010 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
14011 if (ZeroLo == 0) {
14012 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14013 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14014 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14015 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14016 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
14017 } else if (ZeroHi == 0) {
14018 unsigned Shift = Mask[ZeroLo] % NumElts;
14019 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14020 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14021 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14022 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14023 } else if (!Subtarget.hasSSSE3()) {
14024 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
14025 // by performing 3 byte shifts. Shuffle combining can kick in above that.
14026 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
14027 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14028 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14029 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14030 Shift += Mask[ZeroLo] % NumElts;
14031 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14032 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14033 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14034 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14035 } else
14036 return SDValue();
14037
14038 return DAG.getBitcast(VT, Res);
14039}
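
// Illustrative sketch (not part of X86ISelLowering.cpp): the ZeroHi == 0 case
// of the byte-shift-mask lowering above, on plain byte arrays with ad hoc
// names. A PSRLDQ pulls the wanted run down to index 0 and a PSLLDQ pushes it
// back up, letting the shifts zero-fill the low ZeroLo bytes.
#include <array>
#include <cassert>
#include <cstdint>

using Bytes16 = std::array<std::uint8_t, 16>;

static Bytes16 psrldq(const Bytes16 &V, int N) { // shift toward index 0
  Bytes16 R{};
  for (int i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

static Bytes16 pslldq(const Bytes16 &V, int N) { // shift toward index 15
  Bytes16 R{};
  for (int i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}

int main() {
  Bytes16 Src;
  for (int i = 0; i < 16; ++i)
    Src[i] = static_cast<std::uint8_t>(i + 1);

  // Target: bytes 0..1 zero, bytes 2..15 = Src[1..14] (ZeroLo = 2, Shift = 1).
  const int ZeroLo = 2, Shift = 1;
  Bytes16 Res = pslldq(psrldq(Src, Shift), ZeroLo);

  for (int i = 0; i < ZeroLo; ++i)
    assert(Res[i] == 0);
  for (int i = ZeroLo; i < 16; ++i)
    assert(Res[i] == Src[i - 1]);
  return 0;
}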
14040
14041/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14042///
14043/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14044/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14045/// matches elements from one of the input vectors shuffled to the left or
14046/// right with zeroable elements 'shifted in'. It handles both the strictly
14047/// bit-wise element shifts and the byte shift across an entire 128-bit double
14048/// quad word lane.
14049///
14050/// PSLL : (little-endian) left bit shift.
14051/// [ zz, 0, zz, 2 ]
14052/// [ -1, 4, zz, -1 ]
14053/// PSRL : (little-endian) right bit shift.
14054/// [ 1, zz, 3, zz]
14055/// [ -1, -1, 7, zz]
14056/// PSLLDQ : (little-endian) left byte shift
14057/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14058/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14059/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14060/// PSRLDQ : (little-endian) right byte shift
14061/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14062/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14063/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14064static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14065 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14066 int MaskOffset, const APInt &Zeroable,
14067 const X86Subtarget &Subtarget) {
14068 int Size = Mask.size();
14069 unsigned SizeInBits = Size * ScalarSizeInBits;
14070
14071 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14072 for (int i = 0; i < Size; i += Scale)
14073 for (int j = 0; j < Shift; ++j)
14074 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14075 return false;
14076
14077 return true;
14078 };
14079
14080 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14081 for (int i = 0; i != Size; i += Scale) {
14082 unsigned Pos = Left ? i + Shift : i;
14083 unsigned Low = Left ? i : i + Shift;
14084 unsigned Len = Scale - Shift;
14085 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14086 return -1;
14087 }
14088
14089 int ShiftEltBits = ScalarSizeInBits * Scale;
14090 bool ByteShift = ShiftEltBits > 64;
14091 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14092 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14093 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14094
14095 // Normalize the scale for byte shifts to still produce an i64 element
14096 // type.
14097 Scale = ByteShift ? Scale / 2 : Scale;
14098
14099 // We need to round trip through the appropriate type for the shift.
14100 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14101 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14102 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14103 return (int)ShiftAmt;
14104 };
14105
14106 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14107 // keep doubling the size of the integer elements up to that. We can
14108 // then shift the elements of the integer vector by whole multiples of
14109 // their width within the elements of the larger integer vector. Test each
14110 // multiple to see if we can find a match with the moved element indices
14111 // and that the shifted in elements are all zeroable.
14112 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14113 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14114 for (int Shift = 1; Shift != Scale; ++Shift)
14115 for (bool Left : {true, false})
14116 if (CheckZeros(Shift, Scale, Left)) {
14117 int ShiftAmt = MatchShift(Shift, Scale, Left);
14118 if (0 < ShiftAmt)
14119 return ShiftAmt;
14120 }
14121
14122 // no match
14123 return -1;
14124}
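
// Illustrative sketch (not part of X86ISelLowering.cpp): the same scale/shift
// search as above, restricted to the first input (MaskOffset == 0) and to a
// fixed 128-bit maximum element width; names are ad hoc. Returns the shift
// amount in bits, negated for a left shift, or 0 when nothing matches.
#include <cassert>
#include <vector>

static int matchShiftSketch(const std::vector<int> &Mask,
                            const std::vector<bool> &Zeroable,
                            int ScalarSizeInBits) {
  const int Size = static_cast<int>(Mask.size());
  for (int Scale = 2; Scale * ScalarSizeInBits <= 128; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false}) {
        bool Ok = true;
        for (int i = 0; Ok && i < Size; i += Scale) {
          // The shifted-in positions must be zeroable...
          for (int j = 0; Ok && j < Shift; ++j)
            Ok = Zeroable[i + j + (Left ? 0 : Scale - Shift)];
          // ...and the kept positions must be sequential elements of V1.
          int Pos = Left ? i + Shift : i;
          int Low = Left ? i : i + Shift;
          for (int k = 0; Ok && k < Scale - Shift; ++k)
            Ok = Mask[Pos + k] < 0 || Mask[Pos + k] == Low + k;
        }
        if (Ok)
          return (Left ? -1 : 1) * Shift * ScalarSizeInBits;
      }
  return 0;
}

int main() {
  // The "[ 1, zz, 3, zz ]" example from the comment above: a v4i32 mask that
  // is really a 32-bit logical right shift of each 64-bit half (VSRLI).
  assert(matchShiftSketch({1, -1, 3, -1}, {false, true, false, true}, 32) ==
         32);
  // "[ zz, 0, zz, 2 ]" is the corresponding left-shift case (VSHLI).
  assert(matchShiftSketch({-1, 0, -1, 2}, {true, false, true, false}, 32) ==
         -32);
  return 0;
}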
14125
14126static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14127 SDValue V2, ArrayRef<int> Mask,
14128 const APInt &Zeroable,
14129 const X86Subtarget &Subtarget,
14130 SelectionDAG &DAG, bool BitwiseOnly) {
14131 int Size = Mask.size();
14132 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14133
14134 MVT ShiftVT;
14135 SDValue V = V1;
14136 unsigned Opcode;
14137
14138 // Try to match shuffle against V1 shift.
14139 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14140 Mask, 0, Zeroable, Subtarget);
14141
14142 // If V1 failed, try to match shuffle against V2 shift.
14143 if (ShiftAmt < 0) {
14144 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14145 Mask, Size, Zeroable, Subtarget);
14146 V = V2;
14147 }
14148
14149 if (ShiftAmt < 0)
14150 return SDValue();
14151
14152 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14153 return SDValue();
14154
14155 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14156        "Illegal integer vector type");
14157 V = DAG.getBitcast(ShiftVT, V);
14158 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14159 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14160 return DAG.getBitcast(VT, V);
14161}
14162
14163// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14164// Remainder of lower half result is zero and upper half is all undef.
14165static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14166 ArrayRef<int> Mask, uint64_t &BitLen,
14167 uint64_t &BitIdx, const APInt &Zeroable) {
14168 int Size = Mask.size();
14169 int HalfSize = Size / 2;
14170 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14171 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14172
14173 // Upper half must be undefined.
14174 if (!isUndefUpperHalf(Mask))
14175 return false;
14176
14177 // Determine the extraction length from the part of the
14178 // lower half that isn't zeroable.
14179 int Len = HalfSize;
14180 for (; Len > 0; --Len)
14181 if (!Zeroable[Len - 1])
14182 break;
14183 assert(Len > 0 && "Zeroable shuffle mask");
14184
14185 // Attempt to match first Len sequential elements from the lower half.
14186 SDValue Src;
14187 int Idx = -1;
14188 for (int i = 0; i != Len; ++i) {
14189 int M = Mask[i];
14190 if (M == SM_SentinelUndef)
14191 continue;
14192 SDValue &V = (M < Size ? V1 : V2);
14193 M = M % Size;
14194
14195 // The extracted elements must start at a valid index and all mask
14196 // elements must be in the lower half.
14197 if (i > M || M >= HalfSize)
14198 return false;
14199
14200 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14201 Src = V;
14202 Idx = M - i;
14203 continue;
14204 }
14205 return false;
14206 }
14207
14208 if (!Src || Idx < 0)
14209 return false;
14210
14211 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14212 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14213 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14214 V1 = Src;
14215 return true;
14216}
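
// Illustrative sketch (not part of X86ISelLowering.cpp): how a matched
// element range maps onto the EXTRQ immediates, assuming a hypothetical v8i16
// shuffle whose low result elements are source elements 2, 3, 4 and whose
// remaining elements are zero/undef. Both fields are 6 bits wide, hence the
// & 0x3f.
#include <cstdint>

constexpr std::uint64_t extrqBitLen(int Len, int EltBits) {
  return static_cast<std::uint64_t>(Len * EltBits) & 0x3f;
}
constexpr std::uint64_t extrqBitIdx(int Idx, int EltBits) {
  return static_cast<std::uint64_t>(Idx * EltBits) & 0x3f;
}

// Three 16-bit elements starting at element 2: extract 48 bits from bit 32.
static_assert(extrqBitLen(3, 16) == 48 && extrqBitIdx(2, 16) == 32);
// A full 64-bit extraction wraps to a length field of 0.
static_assert(extrqBitLen(4, 16) == 0);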
14217
14218// INSERTQ: Extract lowest Len elements from lower half of second source and
14219// insert over first source, starting at Idx.
14220// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14221static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14222 ArrayRef<int> Mask, uint64_t &BitLen,
14223 uint64_t &BitIdx) {
14224 int Size = Mask.size();
14225 int HalfSize = Size / 2;
14226 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14227
14228 // Upper half must be undefined.
14229 if (!isUndefUpperHalf(Mask))
14230 return false;
14231
14232 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14233 SDValue Base;
14234
14235 // Attempt to match first source from mask before insertion point.
14236 if (isUndefInRange(Mask, 0, Idx)) {
14237 /* EMPTY */
14238 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14239 Base = V1;
14240 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14241 Base = V2;
14242 } else {
14243 continue;
14244 }
14245
14246 // Extend the extraction length looking to match both the insertion of
14247 // the second source and the remaining elements of the first.
14248 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14249 SDValue Insert;
14250 int Len = Hi - Idx;
14251
14252 // Match insertion.
14253 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14254 Insert = V1;
14255 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14256 Insert = V2;
14257 } else {
14258 continue;
14259 }
14260
14261 // Match the remaining elements of the lower half.
14262 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14263 /* EMPTY */
14264 } else if ((!Base || (Base == V1)) &&
14265 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14266 Base = V1;
14267 } else if ((!Base || (Base == V2)) &&
14268 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14269 Size + Hi)) {
14270 Base = V2;
14271 } else {
14272 continue;
14273 }
14274
14275 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14276 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14277 V1 = Base;
14278 V2 = Insert;
14279 return true;
14280 }
14281 }
14282
14283 return false;
14284}
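
// Illustrative sketch (not part of X86ISelLowering.cpp): the shuffle mask
// shape the INSERTQ matcher above is looking for, built explicitly for a
// hypothetical v8i16 case with Idx = 1 and Len = 2. Elements of the second
// source are numbered Size + n; the upper half stays undef.
#include <cassert>
#include <vector>

static std::vector<int> makeInsertQMask(int Size, int Idx, int Len) {
  const int HalfSize = Size / 2;
  std::vector<int> Mask(Size, -1);  // Upper half stays undef.
  for (int i = 0; i < Idx; ++i)
    Mask[i] = i;                    // Leading elements of the first source.
  for (int i = 0; i < Len; ++i)
    Mask[Idx + i] = Size + i;       // Inserted run from the second source.
  for (int i = Idx + Len; i < HalfSize; ++i)
    Mask[i] = i;                    // Remaining elements of the first source.
  return Mask;
}

int main() {
  // { A[0], B[0], B[1], A[3], undef x4 } -> BitLen = 2*16 = 32, BitIdx = 16.
  assert(makeInsertQMask(8, /*Idx=*/1, /*Len=*/2) ==
         (std::vector<int>{0, 8, 9, 3, -1, -1, -1, -1}));
  return 0;
}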
14285
14286/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14287static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14288 SDValue V2, ArrayRef<int> Mask,
14289 const APInt &Zeroable, SelectionDAG &DAG) {
14290 uint64_t BitLen, BitIdx;
14291 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14292 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14293 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14294 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14295
14296 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14297 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14298 V2 ? V2 : DAG.getUNDEF(VT),
14299 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14300 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14301
14302 return SDValue();
14303}
14304
14305/// Lower a vector shuffle as a zero or any extension.
14306///
14307/// Given a specific number of elements, element bit width, and extension
14308/// stride, produce either a zero or any extension based on the available
14309/// features of the subtarget. The extended elements are consecutive and
14310/// can start from an offset element index in the input; to avoid excess
14311/// shuffling, the offset must either be in the bottom lane
14312/// or at the start of a higher lane. All extended elements must be from
14313/// the same lane.
14314static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14315 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14316 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14317 assert(Scale > 1 && "Need a scale to extend.");
14318 int EltBits = VT.getScalarSizeInBits();
14319 int NumElements = VT.getVectorNumElements();
14320 int NumEltsPerLane = 128 / EltBits;
14321 int OffsetLane = Offset / NumEltsPerLane;
14322 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14323        "Only 8, 16, and 32 bit elements can be extended.");
14324 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14325 assert(0 <= Offset && "Extension offset must be positive.");
14326 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14327        "Extension offset must be in the first lane or start an upper lane.");
14328
14329 // Check that an index is in same lane as the base offset.
14330 auto SafeOffset = [&](int Idx) {
14331 return OffsetLane == (Idx / NumEltsPerLane);
14332 };
14333
14334 // Shift along an input so that the offset base moves to the first element.
14335 auto ShuffleOffset = [&](SDValue V) {
14336 if (!Offset)
14337 return V;
14338
14339 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14340 for (int i = 0; i * Scale < NumElements; ++i) {
14341 int SrcIdx = i + Offset;
14342 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14343 }
14344 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14345 };
14346
14347 // Found a valid any/zero extension mask! Try various lowering strategies based on the
14348 // input type and available ISA extensions.
14349 if (Subtarget.hasSSE41()) {
14350 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14351 // PUNPCK will catch this in a later shuffle match.
14352 if (Offset && Scale == 2 && VT.is128BitVector())
14353 return SDValue();
14354 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14355 NumElements / Scale);
14356 InputV = DAG.getBitcast(VT, InputV);
14357 InputV = ShuffleOffset(InputV);
14358 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14359 DL, ExtVT, InputV, DAG);
14360 return DAG.getBitcast(VT, InputV);
14361 }
14362
14363 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14364 InputV = DAG.getBitcast(VT, InputV);
14365
14366 // For any extends we can cheat for larger element sizes and use shuffle
14367 // instructions that can fold with a load and/or copy.
14368 if (AnyExt && EltBits == 32) {
14369 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14370 -1};
14371 return DAG.getBitcast(
14372 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14373 DAG.getBitcast(MVT::v4i32, InputV),
14374 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14375 }
14376 if (AnyExt && EltBits == 16 && Scale > 2) {
14377 int PSHUFDMask[4] = {Offset / 2, -1,
14378 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14379 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14380 DAG.getBitcast(MVT::v4i32, InputV),
14381 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14382 int PSHUFWMask[4] = {1, -1, -1, -1};
14383 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14384 return DAG.getBitcast(
14385 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14386 DAG.getBitcast(MVT::v8i16, InputV),
14387 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14388 }
14389
14390 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14391 // to 64-bits.
14392 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14393 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14394 assert(VT.is128BitVector() && "Unexpected vector width!");
14395
14396 int LoIdx = Offset * EltBits;
14397 SDValue Lo = DAG.getBitcast(
14398 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14399 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14400 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14401
14402 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14403 return DAG.getBitcast(VT, Lo);
14404
14405 int HiIdx = (Offset + 1) * EltBits;
14406 SDValue Hi = DAG.getBitcast(
14407 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14408 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14409 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14410 return DAG.getBitcast(VT,
14411 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14412 }
14413
14414 // If this would require more than 2 unpack instructions to expand, use
14415 // pshufb when available. We can only use more than 2 unpack instructions
14416 // when zero extending i8 elements which also makes it easier to use pshufb.
14417 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14418 assert(NumElements == 16 && "Unexpected byte vector width!");
14419 SDValue PSHUFBMask[16];
14420 for (int i = 0; i < 16; ++i) {
14421 int Idx = Offset + (i / Scale);
14422 if ((i % Scale == 0 && SafeOffset(Idx))) {
14423 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14424 continue;
14425 }
14426 PSHUFBMask[i] =
14427 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14428 }
14429 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14430 return DAG.getBitcast(
14431 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14432 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14433 }
14434
14435 // If we are extending from an offset, ensure we start on a boundary that
14436 // we can unpack from.
14437 int AlignToUnpack = Offset % (NumElements / Scale);
14438 if (AlignToUnpack) {
14439 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14440 for (int i = AlignToUnpack; i < NumElements; ++i)
14441 ShMask[i - AlignToUnpack] = i;
14442 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14443 Offset -= AlignToUnpack;
14444 }
14445
14446 // Otherwise emit a sequence of unpacks.
14447 do {
14448 unsigned UnpackLoHi = X86ISD::UNPCKL;
14449 if (Offset >= (NumElements / 2)) {
14450 UnpackLoHi = X86ISD::UNPCKH;
14451 Offset -= (NumElements / 2);
14452 }
14453
14454 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14455 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14456 : getZeroVector(InputVT, Subtarget, DAG, DL);
14457 InputV = DAG.getBitcast(InputVT, InputV);
14458 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14459 Scale /= 2;
14460 EltBits *= 2;
14461 NumElements /= 2;
14462 } while (Scale > 1);
14463 return DAG.getBitcast(VT, InputV);
14464}
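
// Illustrative sketch (not part of X86ISelLowering.cpp): the unpack fallback
// above on a plain 16-byte array, names ad hoc. PUNPCKLBW with a zero
// register interleaves the low 8 source bytes with zero bytes; read back as
// eight little-endian u16 lanes, that is a zero extension. The do/while loop
// above simply repeats this at growing element widths until the requested
// scale is reached.
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<std::uint8_t, 16> Src{};
  for (int i = 0; i < 16; ++i)
    Src[i] = static_cast<std::uint8_t>(0xF0 + i);

  // punpcklbw Src, zero : interleave the low 8 bytes of Src with zeros.
  std::array<std::uint8_t, 16> Unpacked{};
  for (int i = 0; i < 8; ++i) {
    Unpacked[2 * i] = Src[i];
    Unpacked[2 * i + 1] = 0;
  }

  // Reinterpreting the result as v8i16 (little-endian) gives zext(Src[i]).
  for (int i = 0; i < 8; ++i) {
    std::uint16_t Lane = static_cast<std::uint16_t>(
        Unpacked[2 * i] | (Unpacked[2 * i + 1] << 8));
    assert(Lane == Src[i]);
  }
  return 0;
}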
14465
14466/// Try to lower a vector shuffle as a zero extension on any microarch.
14467///
14468/// This routine will try to do everything in its power to cleverly lower
14469/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14470/// check for the profitability of this lowering, it tries to aggressively
14471/// match this pattern. It will use all of the micro-architectural details it
14472/// can to emit an efficient lowering. It handles both blends with all-zero
14473/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14474/// masking out later).
14475///
14476/// The reason we have dedicated lowering for zext-style shuffles is that they
14477/// are both incredibly common and often quite performance sensitive.
14478static SDValue lowerShuffleAsZeroOrAnyExtend(
14479 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14480 const APInt &Zeroable, const X86Subtarget &Subtarget,
14481 SelectionDAG &DAG) {
14482 int Bits = VT.getSizeInBits();
14483 int NumLanes = Bits / 128;
14484 int NumElements = VT.getVectorNumElements();
14485 int NumEltsPerLane = NumElements / NumLanes;
14486 assert(VT.getScalarSizeInBits() <= 32 &&
14487        "Exceeds 32-bit integer zero extension limit");
14488 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14489
14490 // Define a helper function to check a particular ext-scale and lower to it if
14491 // valid.
14492 auto Lower = [&](int Scale) -> SDValue {
14493 SDValue InputV;
14494 bool AnyExt = true;
14495 int Offset = 0;
14496 int Matches = 0;
14497 for (int i = 0; i < NumElements; ++i) {
14498 int M = Mask[i];
14499 if (M < 0)
14500 continue; // Valid anywhere but doesn't tell us anything.
14501 if (i % Scale != 0) {
14502 // Each of the extended elements needs to be zeroable.
14503 if (!Zeroable[i])
14504 return SDValue();
14505
14506 // We no longer are in the anyext case.
14507 AnyExt = false;
14508 continue;
14509 }
14510
14511 // The base elements need to be consecutive indices into the
14512 // same input vector.
14513 SDValue V = M < NumElements ? V1 : V2;
14514 M = M % NumElements;
14515 if (!InputV) {
14516 InputV = V;
14517 Offset = M - (i / Scale);
14518 } else if (InputV != V)
14519 return SDValue(); // Flip-flopping inputs.
14520
14521 // Offset must start in the lowest 128-bit lane or at the start of an
14522 // upper lane.
14523 // FIXME: Is it ever worth allowing a negative base offset?
14524 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14525 (Offset % NumEltsPerLane) == 0))
14526 return SDValue();
14527
14528 // If we are offsetting, all referenced entries must come from the same
14529 // lane.
14530 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14531 return SDValue();
14532
14533 if ((M % NumElements) != (Offset + (i / Scale)))
14534 return SDValue(); // Non-consecutive strided elements.
14535 Matches++;
14536 }
14537
14538 // If we fail to find an input, we have a zero-shuffle which should always
14539 // have already been handled.
14540 // FIXME: Maybe handle this here in case during blending we end up with one?
14541 if (!InputV)
14542 return SDValue();
14543
14544 // If we are offsetting, don't extend if we only match a single input; we
14545 // can always do better by using a basic PSHUF or PUNPCK.
14546 if (Offset != 0 && Matches < 2)
14547 return SDValue();
14548
14549 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14550 InputV, Mask, Subtarget, DAG);
14551 };
14552
14553 // The widest scale possible for extending is to a 64-bit integer.
14554 assert(Bits % 64 == 0 &&
14555        "The number of bits in a vector must be divisible by 64 on x86!");
14556 int NumExtElements = Bits / 64;
14557
14558 // Each iteration, try extending the elements half as much, but into twice as
14559 // many elements.
14560 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14561 assert(NumElements % NumExtElements == 0 &&
14562        "The input vector size must be divisible by the extended size.");
14563 if (SDValue V = Lower(NumElements / NumExtElements))
14564 return V;
14565 }
14566
14567 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14568 if (Bits != 128)
14569 return SDValue();
14570
14571 // Returns one of the source operands if the shuffle can be reduced to a
14572 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14573 auto CanZExtLowHalf = [&]() {
14574 for (int i = NumElements / 2; i != NumElements; ++i)
14575 if (!Zeroable[i])
14576 return SDValue();
14577 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14578 return V1;
14579 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14580 return V2;
14581 return SDValue();
14582 };
14583
14584 if (SDValue V = CanZExtLowHalf()) {
14585 V = DAG.getBitcast(MVT::v2i64, V);
14586 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14587 return DAG.getBitcast(VT, V);
14588 }
14589
14590 // No viable ext lowering found.
14591 return SDValue();
14592}
14593
14594/// Try to get a scalar value for a specific element of a vector.
14595///
14596/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14597static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14598 SelectionDAG &DAG) {
14599 MVT VT = V.getSimpleValueType();
14600 MVT EltVT = VT.getVectorElementType();
14601 V = peekThroughBitcasts(V);
14602
14603 // If the bitcasts shift the element size, we can't extract an equivalent
14604 // element from it.
14605 MVT NewVT = V.getSimpleValueType();
14606 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14607 return SDValue();
14608
14609 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14610 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14611 // Ensure the scalar operand is the same size as the destination.
14612 // FIXME: Add support for scalar truncation where possible.
14613 SDValue S = V.getOperand(Idx);
14614 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14615 return DAG.getBitcast(EltVT, S);
14616 }
14617
14618 return SDValue();
14619}
14620
14621/// Helper to test for a load that can be folded with x86 shuffles.
14622///
14623/// This is particularly important because the set of instructions varies
14624/// significantly based on whether the operand is a load or not.
14625static bool isShuffleFoldableLoad(SDValue V) {
14626 return V->hasOneUse() &&
14627 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14628}
14629
14630template<typename T>
14631static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14632 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14633}
14634
14635template<typename T>
14636bool X86TargetLowering::isSoftFP16(T VT) const {
14637 return ::isSoftFP16(VT, Subtarget);
14638}
14639
14640/// Try to lower insertion of a single element into a zero vector.
14641///
14642/// This is a common pattern for which we have especially efficient lowerings
14643/// across all subtarget feature sets.
14644static SDValue lowerShuffleAsElementInsertion(
14645 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14646 const APInt &Zeroable, const X86Subtarget &Subtarget,
14647 SelectionDAG &DAG) {
14648 MVT ExtVT = VT;
14649 MVT EltVT = VT.getVectorElementType();
14650
14651 if (isSoftFP16(EltVT, Subtarget))
14652 return SDValue();
14653
14654 int V2Index =
14655 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14656 Mask.begin();
14657 bool IsV1Zeroable = true;
14658 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14659 if (i != V2Index && !Zeroable[i]) {
14660 IsV1Zeroable = false;
14661 break;
14662 }
14663
14664 // Check for a single input from a SCALAR_TO_VECTOR node.
14665 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14666 // all the smarts here sunk into that routine. However, the current
14667 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14668 // vector shuffle lowering is dead.
14669 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14670 DAG);
14671 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14672 // We need to zext the scalar if it is smaller than an i32.
14673 V2S = DAG.getBitcast(EltVT, V2S);
14674 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14675 // Using zext to expand a narrow element won't work for non-zero
14676 // insertions.
14677 if (!IsV1Zeroable)
14678 return SDValue();
14679
14680 // Zero-extend directly to i32.
14681 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14682 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14683 }
14684 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14685 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14686 EltVT == MVT::i16) {
14687 // Either not inserting from the low element of the input or the input
14688 // element size is too small to use VZEXT_MOVL to clear the high bits.
14689 return SDValue();
14690 }
14691
14692 if (!IsV1Zeroable) {
14693 // If V1 can't be treated as a zero vector we have fewer options to lower
14694 // this. We can't support integer vectors or non-zero targets cheaply, and
14695 // the V1 elements can't be permuted in any way.
14696 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14697 if (!VT.isFloatingPoint() || V2Index != 0)
14698 return SDValue();
14699 SmallVector<int, 8> V1Mask(Mask);
14700 V1Mask[V2Index] = -1;
14701 if (!isNoopShuffleMask(V1Mask))
14702 return SDValue();
14703 if (!VT.is128BitVector())
14704 return SDValue();
14705
14706 // Otherwise, use MOVSD, MOVSS or MOVSH.
14707 unsigned MovOpc = 0;
14708 if (EltVT == MVT::f16)
14709 MovOpc = X86ISD::MOVSH;
14710 else if (EltVT == MVT::f32)
14711 MovOpc = X86ISD::MOVSS;
14712 else if (EltVT == MVT::f64)
14713 MovOpc = X86ISD::MOVSD;
14714 else
14715 llvm_unreachable("Unsupported floating point element type to handle!");
14716 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14717 }
14718
14719 // This lowering only works for the low element with floating point vectors.
14720 if (VT.isFloatingPoint() && V2Index != 0)
14721 return SDValue();
14722
14723 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14724 if (ExtVT != VT)
14725 V2 = DAG.getBitcast(VT, V2);
14726
14727 if (V2Index != 0) {
14728 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14729 // the desired position. Otherwise it is more efficient to do a vector
14730 // shift left. We know that we can do a vector shift left because all
14731 // the inputs are zero.
14732 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14733 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14734 V2Shuffle[V2Index] = 0;
14735 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14736 } else {
14737 V2 = DAG.getBitcast(MVT::v16i8, V2);
14738 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14739 DAG.getTargetConstant(
14740 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14741 V2 = DAG.getBitcast(VT, V2);
14742 }
14743 }
14744 return V2;
14745}
14746
14747/// Try to lower broadcast of a single - truncated - integer element,
14748/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14749///
14750/// This assumes we have AVX2.
14751static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14752 int BroadcastIdx,
14753 const X86Subtarget &Subtarget,
14754 SelectionDAG &DAG) {
14755 assert(Subtarget.hasAVX2() &&
14756        "We can only lower integer broadcasts with AVX2!");
14757
14758 MVT EltVT = VT.getVectorElementType();
14759 MVT V0VT = V0.getSimpleValueType();
14760
14761 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14762 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14763
14764 MVT V0EltVT = V0VT.getVectorElementType();
14765 if (!V0EltVT.isInteger())
14766 return SDValue();
14767
14768 const unsigned EltSize = EltVT.getSizeInBits();
14769 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14770
14771 // This is only a truncation if the original element type is larger.
14772 if (V0EltSize <= EltSize)
14773 return SDValue();
14774
14775 assert(((V0EltSize % EltSize) == 0) &&
14776        "Scalar type sizes must all be powers of 2 on x86!");
14777
14778 const unsigned V0Opc = V0.getOpcode();
14779 const unsigned Scale = V0EltSize / EltSize;
14780 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14781
14782 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14783 V0Opc != ISD::BUILD_VECTOR)
14784 return SDValue();
14785
14786 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14787
14788 // If we're extracting non-least-significant bits, shift so we can truncate.
14789 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14790 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14791 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14792 if (const int OffsetIdx = BroadcastIdx % Scale)
14793 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14794 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14795
14796 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14797 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14798}
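
// Illustrative sketch (not part of X86ISelLowering.cpp): the shift-then-
// truncate step above on plain integers, names ad hoc. Broadcasting the i16
// element at index 3 of a hypothetical build_vector of i64s means taking
// bits [48, 64) of the i64 that covers it: Scale = 64 / 16 = 4,
// V0BroadcastIdx = 3 / 4 = 0, OffsetIdx = 3 % 4 = 3.
#include <cstdint>

constexpr std::uint16_t truncatedBroadcastElt(std::uint64_t WideElt,
                                              unsigned OffsetIdx,
                                              unsigned EltSize) {
  // Shift the wanted bits down to the LSB, then truncate to the element type.
  return static_cast<std::uint16_t>(WideElt >> (OffsetIdx * EltSize));
}

// 0x1111222233334444: the i16 elements are 0x4444, 0x3333, 0x2222, 0x1111
// from least to most significant, so OffsetIdx == 3 selects 0x1111.
static_assert(truncatedBroadcastElt(0x1111222233334444ULL, 3, 16) == 0x1111);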
14799
14800/// Test whether this can be lowered with a single SHUFPS instruction.
14801///
14802/// This is used to disable more specialized lowerings when the shufps lowering
14803/// will happen to be efficient.
14804static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14805 // This routine only handles 128-bit shufps.
14806 assert(Mask.size() == 4 && "Unsupported mask size!");
14807 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14808 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14809 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14810 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14811
14812 // To lower with a single SHUFPS we need to have the low half and high half
14813 // each requiring a single input.
14814 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14815 return false;
14816 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14817 return false;
14818
14819 return true;
14820}
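
// Illustrative sketch (not part of X86ISelLowering.cpp): the constraint the
// check above encodes, names ad hoc. SHUFPS takes its two low result lanes
// from one operand and its two high result lanes from the other, so each
// half of the 4-element mask may reference only a single input (values 0-3
// = V1, 4-7 = V2, -1 = undef).
#include <array>

constexpr bool isSingleSHUFPSMaskSketch(const std::array<int, 4> &Mask) {
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

static_assert(isSingleSHUFPSMaskSketch({0, 3, 6, 5}));  // lo from V1, hi from V2
static_assert(!isSingleSHUFPSMaskSketch({0, 4, 1, 5})); // lo half mixes inputs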
14821
14822/// Test whether the specified input (0 or 1) is in-place blended by the
14823/// given mask.
14824///
14825/// This returns true if the elements from a particular input are already in the
14826/// slot required by the given mask and require no permutation.
14827static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14828 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14829 int Size = Mask.size();
14830 for (int i = 0; i < Size; ++i)
14831 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14832 return false;
14833
14834 return true;
14835}
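
// Illustrative sketch (not part of X86ISelLowering.cpp): what "in place"
// means for the check above on a 4-element mask, names ad hoc. Input 0 covers
// mask values 0-3 and input 1 covers 4-7; an input is in place when each of
// its referenced elements already sits at its own index.
#include <array>

constexpr bool isInputInPlaceSketch(int Input, const std::array<int, 4> &Mask) {
  const int Size = 4;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

// In {0, 5, 2, 7}, V1's elements 0 and 2 are already in place, and so are
// V2's elements 1 and 3 (mask values 5 and 7).
static_assert(isInputInPlaceSketch(0, {0, 5, 2, 7}) &&
              isInputInPlaceSketch(1, {0, 5, 2, 7}));
static_assert(!isInputInPlaceSketch(1, {0, 4, 2, 7})); // value 4 belongs at 0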
14836
14837/// If we are extracting two 128-bit halves of a vector and shuffling the
14838/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14839/// multi-shuffle lowering.
14840static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14841 SDValue N1, ArrayRef<int> Mask,
14842 SelectionDAG &DAG) {
14843 MVT VT = N0.getSimpleValueType();
14844 assert((VT.is128BitVector() &&
14845         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14846        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14847
14848 // Check that both sources are extracts of the same source vector.
14849 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14850 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14851 N0.getOperand(0) != N1.getOperand(0) ||
14852 !N0.hasOneUse() || !N1.hasOneUse())
14853 return SDValue();
14854
14855 SDValue WideVec = N0.getOperand(0);
14856 MVT WideVT = WideVec.getSimpleValueType();
14857 if (!WideVT.is256BitVector())
14858 return SDValue();
14859
14860 // Match extracts of each half of the wide source vector. Commute the shuffle
14861 // if the extract of the low half is N1.
14862 unsigned NumElts = VT.getVectorNumElements();
14863 SmallVector<int, 4> NewMask(Mask);
14864 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14865 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14866 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14867 ShuffleVectorSDNode::commuteMask(NewMask);
14868 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14869 return SDValue();
14870
14871 // Final bailout: if the mask is simple, we are better off using an extract
14872 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14873 // because that avoids a constant load from memory.
14874 if (NumElts == 4 &&
14875 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14876 return SDValue();
14877
14878 // Extend the shuffle mask with undef elements.
14879 NewMask.append(NumElts, -1);
14880
14881 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14882 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14883 NewMask);
14884 // This is free: ymm -> xmm.
14885 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14886 DAG.getIntPtrConstant(0, DL));
14887}
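// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Shows how the wide mask M' from the comment above is formed once the low
// half is known to be N0; the helper name is hypothetical.
#include <vector>
static std::vector<int> buildWideVpermMask(const std::vector<int> &NarrowMask) {
  // The narrow indices already address the wide vector: 0..NumElts-1 pick the
  // low half and NumElts..2*NumElts-1 pick the high half, so we only need to
  // pad with undef (-1) lanes, mirroring NewMask.append(NumElts, -1) above.
  std::vector<int> Wide(NarrowMask);
  Wide.insert(Wide.end(), NarrowMask.size(), -1);
  return Wide;
}
// Example (v4f32 halves of a v8f32): a narrow mask {0, 2, 4, 6} becomes
// {0, 2, 4, 6, -1, -1, -1, -1} on the 256-bit source, and the result is
// EXTRACT_SUBVECTOR of that wide shuffle at index 0.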
14888
14889/// Try to lower broadcast of a single element.
14890///
14891/// For convenience, this code also bundles all of the subtarget feature set
14892/// filtering. While a little annoying to re-dispatch on type here, there isn't
14893/// a convenient way to factor it out.
14894static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14895 SDValue V2, ArrayRef<int> Mask,
14896 const X86Subtarget &Subtarget,
14897 SelectionDAG &DAG) {
14898 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14899 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14900 (Subtarget.hasAVX2() && VT.isInteger())))
14901 return SDValue();
14902
14903 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14904 // we can only broadcast from a register with AVX2.
14905 unsigned NumEltBits = VT.getScalarSizeInBits();
14906 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14907 ? X86ISD::MOVDDUP
14908 : X86ISD::VBROADCAST;
14909 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14910
14911 // Check that the mask is a broadcast.
14912 int BroadcastIdx = getSplatIndex(Mask);
14913 if (BroadcastIdx < 0)
14914 return SDValue();
14915 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14916 "a sorted mask where the broadcast "
14917 "comes from V1.");
14918
14919 // Go up the chain of (vector) values to find a scalar load that we can
14920 // combine with the broadcast.
14921 // TODO: Combine this logic with findEltLoadSrc() used by
14922 // EltsFromConsecutiveLoads().
14923 int BitOffset = BroadcastIdx * NumEltBits;
14924 SDValue V = V1;
14925 for (;;) {
14926 switch (V.getOpcode()) {
14927 case ISD::BITCAST: {
14928 V = V.getOperand(0);
14929 continue;
14930 }
14931 case ISD::CONCAT_VECTORS: {
14932 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14933 int OpIdx = BitOffset / OpBitWidth;
14934 V = V.getOperand(OpIdx);
14935 BitOffset %= OpBitWidth;
14936 continue;
14937 }
14938 case ISD::EXTRACT_SUBVECTOR: {
14939 // The extraction index adds to the existing offset.
14940 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14941 unsigned Idx = V.getConstantOperandVal(1);
14942 unsigned BeginOffset = Idx * EltBitWidth;
14943 BitOffset += BeginOffset;
14944 V = V.getOperand(0);
14945 continue;
14946 }
14947 case ISD::INSERT_SUBVECTOR: {
14948 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14949 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14950 int Idx = (int)V.getConstantOperandVal(2);
14951 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14952 int BeginOffset = Idx * EltBitWidth;
14953 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14954 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14955 BitOffset -= BeginOffset;
14956 V = VInner;
14957 } else {
14958 V = VOuter;
14959 }
14960 continue;
14961 }
14962 }
14963 break;
14964 }
14965 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14966 BroadcastIdx = BitOffset / NumEltBits;
14967
14968 // Do we need to bitcast the source to retrieve the original broadcast index?
14969 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14970
14971 // Check if this is a broadcast of a scalar. We special case lowering
14972 // for scalars so that we can more effectively fold with loads.
14973 // If the original value has a larger element type than the shuffle, the
14974 // broadcast element is in essence truncated. Make that explicit to ease
14975 // folding.
14976 if (BitCastSrc && VT.isInteger())
14977 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14978 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14979 return TruncBroadcast;
14980
14981 // Also check the simpler case, where we can directly reuse the scalar.
14982 if (!BitCastSrc &&
14983 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14984 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14985 V = V.getOperand(BroadcastIdx);
14986
14987 // If we can't broadcast from a register, check that the input is a load.
14988 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14989 return SDValue();
14990 } else if (ISD::isNormalLoad(V.getNode()) &&
14991 cast<LoadSDNode>(V)->isSimple()) {
14992 // We do not check for one-use of the vector load because a broadcast load
14993 // is expected to be a win for code size, register pressure, and possibly
14994 // uops even if the original vector load is not eliminated.
14995
14996 // Reduce the vector load and shuffle to a broadcasted scalar load.
14997 LoadSDNode *Ld = cast<LoadSDNode>(V);
14998 SDValue BaseAddr = Ld->getOperand(1);
14999 MVT SVT = VT.getScalarType();
15000 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
15001 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
15002 SDValue NewAddr =
15003 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
15004
15005 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
15006 // than MOVDDUP.
15007 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
15008 if (Opcode == X86ISD::VBROADCAST) {
15009 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
15010 SDValue Ops[] = {Ld->getChain(), NewAddr};
15011 V = DAG.getMemIntrinsicNode(
15012 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
15013 DAG.getMachineFunction().getMachineMemOperand(
15014 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15015 DAG.makeEquivalentMemoryOrdering(Ld, V);
15016 return DAG.getBitcast(VT, V);
15017 }
15018 assert(SVT == MVT::f64 && "Unexpected VT!");
15019 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
15020 DAG.getMachineFunction().getMachineMemOperand(
15021 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15022 DAG.makeEquivalentMemoryOrdering(Ld, V);
15023 } else if (!BroadcastFromReg) {
15024 // We can't broadcast from a vector register.
15025 return SDValue();
15026 } else if (BitOffset != 0) {
15027 // We can only broadcast from the zero-element of a vector register,
15028 // but it can be advantageous to broadcast from the zero-element of a
15029 // subvector.
15030 if (!VT.is256BitVector() && !VT.is512BitVector())
15031 return SDValue();
15032
15033 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15034 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15035 return SDValue();
15036
15037 // Only broadcast the zero-element of a 128-bit subvector.
15038 if ((BitOffset % 128) != 0)
15039 return SDValue();
15040
15041 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15042 "Unexpected bit-offset");
15043 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15044 "Unexpected vector size");
15045 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15046 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15047 }
15048
15049 // On AVX we can use VBROADCAST directly for scalar sources.
15050 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15051 V = DAG.getBitcast(MVT::f64, V);
15052 if (Subtarget.hasAVX()) {
15053 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15054 return DAG.getBitcast(VT, V);
15055 }
15056 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15057 }
15058
15059 // If this is a scalar, do the broadcast on this type and bitcast.
15060 if (!V.getValueType().isVector()) {
15061 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15062 "Unexpected scalar size");
15063 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15064 VT.getVectorNumElements());
15065 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15066 }
15067
15068 // We only support broadcasting from 128-bit vectors to minimize the
15069 // number of patterns we need to deal with in isel. So extract down to
15070 // 128-bits, removing as many bitcasts as possible.
15071 if (V.getValueSizeInBits() > 128)
15072 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15073
15074 // Otherwise cast V to a vector with the same element type as VT, but
15075 // possibly narrower than VT. Then perform the broadcast.
15076 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15077 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15078 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15079}
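// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Models only the bit-offset arithmetic performed by the walk above; the DAG
// nodes are abstracted away and the helper names are hypothetical.
static int concatVectorsStep(int BitOffset, int OpBitWidth, int &OpIdx) {
  // CONCAT_VECTORS: select the operand holding the offset, keep the remainder.
  OpIdx = BitOffset / OpBitWidth;
  return BitOffset % OpBitWidth;
}
static int extractSubvectorStep(int BitOffset, int EltBitWidth, int ExtractIdx) {
  // EXTRACT_SUBVECTOR: the extraction index adds to the existing offset.
  return BitOffset + ExtractIdx * EltBitWidth;
}
// Example: broadcasting element 5 of a v8i32 gives BitOffset = 160; if that
// vector was concat(v4i32 A, v4i32 B), the walk resolves to operand B with
// BitOffset 32, i.e. element 1 of B.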
15080
15081// Check for whether we can use INSERTPS to perform the shuffle. We only use
15082// INSERTPS when the V1 elements are already in the correct locations
15083// because otherwise we can just always use two SHUFPS instructions which
15084// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15085// perform INSERTPS if a single V1 element is out of place and all V2
15086// elements are zeroable.
15087static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15088 unsigned &InsertPSMask,
15089 const APInt &Zeroable,
15090 ArrayRef<int> Mask, SelectionDAG &DAG) {
15091 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15092 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15093 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15094
15095 // Attempt to match INSERTPS with one element from VA or VB being
15096 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15097 // are updated.
15098 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15099 ArrayRef<int> CandidateMask) {
15100 unsigned ZMask = 0;
15101 int VADstIndex = -1;
15102 int VBDstIndex = -1;
15103 bool VAUsedInPlace = false;
15104
15105 for (int i = 0; i < 4; ++i) {
15106 // Synthesize a zero mask from the zeroable elements (includes undefs).
15107 if (Zeroable[i]) {
15108 ZMask |= 1 << i;
15109 continue;
15110 }
15111
15112 // Flag if we use any VA inputs in place.
15113 if (i == CandidateMask[i]) {
15114 VAUsedInPlace = true;
15115 continue;
15116 }
15117
15118 // We can only insert a single non-zeroable element.
15119 if (VADstIndex >= 0 || VBDstIndex >= 0)
15120 return false;
15121
15122 if (CandidateMask[i] < 4) {
15123 // VA input out of place for insertion.
15124 VADstIndex = i;
15125 } else {
15126 // VB input for insertion.
15127 VBDstIndex = i;
15128 }
15129 }
15130
15131 // Don't bother if we have no (non-zeroable) element for insertion.
15132 if (VADstIndex < 0 && VBDstIndex < 0)
15133 return false;
15134
15135 // Determine element insertion src/dst indices. The src index is from the
15136 // start of the inserted vector, not the start of the concatenated vector.
15137 unsigned VBSrcIndex = 0;
15138 if (VADstIndex >= 0) {
15139 // If we have a VA input out of place, we use VA as the V2 element
15140 // insertion and don't use the original V2 at all.
15141 VBSrcIndex = CandidateMask[VADstIndex];
15142 VBDstIndex = VADstIndex;
15143 VB = VA;
15144 } else {
15145 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15146 }
15147
15148 // If no V1 inputs are used in place, then the result is created only from
15149 // the zero mask and the V2 insertion - so remove V1 dependency.
15150 if (!VAUsedInPlace)
15151 VA = DAG.getUNDEF(MVT::v4f32);
15152
15153 // Update V1, V2 and InsertPSMask accordingly.
15154 V1 = VA;
15155 V2 = VB;
15156
15157 // Insert the V2 element into the desired position.
15158 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15159 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15160 return true;
15161 };
15162
15163 if (matchAsInsertPS(V1, V2, Mask))
15164 return true;
15165
15166 // Commute and try again.
15167 SmallVector<int, 4> CommutedMask(Mask);
15168 ShuffleVectorSDNode::commuteMask(CommutedMask);
15169 if (matchAsInsertPS(V2, V1, CommutedMask))
15170 return true;
15171
15172 return false;
15173}
15174
15175static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15176 ArrayRef<int> Mask, const APInt &Zeroable,
15177 SelectionDAG &DAG) {
15178 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15179 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15180
15181 // Attempt to match the insertps pattern.
15182 unsigned InsertPSMask = 0;
15183 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15184 return SDValue();
15185
15186 // Insert the V2 element into the desired position.
15187 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15188 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15189}
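// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Shows the layout of the InsertPSMask byte built by the matcher above; the
// encoder name is hypothetical.
static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZMask) {
  // Bits [7:6] select the source element of V2, bits [5:4] the destination
  // slot in V1, and bits [3:0] zero individual result lanes, mirroring
  // "VBSrcIndex << 6 | VBDstIndex << 4 | ZMask" above.
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}
// Example: inserting V2 element 2 into slot 1 while zeroing lane 3 gives
// encodeInsertPSImm(2, 1, 0x8) == 0x98.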
15190
15191/// Handle lowering of 2-lane 64-bit floating point shuffles.
15192///
15193/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15194/// support for floating point shuffles but not integer shuffles. These
15195/// instructions will incur a domain crossing penalty on some chips though so
15196/// it is better to avoid lowering through this for integer vectors where
15197/// possible.
15198static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15199 const APInt &Zeroable, SDValue V1, SDValue V2,
15200 const X86Subtarget &Subtarget,
15201 SelectionDAG &DAG) {
15202 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15203 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15204 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15205
15206 if (V2.isUndef()) {
15207 // Check for being able to broadcast a single element.
15208 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15209 Mask, Subtarget, DAG))
15210 return Broadcast;
15211
15212 // Straight shuffle of a single input vector. Simulate this by using the
15213 // single input as both of the "inputs" to this instruction.
15214 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15215
15216 if (Subtarget.hasAVX()) {
15217 // If we have AVX, we can use VPERMILPS which will allow folding a load
15218 // into the shuffle.
15219 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15220 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15221 }
15222
15223 return DAG.getNode(
15224 X86ISD::SHUFP, DL, MVT::v2f64,
15225 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15226 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15227 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15228 }
15229 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15230 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15231 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15232 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15233
15234 if (Subtarget.hasAVX2())
15235 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15236 return Extract;
15237
15238 // When loading a scalar and then shuffling it into a vector we can often do
15239 // the insertion cheaply.
15240 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15241 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15242 return Insertion;
15243 // Try inverting the insertion since for v2 masks it is easy to do and we
15244 // can't reliably sort the mask one way or the other.
15245 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15246 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15247 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15248 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15249 return Insertion;
15250
15251 // Try to use one of the special instruction patterns to handle two common
15252 // blend patterns if a zero-blend above didn't work.
15253 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15254 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15255 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15256 // We can either use a special instruction to load over the low double or
15257 // to move just the low double.
15258 return DAG.getNode(
15259 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15260 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15261
15262 if (Subtarget.hasSSE41())
15263 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15264 Zeroable, Subtarget, DAG))
15265 return Blend;
15266
15267 // Use dedicated unpack instructions for masks that match their pattern.
15268 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15269 return V;
15270
15271 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15272 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15273 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15274}
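// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Models the two-bit SHUFPD immediate built in the two-input path above; the
// helper name is hypothetical.
static unsigned buildShufpdImm(int Mask0, int Mask1) {
  // Bit 0 selects element 0 or 1 of the first source for result lane 0; bit 1
  // does the same within the second source for lane 1. Mask1 is in [2, 3], so
  // subtract 2 first, as in "(Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1)".
  return unsigned(Mask0 == 1) | (unsigned((Mask1 - 2) == 1) << 1);
}
// Example: the mask {1, 2} (high element of V1, low element of V2) encodes as
// immediate 0b01.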
15275
15276/// Handle lowering of 2-lane 64-bit integer shuffles.
15277///
15278/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15279/// the integer unit to minimize domain crossing penalties. However, for blends
15280/// it falls back to the floating point shuffle operation with appropriate bit
15281/// casting.
15282static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15283 const APInt &Zeroable, SDValue V1, SDValue V2,
15284 const X86Subtarget &Subtarget,
15285 SelectionDAG &DAG) {
15286 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15287 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15288 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15289
15290 if (V2.isUndef()) {
15291 // Check for being able to broadcast a single element.
15292 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15293 Mask, Subtarget, DAG))
15294 return Broadcast;
15295
15296 // Straight shuffle of a single input vector. For everything from SSE2
15297 // onward this has a single fast instruction with no scary immediates.
15298 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15299 V1 = DAG.getBitcast(MVT::v4i32, V1);
15300 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15301 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15302 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15303 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15304 return DAG.getBitcast(
15305 MVT::v2i64,
15306 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15307 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15308 }
15309 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15310 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15311 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15312 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15313
15314 if (Subtarget.hasAVX2())
15315 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15316 return Extract;
15317
15318 // Try to use shift instructions.
15319 if (SDValue Shift =
15320 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15321 DAG, /*BitwiseOnly*/ false))
15322 return Shift;
15323
15324 // When loading a scalar and then shuffling it into a vector we can often do
15325 // the insertion cheaply.
15326 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15327 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15328 return Insertion;
15329 // Try inverting the insertion since for v2 masks it is easy to do and we
15330 // can't reliably sort the mask one way or the other.
15331 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15332 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15333 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15334 return Insertion;
15335
15336 // We have different paths for blend lowering, but they all must use the
15337 // *exact* same predicate.
15338 bool IsBlendSupported = Subtarget.hasSSE41();
15339 if (IsBlendSupported)
15340 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15341 Zeroable, Subtarget, DAG))
15342 return Blend;
15343
15344 // Use dedicated unpack instructions for masks that match their pattern.
15345 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15346 return V;
15347
15348 // Try to use byte rotation instructions.
15349 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15350 if (Subtarget.hasSSSE3()) {
15351 if (Subtarget.hasVLX())
15352 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15353 Subtarget, DAG))
15354 return Rotate;
15355
15356 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15357 Subtarget, DAG))
15358 return Rotate;
15359 }
15360
15361 // If we have direct support for blends, we should lower by decomposing into
15362 // a permute. That will be faster than the domain cross.
15363 if (IsBlendSupported)
15364 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15365 Subtarget, DAG);
15366
15367 // We implement this with SHUFPD which is pretty lame because it will likely
15368 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15369 // However, all the alternatives are still more cycles and newer chips don't
15370 // have this problem. It would be really nice if x86 had better shuffles here.
15371 V1 = DAG.getBitcast(MVT::v2f64, V1);
15372 V2 = DAG.getBitcast(MVT::v2f64, V2);
15373 return DAG.getBitcast(MVT::v2i64,
15374 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15375}
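// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Models the v2i64 -> v4i32 mask widening used by the single-input PSHUFD path
// above; the helper name is hypothetical.
static void widenV2MaskToV4(const int Mask[2], int Widened[4]) {
  // Each 64-bit element k expands to the pair of 32-bit elements 2k and 2k+1;
  // undef (-1) lanes stay undef.
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}
// Example: the v2i64 mask {1, -1} widens to the v4i32 mask {2, 3, -1, -1}.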
15376
15377/// Lower a vector shuffle using the SHUFPS instruction.
15378///
15379/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15380/// It makes no assumptions about whether this is the *best* lowering, it simply
15381/// uses it.
15382static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15383 ArrayRef<int> Mask, SDValue V1,
15384 SDValue V2, SelectionDAG &DAG) {
15385 SDValue LowV = V1, HighV = V2;
15386 SmallVector<int, 4> NewMask(Mask);
15387 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15388
15389 if (NumV2Elements == 1) {
15390 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15391
15392 // Compute the index adjacent to V2Index and in the same half by toggling
15393 // the low bit.
15394 int V2AdjIndex = V2Index ^ 1;
15395
15396 if (Mask[V2AdjIndex] < 0) {
15397 // Handles all the cases where we have a single V2 element and an undef.
15398 // This will only ever happen in the high lanes because we commute the
15399 // vector otherwise.
15400 if (V2Index < 2)
15401 std::swap(LowV, HighV);
15402 NewMask[V2Index] -= 4;
15403 } else {
15404 // Handle the case where the V2 element ends up adjacent to a V1 element.
15405 // To make this work, blend them together as the first step.
15406 int V1Index = V2AdjIndex;
15407 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15408 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15409 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15410
15411 // Now proceed to reconstruct the final blend as we have the necessary
15412 // high or low half formed.
15413 if (V2Index < 2) {
15414 LowV = V2;
15415 HighV = V1;
15416 } else {
15417 HighV = V2;
15418 }
15419 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15420 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15421 }
15422 } else if (NumV2Elements == 2) {
15423 if (Mask[0] < 4 && Mask[1] < 4) {
15424 // Handle the easy case where we have V1 in the low lanes and V2 in the
15425 // high lanes.
15426 NewMask[2] -= 4;
15427 NewMask[3] -= 4;
15428 } else if (Mask[2] < 4 && Mask[3] < 4) {
15429 // We also handle the reversed case because this utility may get called
15430 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15431 // arrange things in the right direction.
15432 NewMask[0] -= 4;
15433 NewMask[1] -= 4;
15434 HighV = V1;
15435 LowV = V2;
15436 } else {
15437 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15438 // trying to place elements directly, just blend them and set up the final
15439 // shuffle to place them.
15440
15441 // The first two blend mask elements are for V1, the second two are for
15442 // V2.
15443 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15444 Mask[2] < 4 ? Mask[2] : Mask[3],
15445 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15446 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15447 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15448 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15449
15450 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15451 // a blend.
15452 LowV = HighV = V1;
15453 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15454 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15455 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15456 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15457 }
15458 } else if (NumV2Elements == 3) {
15459 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15460 // we can get here due to other paths (e.g. repeated mask matching) that we
15461 // don't want to do another round of lowerVECTOR_SHUFFLE.
15462 ShuffleVectorSDNode::commuteMask(NewMask);
15463 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15464 }
15465 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15466 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15467}
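// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Approximates what getV4X86ShuffleImm8ForMask packs into the SHUFPS
// immediate; the helper name is hypothetical and undef lanes are simplified
// to "treat as 0".
static unsigned packV4ShuffleImm8(const int Mask[4]) {
  // Result lane i gets a 2-bit selector in bits [2*i+1 : 2*i]; lanes 0 and 1
  // read from the first operand, lanes 2 and 3 from the second.
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (2 * i);
  return Imm;
}
// Example: NewMask = {2, 0, 1, 3} packs to 0xD2 (binary 11 01 00 10).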
15468
15469/// Lower 4-lane 32-bit floating point shuffles.
15470///
15471/// Uses instructions exclusively from the floating point unit to minimize
15472/// domain crossing penalties, as these are sufficient to implement all v4f32
15473/// shuffles.
15474static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15475 const APInt &Zeroable, SDValue V1, SDValue V2,
15476 const X86Subtarget &Subtarget,
15477 SelectionDAG &DAG) {
15478 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15479 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15480 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15481
15482 if (Subtarget.hasSSE41())
15483 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15484 Zeroable, Subtarget, DAG))
15485 return Blend;
15486
15487 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15488
15489 if (NumV2Elements == 0) {
15490 // Check for being able to broadcast a single element.
15491 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15492 Mask, Subtarget, DAG))
15493 return Broadcast;
15494
15495 // Use even/odd duplicate instructions for masks that match their pattern.
15496 if (Subtarget.hasSSE3()) {
15497 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15498 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15499 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15500 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15501 }
15502
15503 if (Subtarget.hasAVX()) {
15504 // If we have AVX, we can use VPERMILPS which will allow folding a load
15505 // into the shuffle.
15506 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15507 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15508 }
15509
15510 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15511 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15512 if (!Subtarget.hasSSE2()) {
15513 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15514 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15515 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15516 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15517 }
15518
15519 // Otherwise, use a straight shuffle of a single input vector. We pass the
15520 // input vector to both operands to simulate this with a SHUFPS.
15521 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15522 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15523 }
15524
15525 if (Subtarget.hasSSE2())
15526 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15527 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15528 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15529 return ZExt;
15530 }
15531
15532 if (Subtarget.hasAVX2())
15533 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15534 return Extract;
15535
15536 // There are special ways we can lower some single-element blends. However, we
15537 // have custom ways we can lower more complex single-element blends below that
15538 // we defer to if both this and BLENDPS fail to match, so restrict this to
15539 // when the V2 input is targeting element 0 of the mask -- that is the fast
15540 // case here.
15541 if (NumV2Elements == 1 && Mask[0] >= 4)
15542 if (SDValue V = lowerShuffleAsElementInsertion(
15543 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15544 return V;
15545
15546 if (Subtarget.hasSSE41()) {
15547 // Use INSERTPS if we can complete the shuffle efficiently.
15548 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15549 return V;
15550
15551 if (!isSingleSHUFPSMask(Mask))
15552 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15553 V2, Mask, DAG))
15554 return BlendPerm;
15555 }
15556
15557 // Use low/high mov instructions. These are only valid in SSE1 because
15558 // otherwise they are widened to v2f64 and never get here.
15559 if (!Subtarget.hasSSE2()) {
15560 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15561 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15562 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15563 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15564 }
15565
15566 // Use dedicated unpack instructions for masks that match their pattern.
15567 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15568 return V;
15569
15570 // Otherwise fall back to a SHUFPS lowering strategy.
15571 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15572}
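// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// The v4f32/v4i32 lowerings above begin by counting how many result lanes come
// from V2; a standalone model of that classification (helper name is
// hypothetical):
static int countV2Elements(const int Mask[4]) {
  // Indices 0..3 address V1 and 4..7 address V2, matching the
  // count_if(Mask, [](int M) { return M >= 4; }) calls above.
  int N = 0;
  for (int i = 0; i != 4; ++i)
    N += Mask[i] >= 4;
  return N;
}
// Example: {0, 1, 2, 3} is a unary shuffle (0 lanes from V2), {4, 1, 2, 3} is
// a single-element insertion candidate, and {0, 5, 2, 7} is a blend-style mix.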
15573
15574/// Lower 4-lane i32 vector shuffles.
15575///
15576/// We try to handle these with integer-domain shuffles where we can, but for
15577/// blends we use the floating point domain blend instructions.
15578static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15579 const APInt &Zeroable, SDValue V1, SDValue V2,
15580 const X86Subtarget &Subtarget,
15581 SelectionDAG &DAG) {
15582 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15583 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15584 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15585
15586 // Whenever we can lower this as a zext, that instruction is strictly faster
15587 // than any alternative. It also allows us to fold memory operands into the
15588 // shuffle in many cases.
15589 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15590 Zeroable, Subtarget, DAG))
15591 return ZExt;
15592
15593 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15594
15595 // Try to use shift instructions if fast.
15596 if (Subtarget.preferLowerShuffleAsShift()) {
15597 if (SDValue Shift =
15598 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15599 Subtarget, DAG, /*BitwiseOnly*/ true))
15600 return Shift;
15601 if (NumV2Elements == 0)
15602 if (SDValue Rotate =
15603 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15604 return Rotate;
15605 }
15606
15607 if (NumV2Elements == 0) {
15608 // Try to use broadcast unless the mask only has one non-undef element.
15609 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15610 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15611 Mask, Subtarget, DAG))
15612 return Broadcast;
15613 }
15614
15615 // Straight shuffle of a single input vector. For everything from SSE2
15616 // onward this has a single fast instruction with no scary immediates.
15617 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15618 // but we aren't actually going to use the UNPCK instruction because doing
15619 // so prevents folding a load into this instruction or making a copy.
15620 const int UnpackLoMask[] = {0, 0, 1, 1};
15621 const int UnpackHiMask[] = {2, 2, 3, 3};
15622 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15623 Mask = UnpackLoMask;
15624 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15625 Mask = UnpackHiMask;
15626
15627 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15628 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15629 }
15630
15631 if (Subtarget.hasAVX2())
15632 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15633 return Extract;
15634
15635 // Try to use shift instructions.
15636 if (SDValue Shift =
15637 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15638 DAG, /*BitwiseOnly*/ false))
15639 return Shift;
15640
15641 // There are special ways we can lower some single-element blends.
15642 if (NumV2Elements == 1)
15643 if (SDValue V = lowerShuffleAsElementInsertion(
15644 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15645 return V;
15646
15647 // We have different paths for blend lowering, but they all must use the
15648 // *exact* same predicate.
15649 bool IsBlendSupported = Subtarget.hasSSE41();
15650 if (IsBlendSupported)
15651 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15652 Zeroable, Subtarget, DAG))
15653 return Blend;
15654
15655 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15656 Zeroable, Subtarget, DAG))
15657 return Masked;
15658
15659 // Use dedicated unpack instructions for masks that match their pattern.
15660 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15661 return V;
15662
15663 // Try to use byte rotation instructions.
15664 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15665 if (Subtarget.hasSSSE3()) {
15666 if (Subtarget.hasVLX())
15667 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15668 Subtarget, DAG))
15669 return Rotate;
15670
15671 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15672 Subtarget, DAG))
15673 return Rotate;
15674 }
15675
15676 // Assume that a single SHUFPS is faster than an alternative sequence of
15677 // multiple instructions (even if the CPU has a domain penalty).
15678 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15679 if (!isSingleSHUFPSMask(Mask)) {
15680 // If we have direct support for blends, we should lower by decomposing into
15681 // a permute. That will be faster than the domain cross.
15682 if (IsBlendSupported)
15683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15684 Subtarget, DAG);
15685
15686 // Try to lower by permuting the inputs into an unpack instruction.
15687 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15688 Mask, Subtarget, DAG))
15689 return Unpack;
15690 }
15691
15692 // We implement this with SHUFPS because it can blend from two vectors.
15693 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15694 // up the inputs, bypassing domain shift penalties that we would incur if we
15695 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15696 // relevant.
15697 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15698 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15699 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15700 return DAG.getBitcast(MVT::v4i32, ShufPS);
15701}
15702
15703/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15704/// shuffle lowering, and the most complex part.
15705///
15706/// The lowering strategy is to try to form pairs of input lanes which are
15707/// targeted at the same half of the final vector, and then use a dword shuffle
15708/// to place them onto the right half, and finally unpack the paired lanes into
15709/// their final position.
15710///
15711/// The exact breakdown of how to form these dword pairs and align them on the
15712/// correct sides is really tricky. See the comments within the function for
15713/// more of the details.
15714///
15715/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15716/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15717/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15718/// vector, form the analogous 128-bit 8-element Mask.
15719static SDValue lowerV8I16GeneralSingleInputShuffle(
15720 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15721 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15722 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15723 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15724
15725 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15726 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15727 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15728
15729 // Attempt to directly match PSHUFLW or PSHUFHW.
15730 if (isUndefOrInRange(LoMask, 0, 4) &&
15731 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15732 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15733 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15734 }
15735 if (isUndefOrInRange(HiMask, 4, 8) &&
15736 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15737 for (int i = 0; i != 4; ++i)
15738 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15739 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15740 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15741 }
15742
15743 SmallVector<int, 4> LoInputs;
15744 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15745 array_pod_sort(LoInputs.begin(), LoInputs.end());
15746 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15747 SmallVector<int, 4> HiInputs;
15748 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15749 array_pod_sort(HiInputs.begin(), HiInputs.end());
15750 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15751 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15752 int NumHToL = LoInputs.size() - NumLToL;
15753 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15754 int NumHToH = HiInputs.size() - NumLToH;
15755 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15756 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15757 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15758 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15759
15760 // If we are shuffling values from one half - check how many different DWORD
15761 // pairs we need to create. If only 1 or 2 then we can perform this as a
15762 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15763 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15764 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15765 V = DAG.getNode(ShufWOp, DL, VT, V,
15766 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15767 V = DAG.getBitcast(PSHUFDVT, V);
15768 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15769 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15770 return DAG.getBitcast(VT, V);
15771 };
15772
15773 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15774 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15775 SmallVector<std::pair<int, int>, 4> DWordPairs;
15776 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15777
15778 // Collect the different DWORD pairs.
15779 for (int DWord = 0; DWord != 4; ++DWord) {
15780 int M0 = Mask[2 * DWord + 0];
15781 int M1 = Mask[2 * DWord + 1];
15782 M0 = (M0 >= 0 ? M0 % 4 : M0);
15783 M1 = (M1 >= 0 ? M1 % 4 : M1);
15784 if (M0 < 0 && M1 < 0)
15785 continue;
15786
15787 bool Match = false;
15788 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15789 auto &DWordPair = DWordPairs[j];
15790 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15791 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15792 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15793 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15794 PSHUFDMask[DWord] = DOffset + j;
15795 Match = true;
15796 break;
15797 }
15798 }
15799 if (!Match) {
15800 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15801 DWordPairs.push_back(std::make_pair(M0, M1));
15802 }
15803 }
15804
15805 if (DWordPairs.size() <= 2) {
15806 DWordPairs.resize(2, std::make_pair(-1, -1));
15807 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15808 DWordPairs[1].first, DWordPairs[1].second};
15809 if ((NumHToL + NumHToH) == 0)
15810 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15811 if ((NumLToL + NumLToH) == 0)
15812 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15813 }
15814 }
15815
15816 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15817 // such inputs we can swap two of the dwords across the half mark and end up
15818 // with <=2 inputs to each half in each half. Once there, we can fall through
15819 // to the generic code below. For example:
15820 //
15821 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15822 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15823 //
15824 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15825 // and an existing 2-into-2 on the other half. In this case we may have to
15826 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15827 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15828 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15829 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15830 // half than the one we target for fixing) will be fixed when we re-enter this
15831 // path. We will also combine away any sequence of PSHUFD instructions that
15832 // result into a single instruction. Here is an example of the tricky case:
15833 //
15834 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15835 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15836 //
15837 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15838 //
15839 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15840 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15841 //
15842 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15843 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15844 //
15845 // The result is fine to be handled by the generic logic.
15846 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15847 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15848 int AOffset, int BOffset) {
15849 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15850 "Must call this with A having 3 or 1 inputs from the A half.");
15851 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15852 "Must call this with B having 1 or 3 inputs from the B half.");
15853 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15854 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15855
15856 bool ThreeAInputs = AToAInputs.size() == 3;
15857
15858 // Compute the index of the dword with only one word among the three inputs in
15859 // a half by taking the sum of the half with three inputs and subtracting
15860 // the sum of the actual three inputs. The difference is the remaining
15861 // slot.
15862 int ADWord = 0, BDWord = 0;
15863 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15864 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15865 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15866 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15867 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15868 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15869 int TripleNonInputIdx =
15870 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15871 TripleDWord = TripleNonInputIdx / 2;
15872
15873 // We use xor with one to compute the adjacent DWord to whichever one the
15874 // OneInput is in.
15875 OneInputDWord = (OneInput / 2) ^ 1;
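 // For a concrete example of the two computations above: with AOffset == 4,
 // AToAInputs == {4, 6, 7} and BToAInputs == {1}, TripleInputSum is
 // (0+1+2+3) + 4*4 == 22 and the three inputs sum to 17, so
 // TripleNonInputIdx == 5 and TripleDWord == 2 (the dword of the triple half
 // that still contains a free word). OneInput == 1 lies in dword 0, so
 // OneInputDWord == (1 / 2) ^ 1 == 1, the dword adjacent to it.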
15876
15877 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15878 // and BToA inputs. If there is also such a problem with the BToB and AToB
15879 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15880 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15881 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15882 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15883 // Compute how many inputs will be flipped by swapping these DWords. We
15884 // need
15885 // to balance this to ensure we don't form a 3-1 shuffle in the other
15886 // half.
15887 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15888 llvm::count(AToBInputs, 2 * ADWord + 1);
15889 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15890 llvm::count(BToBInputs, 2 * BDWord + 1);
15891 if ((NumFlippedAToBInputs == 1 &&
15892 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15893 (NumFlippedBToBInputs == 1 &&
15894 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15895 // We choose whether to fix the A half or B half based on whether that
15896 // half has zero flipped inputs. At zero, we may not be able to fix it
15897 // with that half. We also bias towards fixing the B half because that
15898 // will more commonly be the high half, and we have to bias one way.
15899 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15900 ArrayRef<int> Inputs) {
15901 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15902 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15903 // Determine whether the free index is in the flipped dword or the
15904 // unflipped dword based on where the pinned index is. We use this bit
15905 // in an xor to conditionally select the adjacent dword.
15906 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15907 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15908 if (IsFixIdxInput == IsFixFreeIdxInput)
15909 FixFreeIdx += 1;
15910 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15911           assert(IsFixIdxInput != IsFixFreeIdxInput &&
15912                  "We need to be changing the number of flipped inputs!");
15913 int PSHUFHalfMask[] = {0, 1, 2, 3};
15914 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15915 V = DAG.getNode(
15916 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15917 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15918 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15919
15920 for (int &M : Mask)
15921 if (M >= 0 && M == FixIdx)
15922 M = FixFreeIdx;
15923 else if (M >= 0 && M == FixFreeIdx)
15924 M = FixIdx;
15925 };
15926 if (NumFlippedBToBInputs != 0) {
15927 int BPinnedIdx =
15928 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15929 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15930 } else {
15931           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15932 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15933 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15934 }
15935 }
15936 }
15937
15938 int PSHUFDMask[] = {0, 1, 2, 3};
15939 PSHUFDMask[ADWord] = BDWord;
15940 PSHUFDMask[BDWord] = ADWord;
15941 V = DAG.getBitcast(
15942 VT,
15943 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15944 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15945
15946 // Adjust the mask to match the new locations of A and B.
15947 for (int &M : Mask)
15948 if (M >= 0 && M/2 == ADWord)
15949 M = 2 * BDWord + M % 2;
15950 else if (M >= 0 && M/2 == BDWord)
15951 M = 2 * ADWord + M % 2;
15952
15953 // Recurse back into this routine to re-compute state now that this isn't
15954 // a 3 and 1 problem.
15955 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15956 };
15957 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15958 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15959 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15960 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15961
15962 // At this point there are at most two inputs to the low and high halves from
15963 // each half. That means the inputs can always be grouped into dwords and
15964 // those dwords can then be moved to the correct half with a dword shuffle.
15965 // We use at most one low and one high word shuffle to collect these paired
15966 // inputs into dwords, and finally a dword shuffle to place them.
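 // For example, if the remaining mask were {4, 5, 6, 7, 0, 1, 2, 3} (each
 // half wanting the other half's dwords), the word shuffles below would be
 // no-ops and the whole fix-up would collapse to a single PSHUFD with dword
 // mask {2, 3, 0, 1}. Whether such a mask actually reaches this code depends
 // on the earlier lowering steps; the numbers are shown only to illustrate
 // the dword-grouping idea.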
15967 int PSHUFLMask[4] = {-1, -1, -1, -1};
15968 int PSHUFHMask[4] = {-1, -1, -1, -1};
15969 int PSHUFDMask[4] = {-1, -1, -1, -1};
15970
15971 // First fix the masks for all the inputs that are staying in their
15972 // original halves. This will then dictate the targets of the cross-half
15973 // shuffles.
15974 auto fixInPlaceInputs =
15975 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15976 MutableArrayRef<int> SourceHalfMask,
15977 MutableArrayRef<int> HalfMask, int HalfOffset) {
15978 if (InPlaceInputs.empty())
15979 return;
15980 if (InPlaceInputs.size() == 1) {
15981 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15982 InPlaceInputs[0] - HalfOffset;
15983 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15984 return;
15985 }
15986 if (IncomingInputs.empty()) {
15987 // Just fix all of the in place inputs.
15988 for (int Input : InPlaceInputs) {
15989 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15990 PSHUFDMask[Input / 2] = Input / 2;
15991 }
15992 return;
15993 }
15994
15995         assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15996 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15997 InPlaceInputs[0] - HalfOffset;
15998 // Put the second input next to the first so that they are packed into
15999 // a dword. We find the adjacent index by toggling the low bit.
16000 int AdjIndex = InPlaceInputs[0] ^ 1;
16001 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
16002 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
16003 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
16004 };
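 // As an example of the two-input case above: with HalfOffset == 0 and
 // InPlaceInputs == {0, 3}, SourceHalfMask becomes {0, 3, -1, -1}, every 3 in
 // HalfMask is rewritten to 1, and PSHUFDMask[0] is pinned to 0, leaving
 // dword 1 of this half free for the incoming cross-half inputs.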
16005 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
16006 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
16007
16008 // Now gather the cross-half inputs and place them into a free dword of
16009 // their target half.
16010 // FIXME: This operation could almost certainly be simplified dramatically to
16011 // look more like the 3-1 fixing operation.
16012 auto moveInputsToRightHalf = [&PSHUFDMask](
16013 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
16014 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
16015 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
16016 int DestOffset) {
16017 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
16018 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
16019 };
16020 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
16021 int Word) {
16022 int LowWord = Word & ~1;
16023 int HighWord = Word | 1;
16024 return isWordClobbered(SourceHalfMask, LowWord) ||
16025 isWordClobbered(SourceHalfMask, HighWord);
16026 };
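 // E.g. a SourceHalfMask of {2, -1, -1, -1} means word 0 of this half has
 // already been repurposed to hold word 2, so word 0 (and hence dword 0) is
 // considered clobbered while dword 1 is still clean.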
16027
16028 if (IncomingInputs.empty())
16029 return;
16030
16031 if (ExistingInputs.empty()) {
16032 // Map any dwords with inputs from them into the right half.
16033 for (int Input : IncomingInputs) {
16034 // If the source half mask maps over the inputs, turn those into
16035 // swaps and use the swapped lane.
16036 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16037 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16038 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16039 Input - SourceOffset;
16040 // We have to swap the uses in our half mask in one sweep.
16041 for (int &M : HalfMask)
16042 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16043 M = Input;
16044 else if (M == Input)
16045 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16046 } else {
16047             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16048                        Input - SourceOffset &&
16049                    "Previous placement doesn't match!");
16050 }
16051 // Note that this correctly re-maps both when we do a swap and when
16052 // we observe the other side of the swap above. We rely on that to
16053 // avoid swapping the members of the input list directly.
16054 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16055 }
16056
16057 // Map the input's dword into the correct half.
16058 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16059 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16060 else
16061           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16062                      Input / 2 &&
16063                  "Previous placement doesn't match!");
16064 }
16065
16066 // And just directly shift any other-half mask elements to be same-half
16067 // as we will have mirrored the dword containing the element into the
16068 // same position within that half.
16069 for (int &M : HalfMask)
16070 if (M >= SourceOffset && M < SourceOffset + 4) {
16071 M = M - SourceOffset + DestOffset;
16072           assert(M >= 0 && "This should never wrap below zero!");
16073 }
16074 return;
16075 }
16076
16077 // Ensure we have the input in a viable dword of its current half. This
16078 // is particularly tricky because the original position may be clobbered
16079 // by inputs being moved and *staying* in that half.
16080 if (IncomingInputs.size() == 1) {
16081 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16082 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16083 SourceOffset;
16084 SourceHalfMask[InputFixed - SourceOffset] =
16085 IncomingInputs[0] - SourceOffset;
16086 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16087 InputFixed);
16088 IncomingInputs[0] = InputFixed;
16089 }
16090 } else if (IncomingInputs.size() == 2) {
16091 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16092 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16093 // We have two non-adjacent or clobbered inputs we need to extract from
16094 // the source half. To do this, we need to map them into some adjacent
16095 // dword slot in the source mask.
16096 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16097 IncomingInputs[1] - SourceOffset};
16098
16099 // If there is a free slot in the source half mask adjacent to one of
16100 // the inputs, place the other input in it. We use (Index XOR 1) to
16101 // compute an adjacent index.
16102 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16103 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16104 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16105 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16106 InputsFixed[1] = InputsFixed[0] ^ 1;
16107 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16108 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16109 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16110 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16111 InputsFixed[0] = InputsFixed[1] ^ 1;
16112 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16113 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16114 // The two inputs are in the same DWord but it is clobbered and the
16115 // adjacent DWord isn't used at all. Move both inputs to the free
16116 // slot.
16117 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16118 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16119 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16120 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16121 } else {
16122 // The only way we hit this point is if there is no clobbering
16123 // (because there are no off-half inputs to this half) and there is no
16124 // free slot adjacent to one of the inputs. In this case, we have to
16125 // swap an input with a non-input.
16126 for (int i = 0; i < 4; ++i)
16127             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16128                    "We can't handle any clobbers here!");
16129           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16130                  "Cannot have adjacent inputs here!");
16131
16132 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16133 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16134
16135 // We also have to update the final source mask in this case because
16136 // it may need to undo the above swap.
16137 for (int &M : FinalSourceHalfMask)
16138 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16139 M = InputsFixed[1] + SourceOffset;
16140 else if (M == InputsFixed[1] + SourceOffset)
16141 M = (InputsFixed[0] ^ 1) + SourceOffset;
16142
16143 InputsFixed[1] = InputsFixed[0] ^ 1;
16144 }
16145
16146 // Point everything at the fixed inputs.
16147 for (int &M : HalfMask)
16148 if (M == IncomingInputs[0])
16149 M = InputsFixed[0] + SourceOffset;
16150 else if (M == IncomingInputs[1])
16151 M = InputsFixed[1] + SourceOffset;
16152
16153 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16154 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16155 }
16156 } else {
16157       llvm_unreachable("Unhandled input size!");
16158 }
16159
16160 // Now hoist the DWord down to the right half.
16161 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16162     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16163 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16164 for (int &M : HalfMask)
16165 for (int Input : IncomingInputs)
16166 if (M == Input)
16167 M = FreeDWord * 2 + Input % 2;
16168 };
16169 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16170 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16171 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16172 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16173
16174 // Now enact all the shuffles we've computed to move the inputs into their
16175 // target half.
16176 if (!isNoopShuffleMask(PSHUFLMask))
16177 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16178 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16179 if (!isNoopShuffleMask(PSHUFHMask))
16180 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16181 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16182 if (!isNoopShuffleMask(PSHUFDMask))
16183 V = DAG.getBitcast(
16184 VT,
16185 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16186 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16187
16188 // At this point, each half should contain all its inputs, and we can then
16189 // just shuffle them into their final position.
16190   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16191          "Failed to lift all the high half inputs to the low mask!");
16192   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16193          "Failed to lift all the low half inputs to the high mask!");
16194
16195 // Do a half shuffle for the low mask.
16196 if (!isNoopShuffleMask(LoMask))
16197 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16198 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16199
16200 // Do a half shuffle with the high mask after shifting its values down.
16201 for (int &M : HiMask)
16202 if (M >= 0)
16203 M -= 4;
16204 if (!isNoopShuffleMask(HiMask))
16205 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16206 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16207
16208 return V;
16209}
16210
16211/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16212/// blend if only one input is used.
16213static SDValue lowerShuffleAsBlendOfPSHUFBs(
16214 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16215 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16216   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16217          "Lane crossing shuffle masks not supported");
16218
16219 int NumBytes = VT.getSizeInBits() / 8;
16220 int Size = Mask.size();
16221 int Scale = NumBytes / Size;
16222
16223 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16224 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16225 V1InUse = false;
16226 V2InUse = false;
16227
16228 for (int i = 0; i < NumBytes; ++i) {
16229 int M = Mask[i / Scale];
16230 if (M < 0)
16231 continue;
16232
16233 const int ZeroMask = 0x80;
16234 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16235 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16236 if (Zeroable[i / Scale])
16237 V1Idx = V2Idx = ZeroMask;
16238
16239 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16240 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16241 V1InUse |= (ZeroMask != V1Idx);
16242 V2InUse |= (ZeroMask != V2Idx);
16243 }
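 // A sketch of the masks this loop builds, assuming VT == MVT::v8i16,
 // Mask == {0, 9, 2, 11, 4, 13, 6, 15} and no zeroable elements: Scale == 2, so
 //   V1Mask = {0,1, 0x80,0x80, 4,5, 0x80,0x80, 8,9, 0x80,0x80, 12,13, 0x80,0x80}
 //   V2Mask = {0x80,0x80, 2,3, 0x80,0x80, 6,7, 0x80,0x80, 10,11, 0x80,0x80, 14,15}
 // Both inputs end up in use, so the two PSHUFBs below are ORed together.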
16244
16245 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16246 if (V1InUse)
16247 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16248 DAG.getBuildVector(ShufVT, DL, V1Mask));
16249 if (V2InUse)
16250 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16251 DAG.getBuildVector(ShufVT, DL, V2Mask));
16252
16253 // If we need shuffled inputs from both, blend the two.
16254 SDValue V;
16255 if (V1InUse && V2InUse)
16256 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16257 else
16258 V = V1InUse ? V1 : V2;
16259
16260 // Cast the result back to the correct type.
16261 return DAG.getBitcast(VT, V);
16262}
16263
16264/// Generic lowering of 8-lane i16 shuffles.
16265///
16266/// This handles both single-input shuffles and combined shuffle/blends with
16267/// two inputs. The single input shuffles are immediately delegated to
16268/// a dedicated lowering routine.
16269///
16270/// The blends are lowered in one of three fundamental ways. If there are few
16271/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16272/// of the input is significantly cheaper when lowered as an interleaving of
16273/// the two inputs, try to interleave them. Otherwise, blend the low and high
16274/// halves of the inputs separately (making them have relatively few inputs)
16275/// and then concatenate them.
16276static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16277 const APInt &Zeroable, SDValue V1, SDValue V2,
16278 const X86Subtarget &Subtarget,
16279 SelectionDAG &DAG) {
16280   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16281   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16282   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16283
16284 // Whenever we can lower this as a zext, that instruction is strictly faster
16285 // than any alternative.
16286 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16287 Zeroable, Subtarget, DAG))
16288 return ZExt;
16289
16290 // Try to lower using a truncation.
16291 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16292 Subtarget, DAG))
16293 return V;
16294
16295 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16296
16297 if (NumV2Inputs == 0) {
16298 // Try to use shift instructions.
16299 if (SDValue Shift =
16300 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16301 Subtarget, DAG, /*BitwiseOnly*/ false))
16302 return Shift;
16303
16304 // Check for being able to broadcast a single element.
16305 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16306 Mask, Subtarget, DAG))
16307 return Broadcast;
16308
16309 // Try to use bit rotation instructions.
16310 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16311 Subtarget, DAG))
16312 return Rotate;
16313
16314 // Use dedicated unpack instructions for masks that match their pattern.
16315 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16316 return V;
16317
16318 // Use dedicated pack instructions for masks that match their pattern.
16319 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16320 Subtarget))
16321 return V;
16322
16323 // Try to use byte rotation instructions.
16324 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16325 Subtarget, DAG))
16326 return Rotate;
16327
16328 // Make a copy of the mask so it can be modified.
16329 SmallVector<int, 8> MutableMask(Mask);
16330 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16331 Subtarget, DAG);
16332 }
16333
16334   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16335          "All single-input shuffles should be canonicalized to be V1-input "
16336          "shuffles.");
16337
16338 // Try to use shift instructions.
16339 if (SDValue Shift =
16340 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16341 DAG, /*BitwiseOnly*/ false))
16342 return Shift;
16343
16344 // See if we can use SSE4A Extraction / Insertion.
16345 if (Subtarget.hasSSE4A())
16346 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16347 Zeroable, DAG))
16348 return V;
16349
16350 // There are special ways we can lower some single-element blends.
16351 if (NumV2Inputs == 1)
16352 if (SDValue V = lowerShuffleAsElementInsertion(
16353 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16354 return V;
16355
16356 // We have different paths for blend lowering, but they all must use the
16357 // *exact* same predicate.
16358 bool IsBlendSupported = Subtarget.hasSSE41();
16359 if (IsBlendSupported)
16360 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16361 Zeroable, Subtarget, DAG))
16362 return Blend;
16363
16364 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16365 Zeroable, Subtarget, DAG))
16366 return Masked;
16367
16368 // Use dedicated unpack instructions for masks that match their pattern.
16369 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16370 return V;
16371
16372 // Use dedicated pack instructions for masks that match their pattern.
16373 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16374 Subtarget))
16375 return V;
16376
16377 // Try to lower using a truncation.
16378 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16379 Subtarget, DAG))
16380 return V;
16381
16382 // Try to use byte rotation instructions.
16383 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16384 Subtarget, DAG))
16385 return Rotate;
16386
16387 if (SDValue BitBlend =
16388 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16389 return BitBlend;
16390
16391 // Try to use byte shift instructions to mask.
16392 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16393 Zeroable, Subtarget, DAG))
16394 return V;
16395
16396 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
16397 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16398 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16399 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16400 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16401 !Subtarget.hasVLX()) {
16402 // Check if this is part of a 256-bit vector truncation.
16403 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16404 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16405 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16406 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16407 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16408 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16409 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16410 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16411 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16412 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16413 } else {
16414 SmallVector<SDValue, 4> DWordClearOps(4,
16415 DAG.getConstant(0, DL, MVT::i32));
16416 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16417 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16418 SDValue DWordClearMask =
16419 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16420 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16421 DWordClearMask);
16422 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16423 DWordClearMask);
16424 }
16425 // Now pack things back together.
16426 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16427 if (NumEvenDrops == 2) {
16428 Result = DAG.getBitcast(MVT::v4i32, Result);
16429 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16430 }
16431 return Result;
16432 }
16433
16434 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16435 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16436 if (NumOddDrops == 1) {
16437 bool HasSSE41 = Subtarget.hasSSE41();
16438 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16439 DAG.getBitcast(MVT::v4i32, V1),
16440 DAG.getTargetConstant(16, DL, MVT::i8));
16441 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16442 DAG.getBitcast(MVT::v4i32, V2),
16443 DAG.getTargetConstant(16, DL, MVT::i8));
16444 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16445 MVT::v8i16, V1, V2);
16446 }
16447
16448 // Try to lower by permuting the inputs into an unpack instruction.
16449 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16450 Mask, Subtarget, DAG))
16451 return Unpack;
16452
16453 // If we can't directly blend but can use PSHUFB, that will be better as it
16454 // can both shuffle and set up the inefficient blend.
16455 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16456 bool V1InUse, V2InUse;
16457 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16458 Zeroable, DAG, V1InUse, V2InUse);
16459 }
16460
16461 // We can always bit-blend if we have to, so the fallback strategy is to
16462 // decompose into single-input permutes and blends/unpacks.
16463 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16464 Mask, Subtarget, DAG);
16465}
16466
16467/// Lower 8-lane 16-bit floating point shuffles.
16468static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16469 const APInt &Zeroable, SDValue V1, SDValue V2,
16470 const X86Subtarget &Subtarget,
16471 SelectionDAG &DAG) {
16472   assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16473   assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16474   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16475 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16476
16477 if (Subtarget.hasFP16()) {
16478 if (NumV2Elements == 0) {
16479 // Check for being able to broadcast a single element.
16480 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16481 Mask, Subtarget, DAG))
16482 return Broadcast;
16483 }
16484 if (NumV2Elements == 1 && Mask[0] >= 8)
16485 if (SDValue V = lowerShuffleAsElementInsertion(
16486 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16487 return V;
16488 }
16489
16490 V1 = DAG.getBitcast(MVT::v8i16, V1);
16491 V2 = DAG.getBitcast(MVT::v8i16, V2);
16492 return DAG.getBitcast(MVT::v8f16,
16493 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16494}
16495
16496 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16497// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
16498// the active subvector is extracted.
16499static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16500 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16501 const X86Subtarget &Subtarget,
16502 SelectionDAG &DAG) {
16503 MVT MaskVT = VT.changeTypeToInteger();
16504 SDValue MaskNode;
16505 MVT ShuffleVT = VT;
16506 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16507 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16508 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16509 ShuffleVT = V1.getSimpleValueType();
16510
16511 // Adjust mask to correct indices for the second input.
16512 int NumElts = VT.getVectorNumElements();
16513 unsigned Scale = 512 / VT.getSizeInBits();
16514 SmallVector<int, 32> AdjustedMask(Mask);
16515 for (int &M : AdjustedMask)
16516 if (NumElts <= M)
16517 M += (Scale - 1) * NumElts;
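 // For instance, a v16i8 shuffle on a non-VLX target is widened to v64i8
 // here: Scale == 512 / 128 == 4, so a mask entry of 16 (the first element
 // of V2) becomes 16 + 3 * 16 == 64, which is where V2's first element lives
 // in the widened VPERMV3 operand numbering.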
16518 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16519 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16520 } else {
16521 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16522 }
16523
16524 SDValue Result;
16525 if (V2.isUndef())
16526 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16527 else
16528 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16529
16530 if (VT != ShuffleVT)
16531 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16532
16533 return Result;
16534}
16535
16536/// Generic lowering of v16i8 shuffles.
16537///
16538/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16539/// detect any complexity reducing interleaving. If that doesn't help, it uses
16540/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16541/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16542/// back together.
16543static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16544 const APInt &Zeroable, SDValue V1, SDValue V2,
16545 const X86Subtarget &Subtarget,
16546 SelectionDAG &DAG) {
16547   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16548   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16549   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16550
16551 // Try to use shift instructions.
16552 if (SDValue Shift =
16553 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16554 DAG, /*BitwiseOnly*/ false))
16555 return Shift;
16556
16557 // Try to use byte rotation instructions.
16558 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16559 Subtarget, DAG))
16560 return Rotate;
16561
16562 // Use dedicated pack instructions for masks that match their pattern.
16563 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16564 Subtarget))
16565 return V;
16566
16567 // Try to use a zext lowering.
16568 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16569 Zeroable, Subtarget, DAG))
16570 return ZExt;
16571
16572 // Try to lower using a truncation.
16573 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16574 Subtarget, DAG))
16575 return V;
16576
16577 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16578 Subtarget, DAG))
16579 return V;
16580
16581 // See if we can use SSE4A Extraction / Insertion.
16582 if (Subtarget.hasSSE4A())
16583 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16584 Zeroable, DAG))
16585 return V;
16586
16587 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16588
16589 // For single-input shuffles, there are some nicer lowering tricks we can use.
16590 if (NumV2Elements == 0) {
16591 // Check for being able to broadcast a single element.
16592 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16593 Mask, Subtarget, DAG))
16594 return Broadcast;
16595
16596 // Try to use bit rotation instructions.
16597 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16598 Subtarget, DAG))
16599 return Rotate;
16600
16601 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16602 return V;
16603
16604 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16605 // Notably, this handles splat and partial-splat shuffles more efficiently.
16606 // However, it only makes sense if the pre-duplication shuffle simplifies
16607 // things significantly. Currently, this means we need to be able to
16608 // express the pre-duplication shuffle as an i16 shuffle.
16609 //
16610 // FIXME: We should check for other patterns which can be widened into an
16611 // i16 shuffle as well.
16612 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16613 for (int i = 0; i < 16; i += 2)
16614 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16615 return false;
16616
16617 return true;
16618 };
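 // In other words, every even/odd byte pair must agree on its source byte
 // (or be undef): a mask such as {3,3, 7,7, 1,1, 5,5, ...} can be widened,
 // while {0,1, ...} is rejected at the first pair because Mask[0] != Mask[1].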
16619 auto tryToWidenViaDuplication = [&]() -> SDValue {
16620 if (!canWidenViaDuplication(Mask))
16621 return SDValue();
16622 SmallVector<int, 4> LoInputs;
16623 copy_if(Mask, std::back_inserter(LoInputs),
16624 [](int M) { return M >= 0 && M < 8; });
16625 array_pod_sort(LoInputs.begin(), LoInputs.end());
16626 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16627 LoInputs.end());
16628 SmallVector<int, 4> HiInputs;
16629 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16630 array_pod_sort(HiInputs.begin(), HiInputs.end());
16631 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16632 HiInputs.end());
16633
16634 bool TargetLo = LoInputs.size() >= HiInputs.size();
16635 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16636 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16637
16638 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16639 SmallDenseMap<int, int, 8> LaneMap;
16640 for (int I : InPlaceInputs) {
16641 PreDupI16Shuffle[I/2] = I/2;
16642 LaneMap[I] = I;
16643 }
16644 int j = TargetLo ? 0 : 4, je = j + 4;
16645 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16646 // Check if j is already a shuffle of this input. This happens when
16647 // there are two adjacent bytes after we move the low one.
16648 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16649 // If we haven't yet mapped the input, search for a slot into which
16650 // we can map it.
16651 while (j < je && PreDupI16Shuffle[j] >= 0)
16652 ++j;
16653
16654 if (j == je)
16655 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16656 return SDValue();
16657
16658 // Map this input with the i16 shuffle.
16659 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16660 }
16661
16662 // Update the lane map based on the mapping we ended up with.
16663 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16664 }
16665 V1 = DAG.getBitcast(
16666 MVT::v16i8,
16667 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16668 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16669
16670 // Unpack the bytes to form the i16s that will be shuffled into place.
16671 bool EvenInUse = false, OddInUse = false;
16672 for (int i = 0; i < 16; i += 2) {
16673 EvenInUse |= (Mask[i + 0] >= 0);
16674 OddInUse |= (Mask[i + 1] >= 0);
16675 if (EvenInUse && OddInUse)
16676 break;
16677 }
16678 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16679 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16680 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16681
16682 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16683 for (int i = 0; i < 16; ++i)
16684 if (Mask[i] >= 0) {
16685 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16686         assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16687 if (PostDupI16Shuffle[i / 2] < 0)
16688 PostDupI16Shuffle[i / 2] = MappedMask;
16689 else
16690           assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16691                  "Conflicting entries in the original shuffle!");
16692 }
16693 return DAG.getBitcast(
16694 MVT::v16i8,
16695 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16696 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16697 };
16698 if (SDValue V = tryToWidenViaDuplication())
16699 return V;
16700 }
16701
16702 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16703 Zeroable, Subtarget, DAG))
16704 return Masked;
16705
16706 // Use dedicated unpack instructions for masks that match their pattern.
16707 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16708 return V;
16709
16710 // Try to use byte shift instructions to mask.
16711 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16712 Zeroable, Subtarget, DAG))
16713 return V;
16714
16715 // Check for compaction patterns.
16716 bool IsSingleInput = V2.isUndef();
16717 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16718
16719 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16720 // with PSHUFB. It is important to do this before we attempt to generate any
16721 // blends but after all of the single-input lowerings. If the single input
16722 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16723 // want to preserve that and we can DAG combine any longer sequences into
16724 // a PSHUFB in the end. But once we start blending from multiple inputs,
16725 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16726 // and there are *very* few patterns that would actually be faster than the
16727 // PSHUFB approach because of its ability to zero lanes.
16728 //
16729 // If the mask is a binary compaction, we can more efficiently perform this
16730 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16731 //
16732 // FIXME: The only exceptions to the above are blends which are exact
16733 // interleavings with direct instructions supporting them. We currently don't
16734 // handle those well here.
16735 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16736 bool V1InUse = false;
16737 bool V2InUse = false;
16738
16739 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16740 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16741
16742 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16743 // do so. This avoids using them to handle blends-with-zero which is
16744 // important as a single pshufb is significantly faster for that.
16745 if (V1InUse && V2InUse) {
16746 if (Subtarget.hasSSE41())
16747 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16748 Zeroable, Subtarget, DAG))
16749 return Blend;
16750
16751 // We can use an unpack to do the blending rather than an or in some
16752 // cases. Even though the or may be (very minorly) more efficient, we
16753 // prefer this lowering because there are common cases where part of
16754 // the complexity of the shuffles goes away when we do the final blend as
16755 // an unpack.
16756 // FIXME: It might be worth trying to detect if the unpack-feeding
16757 // shuffles will both be pshufb, in which case we shouldn't bother with
16758 // this.
16759 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16760 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16761 return Unpack;
16762
16763 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16764 if (Subtarget.hasVBMI())
16765 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16766 DAG);
16767
16768 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16769 if (Subtarget.hasXOP()) {
16770 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16771 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16772 }
16773
16774 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16775 // PALIGNR will be cheaper than the second PSHUFB+OR.
16776 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16777 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16778 return V;
16779 }
16780
16781 return PSHUFB;
16782 }
16783
16784 // There are special ways we can lower some single-element blends.
16785 if (NumV2Elements == 1)
16786 if (SDValue V = lowerShuffleAsElementInsertion(
16787 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16788 return V;
16789
16790 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16791 return Blend;
16792
16793 // Check whether a compaction lowering can be done. This handles shuffles
16794 // which take every Nth element for some even N. See the helper function for
16795 // details.
16796 //
16797 // We special case these as they can be particularly efficiently handled with
16798 // the PACKUSWB instruction on x86 and they show up in common patterns of
16799 // rearranging bytes to truncate wide elements.
16800 if (NumEvenDrops) {
16801 // NumEvenDrops is the power of two stride of the elements. Another way of
16802 // thinking about it is that we need to drop the even elements this many
16803 // times to get the original input.
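 // For example, the truncating mask {0, 2, 4, ..., 30} gives NumEvenDrops ==
 // 1: every 16-bit word below is ANDed with 0x00FF to clear the dropped high
 // bytes, and a single PACKUS then emits the surviving low bytes of V1
 // followed by those of V2.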
16804
16805 // First we need to zero all the dropped bytes.
16806     assert(NumEvenDrops <= 3 &&
16807            "No support for dropping even elements more than 3 times.");
16808 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16809 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16810 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16811 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16812 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16813 WordClearMask);
16814 if (!IsSingleInput)
16815 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16816 WordClearMask);
16817
16818 // Now pack things back together.
16819 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16820 IsSingleInput ? V1 : V2);
16821 for (int i = 1; i < NumEvenDrops; ++i) {
16822 Result = DAG.getBitcast(MVT::v8i16, Result);
16823 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16824 }
16825 return Result;
16826 }
16827
16828 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16829 if (NumOddDrops == 1) {
16830 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16831 DAG.getBitcast(MVT::v8i16, V1),
16832 DAG.getTargetConstant(8, DL, MVT::i8));
16833 if (!IsSingleInput)
16834 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16835 DAG.getBitcast(MVT::v8i16, V2),
16836 DAG.getTargetConstant(8, DL, MVT::i8));
16837 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16838 IsSingleInput ? V1 : V2);
16839 }
16840
16841 // Handle multi-input cases by blending/unpacking single-input shuffles.
16842 if (NumV2Elements > 0)
16843 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16844 Subtarget, DAG);
16845
16846 // The fallback path for single-input shuffles widens this into two v8i16
16847 // vectors with unpacks, shuffles those, and then pulls them back together
16848 // with a pack.
16849 SDValue V = V1;
16850
16851 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16852 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16853 for (int i = 0; i < 16; ++i)
16854 if (Mask[i] >= 0)
16855 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16856
16857 SDValue VLoHalf, VHiHalf;
16858 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16859 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16860 // i16s.
16861 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16862 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16863 // Use a mask to drop the high bytes.
16864 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16865 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16866 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16867
16868 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16869 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16870
16871 // Squash the masks to point directly into VLoHalf.
16872 for (int &M : LoBlendMask)
16873 if (M >= 0)
16874 M /= 2;
16875 for (int &M : HiBlendMask)
16876 if (M >= 0)
16877 M /= 2;
16878 } else {
16879 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16880 // VHiHalf so that we can blend them as i16s.
16881 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16882
16883 VLoHalf = DAG.getBitcast(
16884 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16885 VHiHalf = DAG.getBitcast(
16886 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16887 }
16888
16889 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16890 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16891
16892 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16893}
16894
16895/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16896///
16897/// This routine breaks down the specific type of 128-bit shuffle and
16898/// dispatches to the lowering routines accordingly.
16899static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16900 MVT VT, SDValue V1, SDValue V2,
16901 const APInt &Zeroable,
16902 const X86Subtarget &Subtarget,
16903 SelectionDAG &DAG) {
16904 switch (VT.SimpleTy) {
16905 case MVT::v2i64:
16906 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16907 case MVT::v2f64:
16908 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16909 case MVT::v4i32:
16910 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16911 case MVT::v4f32:
16912 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16913 case MVT::v8i16:
16914 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16915 case MVT::v8f16:
16916 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16917 case MVT::v16i8:
16918 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16919
16920 default:
16921 llvm_unreachable("Unimplemented!");
16922 }
16923}
16924
16925/// Generic routine to split vector shuffle into half-sized shuffles.
16926///
16927/// This routine just extracts two subvectors, shuffles them independently, and
16928/// then concatenates them back together. This should work effectively with all
16929/// AVX vector shuffle types.
16930static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16931 SDValue V2, ArrayRef<int> Mask,
16932 SelectionDAG &DAG, bool SimpleOnly) {
16933 assert(VT.getSizeInBits() >= 256 &&
16934 "Only for 256-bit or wider vector shuffles!");
16935 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16936 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16937
16938 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16939 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16940
16941 int NumElements = VT.getVectorNumElements();
16942 int SplitNumElements = NumElements / 2;
16943 MVT ScalarVT = VT.getVectorElementType();
16944 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16945
16946 // Use splitVector/extractSubVector so that split build-vectors just build two
16947 // narrower build vectors. This helps shuffling with splats and zeros.
16948 auto SplitVector = [&](SDValue V) {
16949 SDValue LoV, HiV;
16950 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16951 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16952 DAG.getBitcast(SplitVT, HiV));
16953 };
16954
16955 SDValue LoV1, HiV1, LoV2, HiV2;
16956 std::tie(LoV1, HiV1) = SplitVector(V1);
16957 std::tie(LoV2, HiV2) = SplitVector(V2);
16958
16959 // Now create two 4-way blends of these half-width vectors.
16960 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
16961 bool &UseHiV1, bool &UseLoV2,
16962 bool &UseHiV2) {
16963 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
16964 for (int i = 0; i < SplitNumElements; ++i) {
16965 int M = HalfMask[i];
16966 if (M >= NumElements) {
16967 if (M >= NumElements + SplitNumElements)
16968 UseHiV2 = true;
16969 else
16970 UseLoV2 = true;
16971 } else if (M >= 0) {
16972 if (M >= SplitNumElements)
16973 UseHiV1 = true;
16974 else
16975 UseLoV1 = true;
16976 }
16977 }
16978 };
16979
16980 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
16981 if (!SimpleOnly)
16982 return true;
16983
16984 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
16985 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
16986
16987 return !(UseHiV1 || UseHiV2);
16988 };
16989
16990 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16991 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16992 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16993 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16994 for (int i = 0; i < SplitNumElements; ++i) {
16995 int M = HalfMask[i];
16996 if (M >= NumElements) {
16997 V2BlendMask[i] = M - NumElements;
16998 BlendMask[i] = SplitNumElements + i;
16999 } else if (M >= 0) {
17000 V1BlendMask[i] = M;
17001 BlendMask[i] = i;
17002 }
17003 }
17004
17005 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17006 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17007
17008 // Because the lowering happens after all combining takes place, we need to
17009 // manually combine these blend masks as much as possible so that we create
17010 // a minimal number of high-level vector shuffle nodes.
17011 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
17012
17013 // First try just blending the halves of V1 or V2.
17014 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
17015 return DAG.getUNDEF(SplitVT);
17016 if (!UseLoV2 && !UseHiV2)
17017 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17018 if (!UseLoV1 && !UseHiV1)
17019 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17020
17021 SDValue V1Blend, V2Blend;
17022 if (UseLoV1 && UseHiV1) {
17023 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17024 } else {
17025 // We only use half of V1 so map the usage down into the final blend mask.
17026 V1Blend = UseLoV1 ? LoV1 : HiV1;
17027 for (int i = 0; i < SplitNumElements; ++i)
17028 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
17029 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
17030 }
17031 if (UseLoV2 && UseHiV2) {
17032 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17033 } else {
17034 // We only use half of V2 so map the usage down into the final blend mask.
17035 V2Blend = UseLoV2 ? LoV2 : HiV2;
17036 for (int i = 0; i < SplitNumElements; ++i)
17037 if (BlendMask[i] >= SplitNumElements)
17038 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17039 }
17040 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17041 };
17042
17043 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17044 return SDValue();
17045
17046 SDValue Lo = HalfBlend(LoMask);
17047 SDValue Hi = HalfBlend(HiMask);
17048 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17049}
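// Simplified standalone sketch of the HalfBlend mask bookkeeping above; all names
// and the sample mask here are illustrative assumptions, not LLVM APIs. Each
// half-mask entry is classified into a per-operand shuffle mask plus a final
// blend mask, exactly as the lambda does for each split half.
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElements = 8;        // full shuffle width (e.g. v8i32)
  constexpr int SplitNumElements = 4;   // half width after splitting the vector
  std::array<int, SplitNumElements> HalfMask = {1, 9, 6, 14}; // sample lo-half mask
  std::array<int, SplitNumElements> V1Blend, V2Blend, Blend;
  V1Blend.fill(-1); V2Blend.fill(-1); Blend.fill(-1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {             // element sourced from V2
      V2Blend[i] = M - NumElements;
      Blend[i] = SplitNumElements + i;
    } else if (M >= 0) {                // element sourced from V1
      V1Blend[i] = M;
      Blend[i] = i;
    }                                   // M < 0 stays undef (-1) everywhere
  }
  for (int i = 0; i < SplitNumElements; ++i)
    std::printf("i=%d V1Blend=%d V2Blend=%d Blend=%d\n", i, V1Blend[i],
                V2Blend[i], Blend[i]);
  return 0;
}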
17050
17051/// Either split a vector in halves or decompose the shuffles and the
17052/// blend/unpack.
17053///
17054/// This is provided as a good fallback for many lowerings of non-single-input
17055/// shuffles with more than one 128-bit lane. In those cases, we want to select
17056/// between splitting the shuffle into 128-bit components and stitching those
17057/// back together vs. extracting the single-input shuffles and blending those
17058/// results.
17059static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17060 SDValue V2, ArrayRef<int> Mask,
17061 const X86Subtarget &Subtarget,
17062 SelectionDAG &DAG) {
17063 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
17064 "shuffles as it could then recurse on itself.");
17065 int Size = Mask.size();
17066
17067 // If this can be modeled as a broadcast of two elements followed by a blend,
17068 // prefer that lowering. This is especially important because broadcasts can
17069 // often fold with memory operands.
17070 auto DoBothBroadcast = [&] {
17071 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17072 for (int M : Mask)
17073 if (M >= Size) {
17074 if (V2BroadcastIdx < 0)
17075 V2BroadcastIdx = M - Size;
17076 else if (M - Size != V2BroadcastIdx)
17077 return false;
17078 } else if (M >= 0) {
17079 if (V1BroadcastIdx < 0)
17080 V1BroadcastIdx = M;
17081 else if (M != V1BroadcastIdx)
17082 return false;
17083 }
17084 return true;
17085 };
17086 if (DoBothBroadcast())
17087 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17088 DAG);
17089
17090 // If the inputs all stem from a single 128-bit lane of each input, then we
17091 // split them rather than blending because the split will decompose to
17092 // unusually few instructions.
17093 int LaneCount = VT.getSizeInBits() / 128;
17094 int LaneSize = Size / LaneCount;
17095 SmallBitVector LaneInputs[2];
17096 LaneInputs[0].resize(LaneCount, false);
17097 LaneInputs[1].resize(LaneCount, false);
17098 for (int i = 0; i < Size; ++i)
17099 if (Mask[i] >= 0)
17100 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17101 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17102 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17103 /*SimpleOnly*/ false);
17104
17105 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17106 // requires that the decomposed single-input shuffles don't end up here.
17107 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17108 DAG);
17109}
17110
17111// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17112// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17113static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17114 SDValue V1, SDValue V2,
17115 ArrayRef<int> Mask,
17116 SelectionDAG &DAG) {
17117 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17118
17119 int LHSMask[4] = {-1, -1, -1, -1};
17120 int RHSMask[4] = {-1, -1, -1, -1};
17121 unsigned SHUFPMask = 0;
17122
17123 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17124 // perform the shuffle once the lanes have been shuffled in place.
17125 for (int i = 0; i != 4; ++i) {
17126 int M = Mask[i];
17127 if (M < 0)
17128 continue;
17129 int LaneBase = i & ~1;
17130 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17131 LaneMask[LaneBase + (M & 1)] = M;
17132 SHUFPMask |= (M & 1) << i;
17133 }
17134
17135 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17136 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17137 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17138 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17139}
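// Worked example (hypothetical mask): for Mask = <1, 6, 3, 4> the loop above
// produces LHSMask = <-1, 1, -1, 3>, RHSMask = <6, -1, 4, -1> and
// SHUFPMask = 0b0101. The two lane shuffles place each source element into the
// correct 128-bit lane, and SHUFPD's immediate bit i then picks the low or high
// element of that lane, giving the result <1, 6, 3, 4> as required.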
17140
17141/// Lower a vector shuffle crossing multiple 128-bit lanes as
17142/// a lane permutation followed by a per-lane permutation.
17143///
17144/// This is mainly for cases where we can have non-repeating permutes
17145/// in each lane.
17146///
17147/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17148/// we should investigate merging them.
17149static SDValue lowerShuffleAsLanePermuteAndPermute(
17150 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17151 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17152 int NumElts = VT.getVectorNumElements();
17153 int NumLanes = VT.getSizeInBits() / 128;
17154 int NumEltsPerLane = NumElts / NumLanes;
17155 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17156
17157 /// Attempts to find a sublane permute with the given size
17158 /// that gets all elements into their target lanes.
17159 ///
17160 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
17161 /// If unsuccessful, returns false and may overwrite InLaneMask.
17162 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17163 int NumSublanesPerLane = NumSublanes / NumLanes;
17164 int NumEltsPerSublane = NumElts / NumSublanes;
17165
17166 SmallVector<int, 16> CrossLaneMask;
17167 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17168 // CrossLaneMask but one entry == one sublane.
17169 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17170
17171 for (int i = 0; i != NumElts; ++i) {
17172 int M = Mask[i];
17173 if (M < 0)
17174 continue;
17175
17176 int SrcSublane = M / NumEltsPerSublane;
17177 int DstLane = i / NumEltsPerLane;
17178
17179 // We only need to get the elements into the right lane, not sublane.
17180 // So search all sublanes that make up the destination lane.
17181 bool Found = false;
17182 int DstSubStart = DstLane * NumSublanesPerLane;
17183 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17184 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17185 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17186 continue;
17187
17188 Found = true;
17189 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17190 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17191 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17192 break;
17193 }
17194 if (!Found)
17195 return SDValue();
17196 }
17197
17198 // Fill CrossLaneMask using CrossLaneMaskLarge.
17199 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17200
17201 if (!CanUseSublanes) {
17202 // If we're only shuffling a single lowest lane and the rest are identity
17203 // then don't bother.
17204 // TODO - isShuffleMaskInputInPlace could be extended to something like
17205 // this.
17206 int NumIdentityLanes = 0;
17207 bool OnlyShuffleLowestLane = true;
17208 for (int i = 0; i != NumLanes; ++i) {
17209 int LaneOffset = i * NumEltsPerLane;
17210 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17211 i * NumEltsPerLane))
17212 NumIdentityLanes++;
17213 else if (CrossLaneMask[LaneOffset] != 0)
17214 OnlyShuffleLowestLane = false;
17215 }
17216 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17217 return SDValue();
17218 }
17219
17220 // Avoid returning the same shuffle operation. For example,
17221 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17222 // undef:v16i16
17223 if (CrossLaneMask == Mask || InLaneMask == Mask)
17224 return SDValue();
17225
17226 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17227 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17228 InLaneMask);
17229 };
17230
17231 // First attempt a solution with full lanes.
17232 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17233 return V;
17234
17235 // The rest of the solutions use sublanes.
17236 if (!CanUseSublanes)
17237 return SDValue();
17238
17239 // Then attempt a solution with 64-bit sublanes (vpermq).
17240 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17241 return V;
17242
17243 // If that doesn't work and we have fast variable cross-lane shuffle,
17244 // attempt 32-bit sublanes (vpermd).
17245 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17246 return SDValue();
17247
17248 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17249}
17250
17251/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
17252static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17253 SmallVector<int> &InLaneMask) {
17254 int Size = Mask.size();
17255 InLaneMask.assign(Mask.begin(), Mask.end());
17256 for (int i = 0; i < Size; ++i) {
17257 int &M = InLaneMask[i];
17258 if (M < 0)
17259 continue;
17260 if (((M % Size) / LaneSize) != (i / LaneSize))
17261 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17262 }
17263}
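// Worked example (hypothetical mask): with Size = 8, LaneSize = 4 and
// Mask = <6,7,0,1,2,3,4,5>, elements whose source lane differs from their
// destination lane are rewritten as (M % LaneSize) + destination-lane base + Size,
// giving InLaneMask = <10,11,0,1,14,15,4,5>. The +Size offset marks entries that
// must later be taken from a lane-flipped copy of the input (see the caller below,
// which shuffles V1 against Flipped with this mask).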
17264
17265/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17266/// source with a lane permutation.
17267///
17268/// This lowering strategy results in four instructions in the worst case for a
17269/// single-input cross lane shuffle which is lower than any other fully general
17270/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17271/// shuffle pattern should be handled prior to trying this lowering.
17272static SDValue lowerShuffleAsLanePermuteAndShuffle(
17273 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17274 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17275 // FIXME: This should probably be generalized for 512-bit vectors as well.
17276 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17277 int Size = Mask.size();
17278 int LaneSize = Size / 2;
17279
17280 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17281 // Only do this if the elements aren't all from the lower lane,
17282 // otherwise we're (probably) better off doing a split.
17283 if (VT == MVT::v4f64 &&
17284 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17285 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17286
17287 // If there are only inputs from one 128-bit lane, splitting will in fact be
17288 // less expensive. The flags track whether the given lane contains an element
17289 // that crosses to another lane.
17290 bool AllLanes;
17291 if (!Subtarget.hasAVX2()) {
17292 bool LaneCrossing[2] = {false, false};
17293 for (int i = 0; i < Size; ++i)
17294 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17295 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17296 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17297 } else {
17298 bool LaneUsed[2] = {false, false};
17299 for (int i = 0; i < Size; ++i)
17300 if (Mask[i] >= 0)
17301 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17302 AllLanes = LaneUsed[0] && LaneUsed[1];
17303 }
17304
17305 // TODO - we could support shuffling V2 in the Flipped input.
17306 assert(V2.isUndef() &&
17307 "This last part of this routine only works on single input shuffles");
17308
17309 SmallVector<int> InLaneMask;
17310 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17311
17312 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17313 "In-lane shuffle mask expected");
17314
17315 // If we're not using both lanes in each lane and the inlane mask is not
17316 // repeating, then we're better off splitting.
17317 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17318 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17319 /*SimpleOnly*/ false);
17320
17321 // Flip the lanes, and shuffle the results which should now be in-lane.
17322 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17323 SDValue Flipped = DAG.getBitcast(PVT, V1);
17324 Flipped =
17325 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17326 Flipped = DAG.getBitcast(VT, Flipped);
17327 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17328}
17329
17330/// Handle lowering 2-lane 128-bit shuffles.
17331static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17332 SDValue V2, ArrayRef<int> Mask,
17333 const APInt &Zeroable,
17334 const X86Subtarget &Subtarget,
17335 SelectionDAG &DAG) {
17336 if (V2.isUndef()) {
17337 // Attempt to match VBROADCAST*128 subvector broadcast load.
17338 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17339 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17340 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17341 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17342 MVT MemVT = VT.getHalfNumVectorElementsVT();
17343 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17344 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17345 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17346 VT, MemVT, Ld, Ofs, DAG))
17347 return BcstLd;
17348 }
17349
17350 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17351 if (Subtarget.hasAVX2())
17352 return SDValue();
17353 }
17354
17355 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17356
17357 SmallVector<int, 4> WidenedMask;
17358 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17359 return SDValue();
17360
17361 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17362 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17363
17364 // Try to use an insert into a zero vector.
17365 if (WidenedMask[0] == 0 && IsHighZero) {
17366 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17367 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17368 DAG.getIntPtrConstant(0, DL));
17369 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17370 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17371 DAG.getIntPtrConstant(0, DL));
17372 }
17373
17374 // TODO: If minimizing size and one of the inputs is a zero vector and
17375 // the zero vector has only one use, we could use a VPERM2X128 to save the
17376 // instruction bytes needed to explicitly generate the zero vector.
17377
17378 // Blends are faster and handle all the non-lane-crossing cases.
17379 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17380 Subtarget, DAG))
17381 return Blend;
17382
17383 // If either input operand is a zero vector, use VPERM2X128 because its mask
17384 // allows us to replace the zero input with an implicit zero.
17385 if (!IsLowZero && !IsHighZero) {
17386 // Check for patterns which can be matched with a single insert of a 128-bit
17387 // subvector.
17388 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17389 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17390
17391 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17392 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17393 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17394 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17395 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17396 OnlyUsesV1 ? V1 : V2,
17397 DAG.getIntPtrConstant(0, DL));
17398 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17399 DAG.getIntPtrConstant(2, DL));
17400 }
17401 }
17402
17403 // Try to use SHUF128 if possible.
17404 if (Subtarget.hasVLX()) {
17405 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17406 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17407 ((WidenedMask[1] % 2) << 1);
17408 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17409 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17410 }
17411 }
17412 }
17413
17414 // Otherwise form a 128-bit permutation. After accounting for undefs,
17415 // convert the 64-bit shuffle mask selection values into 128-bit
17416 // selection bits by dividing the indexes by 2 and shifting into positions
17417 // defined by a vperm2*128 instruction's immediate control byte.
17418
17419 // The immediate permute control byte looks like this:
17420 // [1:0] - select 128 bits from sources for low half of destination
17421 // [2] - ignore
17422 // [3] - zero low half of destination
17423 // [5:4] - select 128 bits from sources for high half of destination
17424 // [6] - ignore
17425 // [7] - zero high half of destination
17426
17427 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17428 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17429
17430 unsigned PermMask = 0;
17431 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17432 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17433
17434 // Check the immediate mask and replace unused sources with undef.
17435 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17436 V1 = DAG.getUNDEF(VT);
17437 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17438 V2 = DAG.getUNDEF(VT);
17439
17440 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17441 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17442}
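// Worked example (hypothetical widened mask): for WidenedMask = <1, 2> with
// neither half zeroable, the code above builds PermMask = (1 << 0) | (2 << 4) =
// 0x21, i.e. the low 128 bits of the result come from V1's high half and the high
// 128 bits from V2's low half. When IsLowZero/IsHighZero is set, bits 3/7
// (0x08/0x80) are used instead so VPERM2X128 materializes the zero half implicitly.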
17443
17444/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17445/// shuffling each lane.
17446///
17447/// This attempts to create a repeated lane shuffle where each lane uses one
17448/// or two of the lanes of the inputs. The lanes of the input vectors are
17449/// shuffled in one or two independent shuffles to get the lanes into the
17450/// position needed by the final shuffle.
17451static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17452 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17453 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17454 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
14. '?' condition is true
17455
17456 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15. Assuming the condition is false
16. Taking false branch
17457 return SDValue();
17458
17459 int NumElts = Mask.size();
17460 int NumLanes = VT.getSizeInBits() / 128;
17461 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17. 'NumLaneElts' initialized here
17462 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17463 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17464
17465 // First pass will try to fill in the RepeatMask from lanes that need two
17466 // sources.
17467 for (int Lane = 0; Lane != NumLanes; ++Lane) {
18. Assuming 'Lane' is not equal to 'NumLanes'
19. Loop condition is true. Entering loop body
24. Assuming 'Lane' is equal to 'NumLanes'
25. Loop condition is false. Execution continues on line 17539
17468 int Srcs[2] = {-1, -1};
17469 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17470 for (int i = 0; i != NumLaneElts; ++i) {
20. Assuming 'i' is equal to 'NumLaneElts'
21. Loop condition is false. Execution continues on line 17492
17471 int M = Mask[(Lane * NumLaneElts) + i];
17472 if (M < 0)
17473 continue;
17474 // Determine which of the possible input lanes (NumLanes from each source)
17475 // this element comes from. Assign that as one of the sources for this
17476 // lane. We can assign up to 2 sources for this lane. If we run out of
17477 // sources we can't do anything.
17478 int LaneSrc = M / NumLaneElts;
17479 int Src;
17480 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17481 Src = 0;
17482 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17483 Src = 1;
17484 else
17485 return SDValue();
17486
17487 Srcs[Src] = LaneSrc;
17488 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17489 }
17490
17491 // If this lane has two sources, see if it fits with the repeat mask so far.
17492 if (Srcs[1] < 0)
22. Taking true branch
17493 continue;
23. Execution continues on line 17467
17494
17495 LaneSrcs[Lane][0] = Srcs[0];
17496 LaneSrcs[Lane][1] = Srcs[1];
17497
17498 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17499 assert(M1.size() == M2.size() && "Unexpected mask size");
17500 for (int i = 0, e = M1.size(); i != e; ++i)
17501 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17502 return false;
17503 return true;
17504 };
17505
17506 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17507 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17508 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17509 int M = Mask[i];
17510 if (M < 0)
17511 continue;
17512 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17513 "Unexpected mask element");
17514 MergedMask[i] = M;
17515 }
17516 };
17517
17518 if (MatchMasks(InLaneMask, RepeatMask)) {
17519 // Merge this lane mask into the final repeat mask.
17520 MergeMasks(InLaneMask, RepeatMask);
17521 continue;
17522 }
17523
17524 // Didn't find a match. Swap the operands and try again.
17525 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17526 ShuffleVectorSDNode::commuteMask(InLaneMask);
17527
17528 if (MatchMasks(InLaneMask, RepeatMask)) {
17529 // Merge this lane mask into the final repeat mask.
17530 MergeMasks(InLaneMask, RepeatMask);
17531 continue;
17532 }
17533
17534 // Couldn't find a match with the operands in either order.
17535 return SDValue();
17536 }
17537
17538 // Now handle any lanes with only one source.
17539 for (int Lane = 0; Lane != NumLanes; ++Lane) {
26. Loop condition is true. Entering loop body
30. Loop condition is false. Execution continues on line 17568
17540 // If this lane has already been processed, skip it.
17541 if (LaneSrcs[Lane][0] >= 0)
27. Assuming the condition is true
28. Taking true branch
17542 continue;
29. Execution continues on line 17539
17543
17544 for (int i = 0; i != NumLaneElts; ++i) {
17545 int M = Mask[(Lane * NumLaneElts) + i];
17546 if (M < 0)
17547 continue;
17548
17549 // If RepeatMask isn't defined yet we can define it ourself.
17550 if (RepeatMask[i] < 0)
17551 RepeatMask[i] = M % NumLaneElts;
17552
17553 if (RepeatMask[i] < NumElts) {
17554 if (RepeatMask[i] != M % NumLaneElts)
17555 return SDValue();
17556 LaneSrcs[Lane][0] = M / NumLaneElts;
17557 } else {
17558 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17559 return SDValue();
17560 LaneSrcs[Lane][1] = M / NumLaneElts;
17561 }
17562 }
17563
17564 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17565 return SDValue();
17566 }
17567
17568 SmallVector<int, 16> NewMask(NumElts, -1);
17569 for (int Lane = 0; Lane != NumLanes; ++Lane) {
31. Loop condition is true. Entering loop body
33. Loop condition is false. Execution continues on line 17578
17570 int Src = LaneSrcs[Lane][0];
17571 for (int i = 0; i != NumLaneElts; ++i) {
32. Loop condition is false. Execution continues on line 17569
17572 int M = -1;
17573 if (Src >= 0)
17574 M = Src * NumLaneElts + i;
17575 NewMask[Lane * NumLaneElts + i] = M;
17576 }
17577 }
17578 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17579 // Ensure we didn't get back the shuffle we started with.
17580 // FIXME: This is a hack to make up for some splat handling code in
17581 // getVectorShuffle.
17582 if (isa<ShuffleVectorSDNode>(NewV1) &&
34. Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
17583 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17584 return SDValue();
17585
17586 for (int Lane = 0; Lane != NumLanes; ++Lane) {
35. Loop condition is true. Entering loop body
37. Loop condition is false. Execution continues on line 17595
17587 int Src = LaneSrcs[Lane][1];
17588 for (int i = 0; i != NumLaneElts; ++i) {
36. Loop condition is false. Execution continues on line 17586
17589 int M = -1;
17590 if (Src >= 0)
17591 M = Src * NumLaneElts + i;
17592 NewMask[Lane * NumLaneElts + i] = M;
17593 }
17594 }
17595 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17596 // Ensure we didn't get back the shuffle we started with.
17597 // FIXME: This is a hack to make up for some splat handling code in
17598 // getVectorShuffle.
17599 if (isa<ShuffleVectorSDNode>(NewV2) &&
38. Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
17600 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17601 return SDValue();
17602
17603 for (int i = 0; i != NumElts; ++i) {
39. Assuming 'i' is not equal to 'NumElts'
40. Loop condition is true. Entering loop body
17604 if (Mask[i] < 0) {
41. Assuming the condition is false
42. Taking false branch
17605 NewMask[i] = -1;
17606 continue;
17607 }
17608 NewMask[i] = RepeatMask[i % NumLaneElts];
43. Division by zero
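// Note on the warning above: NumLaneElts was computed on line 17461 as
// 128 / VT.getScalarSizeInBits(), and on this path the analyzer places no
// constraint on the scalar size, so it treats NumLaneElts == 0 as feasible;
// 'i % NumLaneElts' on line 17608 would then divide by zero. In practice this
// routine appears to be reached only for 256/512-bit vector types whose elements
// are at most 64 bits wide, so NumLaneElts should be at least 2; one way to state
// that invariant explicitly (illustrative sketch only) would be:
//   assert(NumLaneElts > 0 && "Expected whole 128-bit lanes");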
17609 if (NewMask[i] < 0)
17610 continue;
17611
17612 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17613 }
17614 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17615}
17616
17617/// If the input shuffle mask results in a vector that is undefined in all upper
17618/// or lower half elements and that mask accesses only 2 halves of the
17619/// shuffle's operands, return true. A mask of half the width with mask indexes
17620/// adjusted to access the extracted halves of the original shuffle operands is
17621/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17622/// lower half of each input operand is accessed.
17623static bool
17624getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17625 int &HalfIdx1, int &HalfIdx2) {
17626 assert((Mask.size() == HalfMask.size() * 2) &&
17627 "Expected input mask to be twice as long as output");
17628
17629 // Exactly one half of the result must be undef to allow narrowing.
17630 bool UndefLower = isUndefLowerHalf(Mask);
17631 bool UndefUpper = isUndefUpperHalf(Mask);
17632 if (UndefLower == UndefUpper)
17633 return false;
17634
17635 unsigned HalfNumElts = HalfMask.size();
17636 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17637 HalfIdx1 = -1;
17638 HalfIdx2 = -1;
17639 for (unsigned i = 0; i != HalfNumElts; ++i) {
17640 int M = Mask[i + MaskIndexOffset];
17641 if (M < 0) {
17642 HalfMask[i] = M;
17643 continue;
17644 }
17645
17646 // Determine which of the 4 half vectors this element is from.
17647 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17648 int HalfIdx = M / HalfNumElts;
17649
17650 // Determine the element index into its half vector source.
17651 int HalfElt = M % HalfNumElts;
17652
17653 // We can shuffle with up to 2 half vectors, set the new 'half'
17654 // shuffle mask accordingly.
17655 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17656 HalfMask[i] = HalfElt;
17657 HalfIdx1 = HalfIdx;
17658 continue;
17659 }
17660 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17661 HalfMask[i] = HalfElt + HalfNumElts;
17662 HalfIdx2 = HalfIdx;
17663 continue;
17664 }
17665
17666 // Too many half vectors referenced.
17667 return false;
17668 }
17669
17670 return true;
17671}
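// Worked example (hypothetical mask): for an 8-element mask <u,u,u,u,0,1,10,11>
// the lower half of the result is undef, so the loop above reads elements 4..7
// and produces HalfMask = <0,1,6,7> with HalfIdx1 = 0 (lower half of V1) and
// HalfIdx2 = 2 (lower half of V2); the caller can then shuffle just those two
// 128-bit halves and insert the result back at the required offset.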
17672
17673/// Given the output values from getHalfShuffleMask(), create a half width
17674/// shuffle of extracted vectors followed by an insert back to full width.
17675static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17676 ArrayRef<int> HalfMask, int HalfIdx1,
17677 int HalfIdx2, bool UndefLower,
17678 SelectionDAG &DAG, bool UseConcat = false) {
17679 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17680 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17681
17682 MVT VT = V1.getSimpleValueType();
17683 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17684 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17685
17686 auto getHalfVector = [&](int HalfIdx) {
17687 if (HalfIdx < 0)
17688 return DAG.getUNDEF(HalfVT);
17689 SDValue V = (HalfIdx < 2 ? V1 : V2);
17690 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17691 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17692 DAG.getIntPtrConstant(HalfIdx, DL));
17693 };
17694
17695 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17696 SDValue Half1 = getHalfVector(HalfIdx1);
17697 SDValue Half2 = getHalfVector(HalfIdx2);
17698 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17699 if (UseConcat) {
17700 SDValue Op0 = V;
17701 SDValue Op1 = DAG.getUNDEF(HalfVT);
17702 if (UndefLower)
17703 std::swap(Op0, Op1);
17704 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17705 }
17706
17707 unsigned Offset = UndefLower ? HalfNumElts : 0;
17708 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17709 DAG.getIntPtrConstant(Offset, DL));
17710}
17711
17712/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17713/// This allows for fast cases such as subvector extraction/insertion
17714/// or shuffling smaller vector types which can lower more efficiently.
17715static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17716 SDValue V2, ArrayRef<int> Mask,
17717 const X86Subtarget &Subtarget,
17718 SelectionDAG &DAG) {
17719 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17720 "Expected 256-bit or 512-bit vector");
17721
17722 bool UndefLower = isUndefLowerHalf(Mask);
17723 if (!UndefLower && !isUndefUpperHalf(Mask))
17724 return SDValue();
17725
17726 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17727 "Completely undef shuffle mask should have been simplified already");
17728
17729 // Upper half is undef and lower half is whole upper subvector.
17730 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17731 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17732 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17733 if (!UndefLower &&
17734 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17735 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17736 DAG.getIntPtrConstant(HalfNumElts, DL));
17737 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17738 DAG.getIntPtrConstant(0, DL));
17739 }
17740
17741 // Lower half is undef and upper half is whole lower subvector.
17742 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17743 if (UndefLower &&
17744 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17745 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17746 DAG.getIntPtrConstant(0, DL));
17747 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17748 DAG.getIntPtrConstant(HalfNumElts, DL));
17749 }
17750
17751 int HalfIdx1, HalfIdx2;
17752 SmallVector<int, 8> HalfMask(HalfNumElts);
17753 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17754 return SDValue();
17755
17756 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17757
17758 // Only shuffle the halves of the inputs when useful.
17759 unsigned NumLowerHalves =
17760 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17761 unsigned NumUpperHalves =
17762 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17763 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17764
17765 // Determine the larger pattern of undef/halves, then decide if it's worth
17766 // splitting the shuffle based on subtarget capabilities and types.
17767 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17768 if (!UndefLower) {
17769 // XXXXuuuu: no insert is needed.
17770 // Always extract lowers when setting lower - these are all free subreg ops.
17771 if (NumUpperHalves == 0)
17772 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17773 UndefLower, DAG);
17774
17775 if (NumUpperHalves == 1) {
17776 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17777 if (Subtarget.hasAVX2()) {
17778 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17779 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17780 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17781 (!isSingleSHUFPSMask(HalfMask) ||
17782 Subtarget.hasFastVariableCrossLaneShuffle()))
17783 return SDValue();
17784 // If this is a unary shuffle (assume that the 2nd operand is
17785 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17786 // are better off extracting the upper half of 1 operand and using a
17787 // narrow shuffle.
17788 if (EltWidth == 64 && V2.isUndef())
17789 return SDValue();
17790 }
17791 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17792 if (Subtarget.hasAVX512() && VT.is512BitVector())
17793 return SDValue();
17794 // Extract + narrow shuffle is better than the wide alternative.
17795 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17796 UndefLower, DAG);
17797 }
17798
17799 // Don't extract both uppers, instead shuffle and then extract.
17800 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17801 return SDValue();
17802 }
17803
17804 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17805 if (NumUpperHalves == 0) {
17806 // AVX2 has efficient 64-bit element cross-lane shuffles.
17807 // TODO: Refine to account for unary shuffle, splat, and other masks?
17808 if (Subtarget.hasAVX2() && EltWidth == 64)
17809 return SDValue();
17810 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17811 if (Subtarget.hasAVX512() && VT.is512BitVector())
17812 return SDValue();
17813 // Narrow shuffle + insert is better than the wide alternative.
17814 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17815 UndefLower, DAG);
17816 }
17817
17818 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17819 return SDValue();
17820}
17821
17822/// Handle case where shuffle sources are coming from the same 128-bit lane and
17823/// every lane can be represented as the same repeating mask - allowing us to
17824/// shuffle the sources with the repeating shuffle and then permute the result
17825/// to the destination lanes.
17826static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17827 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17828 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17829 int NumElts = VT.getVectorNumElements();
17830 int NumLanes = VT.getSizeInBits() / 128;
17831 int NumLaneElts = NumElts / NumLanes;
17832
17833 // On AVX2 we may be able to just shuffle the lowest elements and then
17834 // broadcast the result.
17835 if (Subtarget.hasAVX2()) {
17836 for (unsigned BroadcastSize : {16, 32, 64}) {
17837 if (BroadcastSize <= VT.getScalarSizeInBits())
17838 continue;
17839 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17840
17841 // Attempt to match a repeating pattern every NumBroadcastElts,
17842 // accounting for UNDEFs but only referencing the lowest 128-bit
17843 // lane of the inputs.
17844 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17845 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17846 for (int j = 0; j != NumBroadcastElts; ++j) {
17847 int M = Mask[i + j];
17848 if (M < 0)
17849 continue;
17850 int &R = RepeatMask[j];
17851 if (0 != ((M % NumElts) / NumLaneElts))
17852 return false;
17853 if (0 <= R && R != M)
17854 return false;
17855 R = M;
17856 }
17857 return true;
17858 };
17859
17860 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17861 if (!FindRepeatingBroadcastMask(RepeatMask))
17862 continue;
17863
17864 // Shuffle the (lowest) repeated elements in place for broadcast.
17865 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17866
17867 // Shuffle the actual broadcast.
17868 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17869 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17870 for (int j = 0; j != NumBroadcastElts; ++j)
17871 BroadcastMask[i + j] = j;
17872 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17873 BroadcastMask);
17874 }
17875 }
17876
17877 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17878 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17879 return SDValue();
17880
17881 // Bail if we already have a repeated lane shuffle mask.
17882 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17883 return SDValue();
17884
17885 // Helper to look for repeated mask in each split sublane, and that those
17886 // sublanes can then be permuted into place.
17887 auto ShuffleSubLanes = [&](int SubLaneScale) {
17888 int NumSubLanes = NumLanes * SubLaneScale;
17889 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17890
17891 // Check that all the sources are coming from the same lane and see if we
17892 // can form a repeating shuffle mask (local to each sub-lane). At the same
17893 // time, determine the source sub-lane for each destination sub-lane.
17894 int TopSrcSubLane = -1;
17895 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17896 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17897 SubLaneScale,
17898 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17899
17900 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17901 // Extract the sub-lane mask, check that it all comes from the same lane
17902 // and normalize the mask entries to come from the first lane.
17903 int SrcLane = -1;
17904 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17905 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17906 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17907 if (M < 0)
17908 continue;
17909 int Lane = (M % NumElts) / NumLaneElts;
17910 if ((0 <= SrcLane) && (SrcLane != Lane))
17911 return SDValue();
17912 SrcLane = Lane;
17913 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17914 SubLaneMask[Elt] = LocalM;
17915 }
17916
17917 // Whole sub-lane is UNDEF.
17918 if (SrcLane < 0)
17919 continue;
17920
17921 // Attempt to match against the candidate repeated sub-lane masks.
17922 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17923 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17924 for (int i = 0; i != NumSubLaneElts; ++i) {
17925 if (M1[i] < 0 || M2[i] < 0)
17926 continue;
17927 if (M1[i] != M2[i])
17928 return false;
17929 }
17930 return true;
17931 };
17932
17933 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17934 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17935 continue;
17936
17937 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17938 for (int i = 0; i != NumSubLaneElts; ++i) {
17939 int M = SubLaneMask[i];
17940 if (M < 0)
17941 continue;
17942 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17943 "Unexpected mask element");
17944 RepeatedSubLaneMask[i] = M;
17945 }
17946
17947 // Track the top most source sub-lane - by setting the remaining to
17948 // UNDEF we can greatly simplify shuffle matching.
17949 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17950 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17951 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17952 break;
17953 }
17954
17955 // Bail if we failed to find a matching repeated sub-lane mask.
17956 if (Dst2SrcSubLanes[DstSubLane] < 0)
17957 return SDValue();
17958 }
17959 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17960 "Unexpected source lane");
17961
17962 // Create a repeating shuffle mask for the entire vector.
17963 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17964 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17965 int Lane = SubLane / SubLaneScale;
17966 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17967 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17968 int M = RepeatedSubLaneMask[Elt];
17969 if (M < 0)
17970 continue;
17971 int Idx = (SubLane * NumSubLaneElts) + Elt;
17972 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17973 }
17974 }
17975
17976 // Shuffle each source sub-lane to its destination.
17977 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17978 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17979 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17980 if (SrcSubLane < 0)
17981 continue;
17982 for (int j = 0; j != NumSubLaneElts; ++j)
17983 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17984 }
17985
17986 // Avoid returning the same shuffle operation.
17987 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17988 if (RepeatedMask == Mask || SubLaneMask == Mask)
17989 return SDValue();
17990
17991 SDValue RepeatedShuffle =
17992 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17993
17994 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17995 SubLaneMask);
17996 };
17997
17998 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17999 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
18000 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
18001 // Otherwise we can only permute whole 128-bit lanes.
18002 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
18003 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
18004 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
18005 MinSubLaneScale = 2;
18006 MaxSubLaneScale =
18007 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
18008 }
18009 if (Subtarget.hasBWI() && VT == MVT::v64i8)
18010 MinSubLaneScale = MaxSubLaneScale = 4;
18011
18012 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
18013 if (SDValue Shuffle = ShuffleSubLanes(Scale))
18014 return Shuffle;
18015
18016 return SDValue();
18017}
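// A minimal standalone sketch of the sub-lane arithmetic exercised by the
// scale loop above (helper name is illustrative only and not part of the
// original file; 128-bit lanes are assumed, as in the surrounding code):
static inline int sketchNumSubLaneElts(int NumElts, int NumLanes, int Scale) {
  int NumLaneElts = NumElts / NumLanes; // elements per 128-bit lane
  return NumLaneElts / Scale;           // elements per repeated sub-lane
}
// e.g. for v32i8 (32 elements, 2 lanes): Scale 2 -> 8 elements (64-bit
// sub-lanes, PERMQ/PERMPD granularity), Scale 4 -> 4 elements (32-bit
// sub-lanes).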
18018
18019static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
18020 bool &ForceV1Zero, bool &ForceV2Zero,
18021 unsigned &ShuffleImm, ArrayRef<int> Mask,
18022 const APInt &Zeroable) {
18023 int NumElts = VT.getVectorNumElements();
18024 assert(VT.getScalarSizeInBits() == 64 &&
18025 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
18026 "Unexpected data type for VSHUFPD");
18027 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
18028 "Illegal shuffle mask");
18029
18030 bool ZeroLane[2] = { true, true };
18031 for (int i = 0; i < NumElts; ++i)
18032 ZeroLane[i & 1] &= Zeroable[i];
18033
18034 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18035 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
18036 ShuffleImm = 0;
18037 bool ShufpdMask = true;
18038 bool CommutableMask = true;
18039 for (int i = 0; i < NumElts; ++i) {
18040 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18041 continue;
18042 if (Mask[i] < 0)
18043 return false;
18044 int Val = (i & 6) + NumElts * (i & 1);
18045 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18046 if (Mask[i] < Val || Mask[i] > Val + 1)
18047 ShufpdMask = false;
18048 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18049 CommutableMask = false;
18050 ShuffleImm |= (Mask[i] % 2) << i;
18051 }
18052
18053 if (!ShufpdMask && !CommutableMask)
18054 return false;
18055
18056 if (!ShufpdMask && CommutableMask)
18057 std::swap(V1, V2);
18058
18059 ForceV1Zero = ZeroLane[0];
18060 ForceV2Zero = ZeroLane[1];
18061 return true;
18062}
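// A minimal standalone sketch of the VSHUFPD immediate accumulated above
// (function name is illustrative only, not from this file): bit i of the
// immediate selects the odd (1) or even (0) double feeding result element i.
static unsigned sketchShufpdImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (Mask[i] % 2) << i; // only the low bit of each index matters
  return Imm;                    // e.g. Mask = {0, 5, 2, 7} -> 0b1010
}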
18063
18064static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18065 SDValue V2, ArrayRef<int> Mask,
18066 const APInt &Zeroable,
18067 const X86Subtarget &Subtarget,
18068 SelectionDAG &DAG) {
18069 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18070 "Unexpected data type for VSHUFPD");
18071
18072 unsigned Immediate = 0;
18073 bool ForceV1Zero = false, ForceV2Zero = false;
18074 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18075 Mask, Zeroable))
18076 return SDValue();
18077
18078 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18079 if (ForceV1Zero)
18080 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18081 if (ForceV2Zero)
18082 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18083
18084 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18085 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18086}
18087
18088 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18089 // by zeroable elements in the remaining 24 elements. Turn this into two
18090 // vmovqb instructions shuffled together.
18091static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18092 SDValue V1, SDValue V2,
18093 ArrayRef<int> Mask,
18094 const APInt &Zeroable,
18095 SelectionDAG &DAG) {
18096 assert(VT == MVT::v32i8 && "Unexpected type!");
18097
18098 // The first 8 indices should be every 8th element.
18099 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18100 return SDValue();
18101
18102 // Remaining elements need to be zeroable.
18103 if (Zeroable.countl_one() < (Mask.size() - 8))
18104 return SDValue();
18105
18106 V1 = DAG.getBitcast(MVT::v4i64, V1);
18107 V2 = DAG.getBitcast(MVT::v4i64, V2);
18108
18109 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18110 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18111
18112 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18113 // the upper bits of the result using an unpckldq.
18114 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18115 { 0, 1, 2, 3, 16, 17, 18, 19,
18116 4, 5, 6, 7, 20, 21, 22, 23 });
18117 // Insert the unpckldq into a zero vector to widen to v32i8.
18118 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18119 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18120 DAG.getIntPtrConstant(0, DL));
18121}
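// A standalone sketch of the pattern checked above (helper and parameter
// names are illustrative only; -1 stands in for an undef mask element): the
// first 8 entries must walk every 8th element and the remaining 24 positions
// must be zeroable.
static bool sketchIsVTruncUnpackMask(const int Mask[32],
                                     const bool Zeroable[32]) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] != -1 && Mask[i] != i * 8)
      return false;
  for (int i = 8; i != 32; ++i)
    if (!Zeroable[i])
      return false;
  return true;
}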
18122
18123// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18124// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18125// =>
18126// ul = unpckl v1, v2
18127// uh = unpckh v1, v2
18128// a = vperm ul, uh
18129// b = vperm ul, uh
18130//
18131// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18132// and permute. We cannot directly match v3 because it is split into two
18133// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18134// pair of 256-bit shuffles and makes sure the masks are consecutive.
18135//
18136// Once unpck and permute nodes are created, the permute corresponding to this
18137// shuffle is returned, while the other permute replaces the other half of the
18138// shuffle in the selection dag.
18139static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18140 SDValue V1, SDValue V2,
18141 ArrayRef<int> Mask,
18142 SelectionDAG &DAG) {
18143 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18144 VT != MVT::v32i8)
18145 return SDValue();
18146 // <B0, B1, B0+1, B1+1, ..., >
18147 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18148 unsigned Begin1) {
18149 size_t Size = Mask.size();
18150 assert(Size % 2 == 0 && "Expected even mask size");
18151 for (unsigned I = 0; I < Size; I += 2) {
18152 if (Mask[I] != (int)(Begin0 + I / 2) ||
18153 Mask[I + 1] != (int)(Begin1 + I / 2))
18154 return false;
18155 }
18156 return true;
18157 };
18158 // Check which half of the interleave this shuffle node is
18159 int NumElts = VT.getVectorNumElements();
18160 size_t FirstQtr = NumElts / 2;
18161 size_t ThirdQtr = NumElts + NumElts / 2;
18162 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18163 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18164 if (!IsFirstHalf && !IsSecondHalf)
18165 return SDValue();
18166
18167 // Find the intersection between shuffle users of V1 and V2.
18168 SmallVector<SDNode *, 2> Shuffles;
18169 for (SDNode *User : V1->uses())
18170 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18171 User->getOperand(1) == V2)
18172 Shuffles.push_back(User);
18173 // Limit user size to two for now.
18174 if (Shuffles.size() != 2)
18175 return SDValue();
18176 // Find out which half of the 512-bit shuffle each smaller shuffle is
18177 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18178 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18179 SDNode *FirstHalf;
18180 SDNode *SecondHalf;
18181 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18182 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18183 FirstHalf = Shuffles[0];
18184 SecondHalf = Shuffles[1];
18185 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18186 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18187 FirstHalf = Shuffles[1];
18188 SecondHalf = Shuffles[0];
18189 } else {
18190 return SDValue();
18191 }
18192 // Lower into unpck and perm. Return the perm of this shuffle and replace
18193 // the other.
18194 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18195 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18196 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18197 DAG.getTargetConstant(0x20, DL, MVT::i8));
18198 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18199 DAG.getTargetConstant(0x31, DL, MVT::i8));
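// (In the VPERM2X128 immediate, bits [1:0] choose the 128-bit half feeding
// the low half of the result and bits [5:4] the high half, with values 0-1
// indexing the first operand's halves and 2-3 the second's; so 0x20
// concatenates the low halves of Unpckl and Unpckh, and 0x31 their high
// halves.)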
18200 if (IsFirstHalf) {
18201 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18202 return Perm1;
18203 }
18204 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18205 return Perm2;
18206}
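// As a concrete worked example (illustrative, for v8i32): the shuffle pair
// with masks {0,8,1,9,2,10,3,11} and {4,12,5,13,6,14,7,15} is rewritten into
// UNPCKL/UNPCKH of V1 and V2 plus the two VPERM2X128 permutes built above,
// which together produce the full 16-element interleave of the two inputs.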
18207
18208/// Handle lowering of 4-lane 64-bit floating point shuffles.
18209///
18210/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18211/// isn't available.
18212static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18213 const APInt &Zeroable, SDValue V1, SDValue V2,
18214 const X86Subtarget &Subtarget,
18215 SelectionDAG &DAG) {
18216 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
1) '?' condition is true
18217 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
2) '?' condition is true
18218 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
3) Assuming the condition is true
4) '?' condition is true
18219
18220 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
5) Taking false branch
18221 Subtarget, DAG))
18222 return V;
18223
18224 if (V2.isUndef()) {
6) Taking false branch
18225 // Check for being able to broadcast a single element.
18226 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18227 Mask, Subtarget, DAG))
18228 return Broadcast;
18229
18230 // Use low duplicate instructions for masks that match their pattern.
18231 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18232 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18233
18234 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18235 // Non-half-crossing single input shuffles can be lowered with an
18236 // interleaved permutation.
18237 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18238 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18239 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18240 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18241 }
18242
18243 // With AVX2 we have direct support for this permutation.
18244 if (Subtarget.hasAVX2())
18245 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18246 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18247
18248 // Try to create an in-lane repeating shuffle mask and then shuffle the
18249 // results into the target lanes.
18250 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18251 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18252 return V;
18253
18254 // Try to permute the lanes and then use a per-lane permute.
18255 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18256 Mask, DAG, Subtarget))
18257 return V;
18258
18259 // Otherwise, fall back.
18260 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18261 DAG, Subtarget);
18262 }
18263
18264 // Use dedicated unpack instructions for masks that match their pattern.
18265 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
7) Taking false branch
18266 return V;
18267
18268 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
8) Taking false branch
18269 Zeroable, Subtarget, DAG))
18270 return Blend;
18271
18272 // Check if the blend happens to exactly fit that of SHUFPD.
18273 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
9) Taking false branch
18274 Zeroable, Subtarget, DAG))
18275 return Op;
18276
18277 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18278 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18279
18280 // If we have lane crossing shuffles AND they don't all come from the lower
18281 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18282 // TODO: Handle BUILD_VECTOR sources, which getVectorShuffle currently
18283 // canonicalizes to a blend of splats that isn't necessary for this combine.
18284 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
10) Assuming the condition is false
18285 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18286 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18287 (V2.getOpcode() != ISD::BUILD_VECTOR))
18288 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18289
18290 // If we have one input in place, then we can permute the other input and
18291 // blend the result.
18292 if (V1IsInPlace || V2IsInPlace)
10.1) 'V1IsInPlace' is false
10.2) 'V2IsInPlace' is false
11) Taking false branch
18293 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18294 Subtarget, DAG);
18295
18296 // Try to create an in-lane repeating shuffle mask and then shuffle the
18297 // results into the target lanes.
18298 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18299 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18300 return V;
18301
18302 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18303 // shuffle. However, if we have AVX2 and either input is already in place,
18304 // we will be able to shuffle the other input even across lanes in a single
18305 // instruction, so skip this pattern.
18306 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
11.1) 'V1IsInPlace' is false
12) Taking true branch
18307 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
13) Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
18308 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18309 return V;
18310
18311 // If we have VLX support, we can use VEXPAND.
18312 if (Subtarget.hasVLX())
18313 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18314 DAG, Subtarget))
18315 return V;
18316
18317 // If we have AVX2 then we always want to lower with a blend because at v4 we
18318 // can fully permute the elements.
18319 if (Subtarget.hasAVX2())
18320 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18321 Subtarget, DAG);
18322
18323 // Otherwise fall back on generic lowering.
18324 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18325 Subtarget, DAG);
18326}
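// A minimal standalone sketch of the VPERMILPD immediate built in the
// non-lane-crossing single-input case above (helper name is illustrative
// only): bit i picks the high (1) or low (0) double of the 128-bit lane that
// feeds result element i.
static unsigned sketchVPermilPDImm(const int Mask[4]) {
  return (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
         ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
}
// e.g. Mask = {1, 0, 3, 2} (swap within each lane) -> 0b0101.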
18327
18328/// Handle lowering of 4-lane 64-bit integer shuffles.
18329///
18330/// This routine is only called when we have AVX2 and thus a reasonable
18331/// instruction set for v4i64 shuffling.
18332static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18333 const APInt &Zeroable, SDValue V1, SDValue V2,
18334 const X86Subtarget &Subtarget,
18335 SelectionDAG &DAG) {
18336 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18337 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18338 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18339 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18340
18341 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18342 Subtarget, DAG))
18343 return V;
18344
18345 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18346 Zeroable, Subtarget, DAG))
18347 return Blend;
18348
18349 // Check for being able to broadcast a single element.
18350 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18351 Subtarget, DAG))
18352 return Broadcast;
18353
18354 // Try to use shift instructions if fast.
18355 if (Subtarget.preferLowerShuffleAsShift())
18356 if (SDValue Shift =
18357 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18358 Subtarget, DAG, /*BitwiseOnly*/ true))
18359 return Shift;
18360
18361 if (V2.isUndef()) {
18362 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18363 // can use lower latency instructions that will operate on both lanes.
18364 SmallVector<int, 2> RepeatedMask;
18365 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18366 SmallVector<int, 4> PSHUFDMask;
18367 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18368 return DAG.getBitcast(
18369 MVT::v4i64,
18370 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18371 DAG.getBitcast(MVT::v8i32, V1),
18372 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18373 }
18374
18375 // AVX2 provides a direct instruction for permuting a single input across
18376 // lanes.
18377 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18378 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18379 }
18380
18381 // Try to use shift instructions.
18382 if (SDValue Shift =
18383 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18384 DAG, /*BitwiseOnly*/ false))
18385 return Shift;
18386
18387 // If we have VLX support, we can use VALIGN or VEXPAND.
18388 if (Subtarget.hasVLX()) {
18389 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18390 Subtarget, DAG))
18391 return Rotate;
18392
18393 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18394 DAG, Subtarget))
18395 return V;
18396 }
18397
18398 // Try to use PALIGNR.
18399 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18400 Subtarget, DAG))
18401 return Rotate;
18402
18403 // Use dedicated unpack instructions for masks that match their pattern.
18404 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18405 return V;
18406
18407 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18408 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18409
18410 // If we have one input in place, then we can permute the other input and
18411 // blend the result.
18412 if (V1IsInPlace || V2IsInPlace)
18413 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18414 Subtarget, DAG);
18415
18416 // Try to create an in-lane repeating shuffle mask and then shuffle the
18417 // results into the target lanes.
18418 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18419 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18420 return V;
18421
18422 // Try to lower to PERMQ(BLENDD(V1,V2)).
18423 if (SDValue V =
18424 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18425 return V;
18426
18427 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18428 // shuffle. However, if we have AVX2 and either input is already in place,
18429 // we will be able to shuffle the other input even across lanes in a single
18430 // instruction, so skip this pattern.
18431 if (!V1IsInPlace && !V2IsInPlace)
18432 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18433 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18434 return Result;
18435
18436 // Otherwise fall back on generic blend lowering.
18437 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18438 Subtarget, DAG);
18439}
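// A minimal standalone sketch (assumed semantics, names illustrative only) of
// how the repeated 2-element v4i64 mask is narrowed into the 4-element PSHUFD
// mask used above: each 64-bit index i expands to the 32-bit indices 2*i and
// 2*i+1, with undef (-1) staying undef.
static void sketchNarrowMaskBy2(const int Repeated2[2], int PSHUFD4[4]) {
  for (int i = 0; i != 2; ++i) {
    PSHUFD4[2 * i + 0] = Repeated2[i] < 0 ? -1 : 2 * Repeated2[i] + 0;
    PSHUFD4[2 * i + 1] = Repeated2[i] < 0 ? -1 : 2 * Repeated2[i] + 1;
  }
}
// e.g. {1, 0} -> {2, 3, 0, 1}, which PSHUFD encodes directly as an immediate.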
18440
18441/// Handle lowering of 8-lane 32-bit floating point shuffles.
18442///
18443/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18444/// isn't available.
18445static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18446 const APInt &Zeroable, SDValue V1, SDValue V2,
18447 const X86Subtarget &Subtarget,
18448 SelectionDAG &DAG) {
18449 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18450 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18451 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18452
18453 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
18454 Zeroable, Subtarget, DAG))
18455 return Blend;
18456
18457 // Check for being able to broadcast a single element.
18458 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
18459 Subtarget, DAG))
18460 return Broadcast;
18461
18462 if (!Subtarget.hasAVX2()) {
18463 SmallVector<int> InLaneMask;
18464 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18465
18466 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
18467 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18468 /*SimpleOnly*/ true))
18469 return R;
18470 }
18471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18472 Zeroable, Subtarget, DAG))
18473 return DAG.getBitcast(MVT::v8f32, ZExt);
18474
18475 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18476 // options to efficiently lower the shuffle.
18477 SmallVector<int, 4> RepeatedMask;
18478 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
18479 assert(RepeatedMask.size() == 4 &&
18480 "Repeated masks must be half the mask width!");
18481
18482 // Use even/odd duplicate instructions for masks that match their pattern.
18483 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18484 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18485 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18486 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
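// (For reference: the repeated mask {0,0,2,2} corresponds to the full v8f32
// mask {0,0,2,2,4,4,6,6}, which is exactly what MOVSLDUP produces, and
// {1,1,3,3} to {1,1,3,3,5,5,7,7}, matching MOVSHDUP.)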
18487
18488 if (V2.isUndef())
18489 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18490 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18491
18492 // Use dedicated unpack instructions for masks that match their pattern.
18493 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18494 return V;
18495
18496 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18497 // have already handled any direct blends.
18498 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18499 }
18500
18501 // Try to create an in-lane repeating shuffle mask and then shuffle the
18502 // results into the target lanes.
18503 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18504 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18505 return V;
18506
18507 // If we have a single input shuffle with different shuffle patterns in the
18508 // two 128-bit lanes use the variable mask to VPERMILPS.
18509 if (V2.isUndef()) {
18510 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18511 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18512 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18513 }
18514 if (Subtarget.hasAVX2()) {
18515 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18516 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18517 }
18518 // Otherwise, fall back.
18519 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18520 DAG, Subtarget);
18521 }
18522
18523 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18524 // shuffle.
18525 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18526 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18527 return Result;
18528
18529 // If we have VLX support, we can use VEXPAND.
18530 if (Subtarget.hasVLX())
18531 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18532 DAG, Subtarget))
18533 return V;
18534
18535 // Try to match an interleave of two v8f32s and lower them as unpck and
18536 // permutes using ymms. This needs to go before we try to split the vectors.
18537 //
18538 // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
18539 // this path inadvertently.
18540 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18541 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18542 Mask, DAG))
18543 return V;
18544
18545 // For non-AVX512, if the mask is of in-lane 16-bit elements then try to split,
18546 // since after the split we get more efficient code using vpunpcklwd and
18547 // vpunpckhwd instructions than with vblend.
18548 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18549 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18550 DAG);
18551
18552 // If we have AVX2 then we always want to lower with a blend because at v8 we
18553 // can fully permute the elements.
18554 if (Subtarget.hasAVX2())
18555 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18556 Subtarget, DAG);
18557
18558 // Otherwise fall back on generic lowering.
18559 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18560 Subtarget, DAG);
18561}
18562
18563/// Handle lowering of 8-lane 32-bit integer shuffles.
18564///
18565/// This routine is only called when we have AVX2 and thus a reasonable
18566/// instruction set for v8i32 shuffling.
18567static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18568 const APInt &Zeroable, SDValue V1, SDValue V2,
18569 const X86Subtarget &Subtarget,
18570 SelectionDAG &DAG) {
18571 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18572 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18573 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18574 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18575
18576 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18577
18578 // Whenever we can lower this as a zext, that instruction is strictly faster
18579 // than any alternative. It also allows us to fold memory operands into the
18580 // shuffle in many cases.
18581 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18582 Zeroable, Subtarget, DAG))
18583 return ZExt;
18584
18585 // Try to match an interleave of two v8i32s and lower them as unpck and
18586 // permutes using ymms. This needs to go before we try to split the vectors.
18587 if (!Subtarget.hasAVX512())
18588 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18589 Mask, DAG))
18590 return V;
18591
18592 // For non-AVX512, if the mask is of in-lane 16-bit elements then try to split,
18593 // since after the split we get more efficient code than with vblend by using
18594 // vpunpcklwd and vpunpckhwd instructions.
18595 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18596 !Subtarget.hasAVX512())
18597 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18598 DAG);
18599
18600 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18601 Zeroable, Subtarget, DAG))
18602 return Blend;
18603
18604 // Check for being able to broadcast a single element.
18605 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18606 Subtarget, DAG))
18607 return Broadcast;
18608
18609 // Try to use shift instructions if fast.
18610 if (Subtarget.preferLowerShuffleAsShift()) {
18611 if (SDValue Shift =
18612 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18613 Subtarget, DAG, /*BitwiseOnly*/ true))
18614 return Shift;
18615 if (NumV2Elements == 0)
18616 if (SDValue Rotate =
18617 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18618 return Rotate;
18619 }
18620
18621 // If the shuffle mask is repeated in each 128-bit lane we can use more
18622 // efficient instructions that mirror the shuffles across the two 128-bit
18623 // lanes.
18624 SmallVector<int, 4> RepeatedMask;
18625 bool Is128BitLaneRepeatedShuffle =
18626 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18627 if (Is128BitLaneRepeatedShuffle) {
18628 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18629 if (V2.isUndef())
18630 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18631 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18632
18633 // Use dedicated unpack instructions for masks that match their pattern.
18634 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18635 return V;
18636 }
18637
18638 // Try to use shift instructions.
18639 if (SDValue Shift =
18640 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18641 DAG, /*BitwiseOnly*/ false))
18642 return Shift;
18643
18644 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18645 if (SDValue Rotate =
18646 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18647 return Rotate;
18648
18649 // If we have VLX support, we can use VALIGN or EXPAND.
18650 if (Subtarget.hasVLX()) {
18651 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18652 Subtarget, DAG))
18653 return Rotate;
18654
18655 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18656 DAG, Subtarget))
18657 return V;
18658 }
18659
18660 // Try to use byte rotation instructions.
18661 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18662 Subtarget, DAG))
18663 return Rotate;
18664
18665 // Try to create an in-lane repeating shuffle mask and then shuffle the
18666 // results into the target lanes.
18667 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18668 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18669 return V;
18670
18671 if (V2.isUndef()) {
18672 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18673 // because that should be faster than the variable permute alternatives.
18674 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18675 return V;
18676
18677 // If the shuffle patterns aren't repeated but it's a single input, directly
18678 // generate a cross-lane VPERMD instruction.
18679 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18680 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18681 }
18682
18683 // Assume that a single SHUFPS is faster than an alternative sequence of
18684 // multiple instructions (even if the CPU has a domain penalty).
18685 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18686 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18687 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18688 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18689 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18690 CastV1, CastV2, DAG);
18691 return DAG.getBitcast(MVT::v8i32, ShufPS);
18692 }
18693
18694 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18695 // shuffle.
18696 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18697 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18698 return Result;
18699
18700 // Otherwise fall back on generic blend lowering.
18701 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18702 Subtarget, DAG);
18703}
18704
18705/// Handle lowering of 16-lane 16-bit integer shuffles.
18706///
18707/// This routine is only called when we have AVX2 and thus a reasonable
18708/// instruction set for v16i16 shuffling.
18709static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18710 const APInt &Zeroable, SDValue V1, SDValue V2,
18711 const X86Subtarget &Subtarget,
18712 SelectionDAG &DAG) {
18713 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18714 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18715 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18716 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18717
18718 // Whenever we can lower this as a zext, that instruction is strictly faster
18719 // than any alternative. It also allows us to fold memory operands into the
18720 // shuffle in many cases.
18721 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18722 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18723 return ZExt;
18724
18725 // Check for being able to broadcast a single element.
18726 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18727 Subtarget, DAG))
18728 return Broadcast;
18729
18730 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18731 Zeroable, Subtarget, DAG))
18732 return Blend;
18733
18734 // Use dedicated unpack instructions for masks that match their pattern.
18735 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18736 return V;
18737
18738 // Use dedicated pack instructions for masks that match their pattern.
18739 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18740 Subtarget))
18741 return V;
18742
18743 // Try to lower using a truncation.
18744 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18745 Subtarget, DAG))
18746 return V;
18747
18748 // Try to use shift instructions.
18749 if (SDValue Shift =
18750 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18751 Subtarget, DAG, /*BitwiseOnly*/ false))
18752 return Shift;
18753
18754 // Try to use byte rotation instructions.
18755 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18756 Subtarget, DAG))
18757 return Rotate;
18758
18759 // Try to create an in-lane repeating shuffle mask and then shuffle the
18760 // results into the target lanes.
18761 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18762 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18763 return V;
18764
18765 if (V2.isUndef()) {
18766 // Try to use bit rotation instructions.
18767 if (SDValue Rotate =
18768 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18769 return Rotate;
18770
18771 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18772 // because that should be faster than the variable permute alternatives.
18773 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18774 return V;
18775
18776 // There are no generalized cross-lane shuffle operations available on i16
18777 // element types.
18778 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18779 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18780 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18781 return V;
18782
18783 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18784 DAG, Subtarget);
18785 }
18786
18787 SmallVector<int, 8> RepeatedMask;
18788 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18789 // As this is a single-input shuffle, the repeated mask should be
18790 // a strictly valid v8i16 mask that we can pass through to the v8i16
18791 // lowering to handle even the v16 case.
18792 return lowerV8I16GeneralSingleInputShuffle(
18793 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18794 }
18795 }
18796
18797 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18798 Zeroable, Subtarget, DAG))
18799 return PSHUFB;
18800
18801 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18802 if (Subtarget.hasBWI())
18803 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18804
18805 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18806 // shuffle.
18807 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18808 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18809 return Result;
18810
18811 // Try to permute the lanes and then use a per-lane permute.
18812 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18813 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18814 return V;
18815
18816 // Try to match an interleave of two v16i16s and lower them as unpck and
18817 // permutes using ymms.
18818 if (!Subtarget.hasAVX512())
18819 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18820 Mask, DAG))
18821 return V;
18822
18823 // Otherwise fall back on generic lowering.
18824 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18825 Subtarget, DAG);
18826}
18827
18828/// Handle lowering of 32-lane 8-bit integer shuffles.
18829///
18830/// This routine is only called when we have AVX2 and thus a reasonable
18831/// instruction set for v32i8 shuffling.
18832static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18833 const APInt &Zeroable, SDValue V1, SDValue V2,
18834 const X86Subtarget &Subtarget,
18835 SelectionDAG &DAG) {
18836 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18837 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18838 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18839 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18840
18841 // Whenever we can lower this as a zext, that instruction is strictly faster
18842 // than any alternative. It also allows us to fold memory operands into the
18843 // shuffle in many cases.
18844 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18845 Zeroable, Subtarget, DAG))
18846 return ZExt;
18847
18848 // Check for being able to broadcast a single element.
18849 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18850 Subtarget, DAG))
18851 return Broadcast;
18852
18853 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18854 Zeroable, Subtarget, DAG))
18855 return Blend;
18856
18857 // Use dedicated unpack instructions for masks that match their pattern.
18858 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18859 return V;
18860
18861 // Use dedicated pack instructions for masks that match their pattern.
18862 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18863 Subtarget))
18864 return V;
18865
18866 // Try to lower using a truncation.
18867 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18868 Subtarget, DAG))
18869 return V;
18870
18871 // Try to use shift instructions.
18872 if (SDValue Shift =
18873 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18874 DAG, /*BitwiseOnly*/ false))
18875 return Shift;
18876
18877 // Try to use byte rotation instructions.
18878 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18879 Subtarget, DAG))
18880 return Rotate;
18881
18882 // Try to use bit rotation instructions.
18883 if (V2.isUndef())
18884 if (SDValue Rotate =
18885 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18886 return Rotate;
18887
18888 // Try to create an in-lane repeating shuffle mask and then shuffle the
18889 // results into the target lanes.
18890 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18891 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18892 return V;
18893
18894 // There are no generalized cross-lane shuffle operations available on i8
18895 // element types.
18896 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18897 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18898 // because that should be faster than the variable permute alternatives.
18899 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18900 return V;
18901
18902 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18903 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18904 return V;
18905
18906 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18907 DAG, Subtarget);
18908 }
18909
18910 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18911 Zeroable, Subtarget, DAG))
18912 return PSHUFB;
18913
18914 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18915 if (Subtarget.hasVBMI())
18916 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18917
18918 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18919 // shuffle.
18920 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18921 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18922 return Result;
18923
18924 // Try to permute the lanes and then use a per-lane permute.
18925 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18926 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18927 return V;
18928
18929 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18930 // by zeroable elements in the remaining 24 elements. Turn this into two
18931 // vmovqb instructions shuffled together.
18932 if (Subtarget.hasVLX())
18933 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18934 Mask, Zeroable, DAG))
18935 return V;
18936
18937 // Try to match an interleave of two v32i8s and lower them as unpck and
18938 // permutes using ymms.
18939 if (!Subtarget.hasAVX512())
18940 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18941 Mask, DAG))
18942 return V;
18943
18944 // Otherwise fall back on generic lowering.
18945 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18946 Subtarget, DAG);
18947}
18948
18949/// High-level routine to lower various 256-bit x86 vector shuffles.
18950///
18951/// This routine either breaks down the specific type of a 256-bit x86 vector
18952/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18953/// together based on the available instructions.
18954static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18955 SDValue V1, SDValue V2, const APInt &Zeroable,
18956 const X86Subtarget &Subtarget,
18957 SelectionDAG &DAG) {
18958 // If we have a single input to the zero element, insert that into V1 if we
18959 // can do so cheaply.
18960 int NumElts = VT.getVectorNumElements();
18961 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18962
18963 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18964 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18965 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18966 return Insertion;
18967
18968 // Handle special cases where the lower or upper half is UNDEF.
18969 if (SDValue V =
18970 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18971 return V;
18972
18973 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18974 // can check for those subtargets here and avoid much of the subtarget
18975 // querying in the per-vector-type lowering routines. With AVX1 we have
18976 // essentially *zero* ability to manipulate a 256-bit vector with integer
18977 // types. Since we'll use floating point types there eventually, just
18978 // immediately cast everything to a float and operate entirely in that domain.
18979 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18980 int ElementBits = VT.getScalarSizeInBits();
18981 if (ElementBits < 32) {
18982 // No floating point type available, if we can't use the bit operations
18983 // for masking/blending then decompose into 128-bit vectors.
18984 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18985 Subtarget, DAG))
18986 return V;
18987 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18988 return V;
18989 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18990 }
18991
18992 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18993 VT.getVectorNumElements());
18994 V1 = DAG.getBitcast(FpVT, V1);
18995 V2 = DAG.getBitcast(FpVT, V2);
18996 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18997 }
18998
18999 if (VT == MVT::v16f16) {
19000 V1 = DAG.getBitcast(MVT::v16i16, V1);
19001 V2 = DAG.getBitcast(MVT::v16i16, V2);
19002 return DAG.getBitcast(MVT::v16f16,
19003 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
19004 }
19005
19006 switch (VT.SimpleTy) {
19007 case MVT::v4f64:
19008 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19009 case MVT::v4i64:
19010 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19011 case MVT::v8f32:
19012 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19013 case MVT::v8i32:
19014 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19015 case MVT::v16i16:
19016 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19017 case MVT::v32i8:
19018 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19019
19020 default:
19021 llvm_unreachable("Not a valid 256-bit x86 vector type!");
19022 }
19023}
19024
19025/// Try to lower a vector shuffle as 128-bit shuffles.
19026static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
19027 const APInt &Zeroable, SDValue V1, SDValue V2,
19028 const X86Subtarget &Subtarget,
19029 SelectionDAG &DAG) {
19030 assert(VT.getScalarSizeInBits() == 64 &&
19031 "Unexpected element type size for 128bit shuffle.");
19032
19033 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
19034 // most probably the better solution for that case.
19035 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19036
19037 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19038 SmallVector<int, 4> Widened128Mask;
19039 if (!canWidenShuffleElements(Mask, Widened128Mask))
19040 return SDValue();
19041 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19042
19043 // Try to use an insert into a zero vector.
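// (Zeroable has one bit per 64-bit element of the 512-bit type: 0xf0 set
// means elements 4-7 - the upper 256 bits - are zeroable, and 0x0c means
// elements 2-3 are as well, in which case only the low 128 bits of V1 need
// to be kept.)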
19044 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19045 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19046 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19047 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19048 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19049 DAG.getIntPtrConstant(0, DL));
19050 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19051 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19052 DAG.getIntPtrConstant(0, DL));
19053 }
19054
19055 // Check for patterns which can be matched with a single insert of a 256-bit
19056 // subvector.
19057 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19058 if (OnlyUsesV1 ||
19059 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19060 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19061 SDValue SubVec =
19062 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19063 DAG.getIntPtrConstant(0, DL));
19064 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19065 DAG.getIntPtrConstant(4, DL));
19066 }
19067
19068 // See if this is an insertion of the lower 128-bits of V2 into V1.
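  // For example, Widened128Mask {0, 1, 4, 3} keeps every V1 lane in place and
  // references only V2's lowest 128 bits (value 4), so V2Index becomes 2 and
  // we insert V2's low two elements into V1 at element offset 4.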
19069 bool IsInsert = true;
19070 int V2Index = -1;
19071 for (int i = 0; i < 4; ++i) {
19072    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19073 if (Widened128Mask[i] < 0)
19074 continue;
19075
19076 // Make sure all V1 subvectors are in place.
19077 if (Widened128Mask[i] < 4) {
19078 if (Widened128Mask[i] != i) {
19079 IsInsert = false;
19080 break;
19081 }
19082 } else {
19083      // Make sure we only have a single V2 index and it's the lowest 128 bits.
19084 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19085 IsInsert = false;
19086 break;
19087 }
19088 V2Index = i;
19089 }
19090 }
19091 if (IsInsert && V2Index >= 0) {
19092 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19093 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19094 DAG.getIntPtrConstant(0, DL));
19095 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19096 }
19097
19098  // See if we can widen to a 256-bit lane shuffle; we're going to lose the
19099  // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
19100  // widening where possible we at least ensure the lanes stay sequential to
19101  // help later combines.
19102 SmallVector<int, 2> Widened256Mask;
19103 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19104 Widened128Mask.clear();
19105 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19106 }
19107
19108 // Try to lower to vshuf64x2/vshuf32x4.
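  // For example, Widened128Mask {1, 0, 5, 4} takes its two low result lanes
  // from V1 and its two high result lanes from V2, giving PermMask 0x11 below
  // (lane 1 then lane 0 of each source operand).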
19109 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19110 unsigned PermMask = 0;
19111  // Ensure all elements came from the same Op.
19112 for (int i = 0; i < 4; ++i) {
19113    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19114 if (Widened128Mask[i] < 0)
19115 continue;
19116
19117 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19118 unsigned OpIndex = i / 2;
19119 if (Ops[OpIndex].isUndef())
19120 Ops[OpIndex] = Op;
19121 else if (Ops[OpIndex] != Op)
19122 return SDValue();
19123
19124 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19125 // bits defined by a vshuf64x2 instruction's immediate control byte.
19126 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19127 }
19128
19129 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19130 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19131}
19132
19133/// Handle lowering of 8-lane 64-bit floating point shuffles.
19134static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19135 const APInt &Zeroable, SDValue V1, SDValue V2,
19136 const X86Subtarget &Subtarget,
19137 SelectionDAG &DAG) {
19138  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19139  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19140  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19141
19142 if (V2.isUndef()) {
19143 // Use low duplicate instructions for masks that match their pattern.
19144 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19145 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19146
19147 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19148 // Non-half-crossing single input shuffles can be lowered with an
19149 // interleaved permutation.
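      // For example, Mask {1, 0, 3, 2, 5, 4, 7, 6} sets bits 0, 2, 4 and 6
      // below, giving the immediate 0b01010101 (0x55), i.e. swap the two
      // elements within every 128-bit lane.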
19150 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19151 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19152 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19153 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19154 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19155 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19156 }
19157
19158 SmallVector<int, 4> RepeatedMask;
19159 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19160 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19161 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19162 }
19163
19164 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19165 V2, Subtarget, DAG))
19166 return Shuf128;
19167
19168 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19169 return Unpck;
19170
19171 // Check if the blend happens to exactly fit that of SHUFPD.
19172 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19173 Zeroable, Subtarget, DAG))
19174 return Op;
19175
19176 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19177 DAG, Subtarget))
19178 return V;
19179
19180 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19181 Zeroable, Subtarget, DAG))
19182 return Blend;
19183
19184 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19185}
19186
19187/// Handle lowering of 16-lane 32-bit floating point shuffles.
19188static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19189 const APInt &Zeroable, SDValue V1, SDValue V2,
19190 const X86Subtarget &Subtarget,
19191 SelectionDAG &DAG) {
19192  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19193  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19194  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19195
19196 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19197 // options to efficiently lower the shuffle.
19198 SmallVector<int, 4> RepeatedMask;
19199 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19200    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19201
19202 // Use even/odd duplicate instructions for masks that match their pattern.
19203 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19204 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19205 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19206 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19207
19208 if (V2.isUndef())
19209 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19210 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19211
19212 // Use dedicated unpack instructions for masks that match their pattern.
19213 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19214 return V;
19215
19216 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19217 Zeroable, Subtarget, DAG))
19218 return Blend;
19219
19220 // Otherwise, fall back to a SHUFPS sequence.
19221 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19222 }
19223
19224 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19225 Zeroable, Subtarget, DAG))
19226 return Blend;
19227
19228 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19229 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19230 return DAG.getBitcast(MVT::v16f32, ZExt);
19231
19232 // Try to create an in-lane repeating shuffle mask and then shuffle the
19233 // results into the target lanes.
19234 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19235 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19236 return V;
19237
19238 // If we have a single input shuffle with different shuffle patterns in the
19239 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19240 if (V2.isUndef() &&
19241 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19242 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19243 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19244 }
19245
19246 // If we have AVX512F support, we can use VEXPAND.
19247 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19248 V1, V2, DAG, Subtarget))
19249 return V;
19250
19251 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19252}
19253
19254/// Handle lowering of 8-lane 64-bit integer shuffles.
19255static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19256 const APInt &Zeroable, SDValue V1, SDValue V2,
19257 const X86Subtarget &Subtarget,
19258 SelectionDAG &DAG) {
19259  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19260  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19261  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19262
19263 // Try to use shift instructions if fast.
19264 if (Subtarget.preferLowerShuffleAsShift())
19265 if (SDValue Shift =
19266 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19267 Subtarget, DAG, /*BitwiseOnly*/ true))
19268 return Shift;
19269
19270 if (V2.isUndef()) {
19271    // When the shuffle is repeated in each 128-bit lane, we can use
19272    // lower-latency instructions that operate uniformly on all four
19273    // 128-bit lanes.
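    // For example, the v8i64 mask {1, 0, 3, 2, 5, 4, 7, 6} repeats {1, 0} in
    // every 128-bit lane and lowers to a v16i32 PSHUFD with mask {2, 3, 0, 1}
    // (immediate 0x4E) on the bitcast input.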
19274 SmallVector<int, 2> Repeated128Mask;
19275 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19276 SmallVector<int, 4> PSHUFDMask;
19277 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19278 return DAG.getBitcast(
19279 MVT::v8i64,
19280 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19281 DAG.getBitcast(MVT::v16i32, V1),
19282 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19283 }
19284
19285 SmallVector<int, 4> Repeated256Mask;
19286 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19287 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19288 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19289 }
19290
19291 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19292 V2, Subtarget, DAG))
19293 return Shuf128;
19294
19295 // Try to use shift instructions.
19296 if (SDValue Shift =
19297 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19298 DAG, /*BitwiseOnly*/ false))
19299 return Shift;
19300
19301 // Try to use VALIGN.
19302 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19303 Subtarget, DAG))
19304 return Rotate;
19305
19306 // Try to use PALIGNR.
19307 if (Subtarget.hasBWI())
19308 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19309 Subtarget, DAG))
19310 return Rotate;
19311
19312 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19313 return Unpck;
19314
19315 // If we have AVX512F support, we can use VEXPAND.
19316 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19317 DAG, Subtarget))
19318 return V;
19319
19320 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19321 Zeroable, Subtarget, DAG))
19322 return Blend;
19323
19324 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19325}
19326
19327/// Handle lowering of 16-lane 32-bit integer shuffles.
19328static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19329 const APInt &Zeroable, SDValue V1, SDValue V2,
19330 const X86Subtarget &Subtarget,
19331 SelectionDAG &DAG) {
19332  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19333  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19334  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19335
19336 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19337
19338 // Whenever we can lower this as a zext, that instruction is strictly faster
19339 // than any alternative. It also allows us to fold memory operands into the
19340 // shuffle in many cases.
19341 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19342 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19343 return ZExt;
19344
19345 // Try to use shift instructions if fast.
19346 if (Subtarget.preferLowerShuffleAsShift()) {
19347 if (SDValue Shift =
19348 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19349 Subtarget, DAG, /*BitwiseOnly*/ true))
19350 return Shift;
19351 if (NumV2Elements == 0)
19352 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19353 Subtarget, DAG))
19354 return Rotate;
19355 }
19356
19357 // If the shuffle mask is repeated in each 128-bit lane we can use more
19358 // efficient instructions that mirror the shuffles across the four 128-bit
19359 // lanes.
19360 SmallVector<int, 4> RepeatedMask;
19361 bool Is128BitLaneRepeatedShuffle =
19362 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19363 if (Is128BitLaneRepeatedShuffle) {
19364    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19365 if (V2.isUndef())
19366 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19367 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19368
19369 // Use dedicated unpack instructions for masks that match their pattern.
19370 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19371 return V;
19372 }
19373
19374 // Try to use shift instructions.
19375 if (SDValue Shift =
19376 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19377 Subtarget, DAG, /*BitwiseOnly*/ false))
19378 return Shift;
19379
19380 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19381 if (SDValue Rotate =
19382 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19383 return Rotate;
19384
19385 // Try to use VALIGN.
19386 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19387 Subtarget, DAG))
19388 return Rotate;
19389
19390 // Try to use byte rotation instructions.
19391 if (Subtarget.hasBWI())
19392 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19393 Subtarget, DAG))
19394 return Rotate;
19395
19396 // Assume that a single SHUFPS is faster than using a permv shuffle.
19397 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19398 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19399 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19400 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19401 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19402 CastV1, CastV2, DAG);
19403 return DAG.getBitcast(MVT::v16i32, ShufPS);
19404 }
19405
19406 // Try to create an in-lane repeating shuffle mask and then shuffle the
19407 // results into the target lanes.
19408 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19409 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19410 return V;
19411
19412 // If we have AVX512F support, we can use VEXPAND.
19413 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19414 DAG, Subtarget))
19415 return V;
19416
19417 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19418 Zeroable, Subtarget, DAG))
19419 return Blend;
19420
19421 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19422}
19423
19424/// Handle lowering of 32-lane 16-bit integer shuffles.
19425static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19426 const APInt &Zeroable, SDValue V1, SDValue V2,
19427 const X86Subtarget &Subtarget,
19428 SelectionDAG &DAG) {
19429  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19430  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19431  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19432  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19433
19434 // Whenever we can lower this as a zext, that instruction is strictly faster
19435 // than any alternative. It also allows us to fold memory operands into the
19436 // shuffle in many cases.
19437 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19438 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19439 return ZExt;
19440
19441 // Use dedicated unpack instructions for masks that match their pattern.
19442 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19443 return V;
19444
19445 // Use dedicated pack instructions for masks that match their pattern.
19446 if (SDValue V =
19447 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19448 return V;
19449
19450 // Try to use shift instructions.
19451 if (SDValue Shift =
19452 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19453 Subtarget, DAG, /*BitwiseOnly*/ false))
19454 return Shift;
19455
19456 // Try to use byte rotation instructions.
19457 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19458 Subtarget, DAG))
19459 return Rotate;
19460
19461 if (V2.isUndef()) {
19462 // Try to use bit rotation instructions.
19463 if (SDValue Rotate =
19464 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19465 return Rotate;
19466
19467 SmallVector<int, 8> RepeatedMask;
19468 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19469 // As this is a single-input shuffle, the repeated mask should be
19470 // a strictly valid v8i16 mask that we can pass through to the v8i16
19471 // lowering to handle even the v32 case.
19472 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19473 RepeatedMask, Subtarget, DAG);
19474 }
19475 }
19476
19477 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19478 Zeroable, Subtarget, DAG))
19479 return Blend;
19480
19481 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19482 Zeroable, Subtarget, DAG))
19483 return PSHUFB;
19484
19485 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19486}
19487
19488/// Handle lowering of 64-lane 8-bit integer shuffles.
19489static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19490 const APInt &Zeroable, SDValue V1, SDValue V2,
19491 const X86Subtarget &Subtarget,
19492 SelectionDAG &DAG) {
19493  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19494  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19495  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19496  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19497
19498 // Whenever we can lower this as a zext, that instruction is strictly faster
19499 // than any alternative. It also allows us to fold memory operands into the
19500 // shuffle in many cases.
19501 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19502 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19503 return ZExt;
19504
19505 // Use dedicated unpack instructions for masks that match their pattern.
19506 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19507 return V;
19508
19509 // Use dedicated pack instructions for masks that match their pattern.
19510 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19511 Subtarget))
19512 return V;
19513
19514 // Try to use shift instructions.
19515 if (SDValue Shift =
19516 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19517 DAG, /*BitwiseOnly*/ false))
19518 return Shift;
19519
19520 // Try to use byte rotation instructions.
19521 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19522 Subtarget, DAG))
19523 return Rotate;
19524
19525 // Try to use bit rotation instructions.
19526 if (V2.isUndef())
19527 if (SDValue Rotate =
19528 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19529 return Rotate;
19530
19531 // Lower as AND if possible.
19532 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19533 Zeroable, Subtarget, DAG))
19534 return Masked;
19535
19536 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19537 Zeroable, Subtarget, DAG))
19538 return PSHUFB;
19539
19540 // Try to create an in-lane repeating shuffle mask and then shuffle the
19541 // results into the target lanes.
19542 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19543 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19544 return V;
19545
19546 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19547 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19548 return Result;
19549
19550 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19551 Zeroable, Subtarget, DAG))
19552 return Blend;
19553
19554 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19555 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19556 // PALIGNR will be cheaper than the second PSHUFB+OR.
19557 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19558 Mask, Subtarget, DAG))
19559 return V;
19560
19561 // If we can't directly blend but can use PSHUFB, that will be better as it
19562 // can both shuffle and set up the inefficient blend.
19563 bool V1InUse, V2InUse;
19564 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19565 DAG, V1InUse, V2InUse);
19566 }
19567
19568 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19569 // shuffle.
19570 if (!V2.isUndef())
19571 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19572 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19573 return Result;
19574
19575 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19576 if (Subtarget.hasVBMI())
19577 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19578
19579 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19580}
19581
19582/// High-level routine to lower various 512-bit x86 vector shuffles.
19583///
19584/// This routine either breaks down the specific type of a 512-bit x86 vector
19585/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19586/// together based on the available instructions.
19587static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19588 MVT VT, SDValue V1, SDValue V2,
19589 const APInt &Zeroable,
19590 const X86Subtarget &Subtarget,
19591 SelectionDAG &DAG) {
19592  assert(Subtarget.hasAVX512() &&
19593         "Cannot lower 512-bit vectors w/ basic ISA!");
19594
19595 // If we have a single input to the zero element, insert that into V1 if we
19596 // can do so cheaply.
19597 int NumElts = Mask.size();
19598 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19599
19600 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19601 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19602 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19603 return Insertion;
19604
19605 // Handle special cases where the lower or upper half is UNDEF.
19606 if (SDValue V =
19607 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19608 return V;
19609
19610 // Check for being able to broadcast a single element.
19611 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19612 Subtarget, DAG))
19613 return Broadcast;
19614
19615 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19616 // Try using bit ops for masking and blending before falling back to
19617 // splitting.
19618 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19619 Subtarget, DAG))
19620 return V;
19621 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19622 return V;
19623
19624 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19625 }
19626
19627 if (VT == MVT::v32f16) {
19628 V1 = DAG.getBitcast(MVT::v32i16, V1);
19629 V2 = DAG.getBitcast(MVT::v32i16, V2);
19630 return DAG.getBitcast(MVT::v32f16,
19631 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19632 }
19633
19634 // Dispatch to each element type for lowering. If we don't have support for
19635 // specific element type shuffles at 512 bits, immediately split them and
19636 // lower them. Each lowering routine of a given type is allowed to assume that
19637 // the requisite ISA extensions for that element type are available.
19638 switch (VT.SimpleTy) {
19639 case MVT::v8f64:
19640 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19641 case MVT::v16f32:
19642 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19643 case MVT::v8i64:
19644 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19645 case MVT::v16i32:
19646 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19647 case MVT::v32i16:
19648 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19649 case MVT::v64i8:
19650 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19651
19652 default:
19653    llvm_unreachable("Not a valid 512-bit x86 vector type!");
19654 }
19655}
19656
19657static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19658 MVT VT, SDValue V1, SDValue V2,
19659 const X86Subtarget &Subtarget,
19660 SelectionDAG &DAG) {
19661 // Shuffle should be unary.
19662 if (!V2.isUndef())
19663 return SDValue();
19664
19665 int ShiftAmt = -1;
19666 int NumElts = Mask.size();
19667 for (int i = 0; i != NumElts; ++i) {
19668 int M = Mask[i];
19669    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19670           "Unexpected mask index.");
19671 if (M < 0)
19672 continue;
19673
19674 // The first non-undef element determines our shift amount.
19675 if (ShiftAmt < 0) {
19676 ShiftAmt = M - i;
19677 // Need to be shifting right.
19678 if (ShiftAmt <= 0)
19679 return SDValue();
19680 }
19681 // All non-undef elements must shift by the same amount.
19682 if (ShiftAmt != M - i)
19683 return SDValue();
19684 }
19685  assert(ShiftAmt >= 0 && "All undef?");
19686
19687  // Great, we found a shift right.
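  // For example, a v8i1 mask {2, 3, 4, 5, 6, 7, -1, -1} gives ShiftAmt == 2;
  // without DQI the mask is widened to v16i1, shifted right by 2, and the low
  // v8i1 is extracted again.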
19688 MVT WideVT = VT;
19689 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19690 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19691 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19692 DAG.getUNDEF(WideVT), V1,
19693 DAG.getIntPtrConstant(0, DL));
19694 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19695 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19696 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19697 DAG.getIntPtrConstant(0, DL));
19698}
19699
19700// Determine if this shuffle can be implemented with a KSHIFT instruction.
19701// Returns the shift amount if possible or -1 if not. This is a simplified
19702// version of matchShuffleAsShift.
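// For example, with MaskOffset 0 and Size 8, a mask {-1, -1, 0, 1, 2, 3, 4, 5}
// whose first two elements are zeroable matches KSHIFTL with shift amount 2,
// while {2, 3, 4, 5, 6, 7, -1, -1} with the top two elements zeroable matches
// KSHIFTR with shift amount 2.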
19703static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19704 int MaskOffset, const APInt &Zeroable) {
19705 int Size = Mask.size();
19706
19707 auto CheckZeros = [&](int Shift, bool Left) {
19708 for (int j = 0; j < Shift; ++j)
19709 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19710 return false;
19711
19712 return true;
19713 };
19714
19715 auto MatchShift = [&](int Shift, bool Left) {
19716 unsigned Pos = Left ? Shift : 0;
19717 unsigned Low = Left ? 0 : Shift;
19718 unsigned Len = Size - Shift;
19719 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19720 };
19721
19722 for (int Shift = 1; Shift != Size; ++Shift)
19723 for (bool Left : {true, false})
19724 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19725 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19726 return Shift;
19727 }
19728
19729 return -1;
19730}
19731
19732
19733// Lower vXi1 vector shuffles.
19734// There is no dedicated instruction on AVX-512 that shuffles the masks.
19735// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19736// vector, shuffle it, and then truncate it back to a mask.
19737static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19738 MVT VT, SDValue V1, SDValue V2,
19739 const APInt &Zeroable,
19740 const X86Subtarget &Subtarget,
19741 SelectionDAG &DAG) {
19742  assert(Subtarget.hasAVX512() &&
19743         "Cannot lower 512-bit vectors w/o basic ISA!");
19744
19745 int NumElts = Mask.size();
19746
19747 // Try to recognize shuffles that are just padding a subvector with zeros.
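  // For example, shuffling a v8i1 V1 with an all-zero V2 using the mask
  // {0, 1, 2, 3, 8, 8, 8, 8} keeps V1's low four elements and zeroes the rest,
  // so we can extract the low v4i1 of V1 and insert it into a zero vector.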
19748 int SubvecElts = 0;
19749 int Src = -1;
19750 for (int i = 0; i != NumElts; ++i) {
19751 if (Mask[i] >= 0) {
19752      // Grab the source from the first valid mask element. All subsequent
19753      // elements need to use this same source.
19754 if (Src < 0)
19755 Src = Mask[i] / NumElts;
19756 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19757 break;
19758 }
19759
19760 ++SubvecElts;
19761 }
19762  assert(SubvecElts != NumElts && "Identity shuffle?");
19763
19764  // Clip to a power of 2.
19765 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19766
19767  // Make sure the number of zeroable elements at the top at least covers the
19768  // elements not covered by the subvector.
19769 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19770    assert(Src >= 0 && "Expected a source!");
19771 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19772 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19773 Src == 0 ? V1 : V2,
19774 DAG.getIntPtrConstant(0, DL));
19775 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19776 DAG.getConstant(0, DL, VT),
19777 Extract, DAG.getIntPtrConstant(0, DL));
19778 }
19779
19780 // Try a simple shift right with undef elements. Later we'll try with zeros.
19781 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19782 DAG))
19783 return Shift;
19784
19785 // Try to match KSHIFTs.
19786 unsigned Offset = 0;
19787 for (SDValue V : { V1, V2 }) {
19788 unsigned Opcode;
19789 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19790 if (ShiftAmt >= 0) {
19791 MVT WideVT = VT;
19792 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19793 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19794 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19795 DAG.getUNDEF(WideVT), V,
19796 DAG.getIntPtrConstant(0, DL));
19797 // Widened right shifts need two shifts to ensure we shift in zeroes.
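      // For example, a v8i1 shift right by 2 performed in a v16i1 register
      // first does KSHIFTL by 16 - 8 = 8 to park the valid bits at the top,
      // then KSHIFTR by 2 + 8 = 10, so zeroes rather than stale upper bits
      // land in the low lanes.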
19798 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19799 int WideElts = WideVT.getVectorNumElements();
19800 // Shift left to put the original vector in the MSBs of the new size.
19801 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19802 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19803 // Increase the shift amount to account for the left shift.
19804 ShiftAmt += WideElts - NumElts;
19805 }
19806
19807 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19808 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19810 DAG.getIntPtrConstant(0, DL));
19811 }
19812 Offset += NumElts; // Increment for next iteration.
19813 }
19814
19815 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19816 // TODO: What other unary shuffles would benefit from this?
19817 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19818 V1->hasOneUse()) {
19819 SDValue Op0 = V1.getOperand(0);
19820 SDValue Op1 = V1.getOperand(1);
19821 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19822 EVT OpVT = Op0.getValueType();
19823 return DAG.getSetCC(
19824 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19825 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19826 }
19827
19828 MVT ExtVT;
19829 switch (VT.SimpleTy) {
19830 default:
19831    llvm_unreachable("Expected a vector of i1 elements");
19832 case MVT::v2i1:
19833 ExtVT = MVT::v2i64;
19834 break;
19835 case MVT::v4i1:
19836 ExtVT = MVT::v4i32;
19837 break;
19838 case MVT::v8i1:
19839    // Take a 512-bit type; there are more shuffle options on KNL. If we have
19840    // VLX, use a 256-bit shuffle.
19841 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19842 break;
19843 case MVT::v16i1:
19844 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19845 // 256-bit operation available.
19846 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19847 break;
19848 case MVT::v32i1:
19849 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19850 // 256-bit operation available.
19851    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19852 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19853 break;
19854 case MVT::v64i1:
19855 // Fall back to scalarization. FIXME: We can do better if the shuffle
19856 // can be partitioned cleanly.
19857 if (!Subtarget.useBWIRegs())
19858 return SDValue();
19859 ExtVT = MVT::v64i8;
19860 break;
19861 }
19862
19863 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19864 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19865
19866 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19867  // i1 was sign-extended, so we can recover the mask by comparing against zero.
19868 int NumElems = VT.getVectorNumElements();
19869 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19870 (Subtarget.hasDQI() && (NumElems < 32)))
19871 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19872 Shuffle, ISD::SETGT);
19873
19874 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19875}
19876
19877/// Helper function that returns true if the shuffle mask should be
19878/// commuted to improve canonicalization.
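/// For example, the v4 mask {4, 5, 0, -1} takes two elements from V2 and only
/// one from V1, so commuting it (to {0, 1, 4, -1} with the operands swapped)
/// lets the lowering assume most elements come from the first operand.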
19879static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19880 int NumElements = Mask.size();
19881
19882 int NumV1Elements = 0, NumV2Elements = 0;
19883 for (int M : Mask)
19884 if (M < 0)
19885 continue;
19886 else if (M < NumElements)
19887 ++NumV1Elements;
19888 else
19889 ++NumV2Elements;
19890
19891 // Commute the shuffle as needed such that more elements come from V1 than
19892 // V2. This allows us to match the shuffle pattern strictly on how many
19893 // elements come from V1 without handling the symmetric cases.
19894 if (NumV2Elements > NumV1Elements)
19895 return true;
19896
19897  assert(NumV1Elements > 0 && "No V1 indices");
19898
19899 if (NumV2Elements == 0)
19900 return false;
19901
19902 // When the number of V1 and V2 elements are the same, try to minimize the
19903 // number of uses of V2 in the low half of the vector. When that is tied,
19904  // ensure that the sum of indices for V1 is equal to or lower than the sum of
19905 // indices for V2. When those are equal, try to ensure that the number of odd
19906 // indices for V1 is lower than the number of odd indices for V2.
19907 if (NumV1Elements == NumV2Elements) {
19908 int LowV1Elements = 0, LowV2Elements = 0;
19909 for (int M : Mask.slice(0, NumElements / 2))
19910 if (M >= NumElements)
19911 ++LowV2Elements;
19912 else if (M >= 0)
19913 ++LowV1Elements;
19914 if (LowV2Elements > LowV1Elements)
19915 return true;
19916 if (LowV2Elements == LowV1Elements) {
19917 int SumV1Indices = 0, SumV2Indices = 0;
19918 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19919 if (Mask[i] >= NumElements)
19920 SumV2Indices += i;
19921 else if (Mask[i] >= 0)
19922 SumV1Indices += i;
19923 if (SumV2Indices < SumV1Indices)
19924 return true;
19925 if (SumV2Indices == SumV1Indices) {
19926 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19927 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19928 if (Mask[i] >= NumElements)
19929 NumV2OddIndices += i % 2;
19930 else if (Mask[i] >= 0)
19931 NumV1OddIndices += i % 2;
19932 if (NumV2OddIndices < NumV1OddIndices)
19933 return true;
19934 }
19935 }
19936 }
19937
19938 return false;
19939}
19940
19941static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19942 const X86Subtarget &Subtarget) {
19943 if (!Subtarget.hasAVX512())
19944 return false;
19945
19946 MVT VT = V1.getSimpleValueType().getScalarType();
19947 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19948 return false;
19949
19950  // If the vector width is < 512, widen i8/i16 even with BWI, as
19951  // blendd/blendps/blendpd are preferable to blendw/blendvb/masked-mov.
19952 if ((VT == MVT::i16 || VT == MVT::i8) &&
19953 V1.getSimpleValueType().getSizeInBits() < 512)
19954 return false;
19955
19956 auto HasMaskOperation = [&](SDValue V) {
19957    // TODO: Currently we only check a limited set of opcodes. We could
19958    // probably extend this to all binary operations by checking TLI.isBinOp().
19959 switch (V->getOpcode()) {
19960 default:
19961 return false;
19962 case ISD::ADD:
19963 case ISD::SUB:
19964 case ISD::AND:
19965 case ISD::XOR:
19966 case ISD::OR:
19967 case ISD::SMAX:
19968 case ISD::SMIN:
19969 case ISD::UMAX:
19970 case ISD::UMIN:
19971 case ISD::ABS:
19972 case ISD::SHL:
19973 case ISD::SRL:
19974 case ISD::SRA:
19975 case ISD::MUL:
19976 break;
19977 }
19978 if (!V->hasOneUse())
19979 return false;
19980
19981 return true;
19982 };
19983
19984 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19985 return true;
19986
19987 return false;
19988}
19989
19990// Forward declaration.
19991static SDValue canonicalizeShuffleMaskWithHorizOp(
19992 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19993 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19994 const X86Subtarget &Subtarget);
19995
19996 /// Top-level lowering for x86 vector shuffles.
19997///
19998/// This handles decomposition, canonicalization, and lowering of all x86
19999/// vector shuffles. Most of the specific lowering strategies are encapsulated
20000/// above in helper routines. The canonicalization attempts to widen shuffles
20001/// to involve fewer lanes of wider elements, consolidate symmetric patterns
20002/// s.t. only one of the two inputs needs to be tested, etc.
20003static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
20004 SelectionDAG &DAG) {
20005 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
20006 ArrayRef<int> OrigMask = SVOp->getMask();
20007 SDValue V1 = Op.getOperand(0);
20008 SDValue V2 = Op.getOperand(1);
20009 MVT VT = Op.getSimpleValueType();
20010 int NumElements = VT.getVectorNumElements();
20011 SDLoc DL(Op);
20012 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
20013
20014  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
20015         "Can't lower MMX shuffles");
20016
20017 bool V1IsUndef = V1.isUndef();
20018 bool V2IsUndef = V2.isUndef();
20019 if (V1IsUndef && V2IsUndef)
20020 return DAG.getUNDEF(VT);
20021
20022  // When we create a shuffle node we put the UNDEF node in the second operand,
20023  // but in some cases the first operand may be transformed to UNDEF.
20024  // In that case we should just commute the node.
20025 if (V1IsUndef)
20026 return DAG.getCommutedVectorShuffle(*SVOp);
20027
20028 // Check for non-undef masks pointing at an undef vector and make the masks
20029 // undef as well. This makes it easier to match the shuffle based solely on
20030 // the mask.
20031 if (V2IsUndef &&
20032 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20033 SmallVector<int, 8> NewMask(OrigMask);
20034 for (int &M : NewMask)
20035 if (M >= NumElements)
20036 M = -1;
20037 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20038 }
20039
20040 // Check for illegal shuffle mask element index values.
20041 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20042 (void)MaskUpperLimit;
20043  assert(llvm::all_of(OrigMask,
20044                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20045         "Out of bounds shuffle index");
20046
20047 // We actually see shuffles that are entirely re-arrangements of a set of
20048 // zero inputs. This mostly happens while decomposing complex shuffles into
20049 // simple ones. Directly lower these as a buildvector of zeros.
20050 APInt KnownUndef, KnownZero;
20051 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20052
20053 APInt Zeroable = KnownUndef | KnownZero;
20054 if (Zeroable.isAllOnes())
20055 return getZeroVector(VT, Subtarget, DAG, DL);
20056
20057 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20058
20059 // Try to collapse shuffles into using a vector type with fewer elements but
20060 // wider element types. We cap this to not form integers or floating point
20061 // elements wider than 64 bits. It does not seem beneficial to form i128
20062 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
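  // For example, a v8i32 shuffle with mask {0, 1, 2, 3, 8, 9, 10, 11} widens
  // to a v4i64 shuffle with mask {0, 1, 4, 5}, which the 64-bit element
  // lowering can often match with a single subvector insert or permute.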
20063 SmallVector<int, 16> WidenedMask;
20064 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20065 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20066 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20067 // Shuffle mask widening should not interfere with a broadcast opportunity
20068 // by obfuscating the operands with bitcasts.
20069 // TODO: Avoid lowering directly from this top-level function: make this
20070 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20071 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20072 Subtarget, DAG))
20073 return Broadcast;
20074
20075 MVT NewEltVT = VT.isFloatingPoint()
20076 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20077 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20078 int NewNumElts = NumElements / 2;
20079 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20080 // Make sure that the new vector type is legal. For example, v2f64 isn't
20081 // legal on SSE1.
20082 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20083 if (V2IsZero) {
20084 // Modify the new Mask to take all zeros from the all-zero vector.
20085 // Choose indices that are blend-friendly.
20086 bool UsedZeroVector = false;
20087        assert(is_contained(WidenedMask, SM_SentinelZero) &&
20088               "V2's non-undef elements are used?!");
20089 for (int i = 0; i != NewNumElts; ++i)
20090 if (WidenedMask[i] == SM_SentinelZero) {
20091 WidenedMask[i] = i + NewNumElts;
20092 UsedZeroVector = true;
20093 }
20094 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20095 // some elements to be undef.
20096 if (UsedZeroVector)
20097 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20098 }
20099 V1 = DAG.getBitcast(NewVT, V1);
20100 V2 = DAG.getBitcast(NewVT, V2);
20101 return DAG.getBitcast(
20102 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20103 }
20104 }
20105
20106 SmallVector<SDValue> Ops = {V1, V2};
20107 SmallVector<int> Mask(OrigMask);
20108
20109 // Canonicalize the shuffle with any horizontal ops inputs.
20110 // NOTE: This may update Ops and Mask.
20111 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20112 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20113 return DAG.getBitcast(VT, HOp);
20114
20115 V1 = DAG.getBitcast(VT, Ops[0]);
20116 V2 = DAG.getBitcast(VT, Ops[1]);
20117  assert(NumElements == (int)Mask.size() &&
20118         "canonicalizeShuffleMaskWithHorizOp "
20119         "shouldn't alter the shuffle mask size");
20120
20121 // Commute the shuffle if it will improve canonicalization.
20122 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20123 ShuffleVectorSDNode::commuteMask(Mask);
20124 std::swap(V1, V2);
20125 }
20126
20127 // For each vector width, delegate to a specialized lowering routine.
20128 if (VT.is128BitVector())
20129 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20130
20131 if (VT.is256BitVector())
20132 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20133
20134 if (VT.is512BitVector())
20135 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20136
20137 if (Is1BitVector)
20138 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20139
20140  llvm_unreachable("Unimplemented!");
20141}
20142
20143/// Try to lower a VSELECT instruction to a vector shuffle.
20144static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20145 const X86Subtarget &Subtarget,
20146 SelectionDAG &DAG) {
20147 SDValue Cond = Op.getOperand(0);
20148 SDValue LHS = Op.getOperand(1);
20149 SDValue RHS = Op.getOperand(2);
20150 MVT VT = Op.getSimpleValueType();
20151
20152 // Only non-legal VSELECTs reach this lowering; convert those into generic
20153 // shuffles and reuse the shuffle lowering path for blends.
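// For example, a v4i32 vselect with the constant condition <-1,0,0,-1> becomes
// the shuffle mask {0,5,6,3}: true lanes read from LHS, false lanes from RHS.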
20154 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20155 SmallVector<int, 32> Mask;
20156 if (createShuffleMaskFromVSELECT(Mask, Cond))
20157 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20158 }
20159
20160 return SDValue();
20161}
20162
20163SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20164 SDValue Cond = Op.getOperand(0);
20165 SDValue LHS = Op.getOperand(1);
20166 SDValue RHS = Op.getOperand(2);
20167
20168 SDLoc dl(Op);
20169 MVT VT = Op.getSimpleValueType();
20170 if (isSoftFP16(VT)) {
20171 MVT NVT = VT.changeVectorElementTypeToInteger();
20172 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20173 DAG.getBitcast(NVT, LHS),
20174 DAG.getBitcast(NVT, RHS)));
20175 }
20176
20177 // A vselect where all conditions and data are constants can be optimized into
20178 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20179 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20180 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20181 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20182 return SDValue();
20183
20184 // Try to lower this to a blend-style vector shuffle. This can handle all
20185 // constant condition cases.
20186 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20187 return BlendOp;
20188
20189 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20190 // with patterns on the mask registers on AVX-512.
20191 MVT CondVT = Cond.getSimpleValueType();
20192 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20193 if (CondEltSize == 1)
20194 return Op;
20195
20196 // Variable blends are only legal from SSE4.1 onward.
20197 if (!Subtarget.hasSSE41())
20198 return SDValue();
20199
20200 unsigned EltSize = VT.getScalarSizeInBits();
20201 unsigned NumElts = VT.getVectorNumElements();
20202
20203 // Expand v32i16/v64i8 without BWI.
20204 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20205 return SDValue();
20206
20207 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20208 // into an i1 condition so that we can use the mask-based 512-bit blend
20209 // instructions.
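// For example, a v16i32 condition is turned into a v16i1 mask by the
// SETNE-against-zero below, and the select is re-issued with that i1 mask.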
20210 if (VT.getSizeInBits() == 512) {
20211 // Build a mask by testing the condition against zero.
20212 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20213 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20214 DAG.getConstant(0, dl, CondVT),
20215 ISD::SETNE);
20216 // Now return a new VSELECT using the mask.
20217 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20218 }
20219
20220 // SEXT/TRUNC cases where the mask doesn't match the destination size.
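// For example, a v4i32 condition whose elements are known sign splats is
// sign-extended to v4i64 below to drive a select with 64-bit data elements.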
20221 if (CondEltSize != EltSize) {
20222 // If we don't have a sign splat, rely on the expansion.
20223 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20224 return SDValue();
20225
20226 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20227 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20228 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20229 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20230 }
20231
20232 // Only some types will be legal on some subtargets. If we can emit a legal
20233 // VSELECT-matching blend, return Op, but if we need to expand, return
20234 // a null value.
20235 switch (VT.SimpleTy) {
20236 default:
20237 // Most of the vector types have blends past SSE4.1.
20238 return Op;
20239
20240 case MVT::v32i8:
20241 // The byte blends for AVX vectors were introduced only in AVX2.
20242 if (Subtarget.hasAVX2())
20243 return Op;
20244
20245 return SDValue();
20246
20247 case MVT::v8i16:
20248 case MVT::v16i16: {
20249 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
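// The condition is assumed to be all-zeros or all-ones per element (vector
// boolean contents), so after the bitcast both bytes of each i16 lane carry
// the same mask value and the vXi8 select still picks whole 16-bit elements.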
20250 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20251 Cond = DAG.getBitcast(CastVT, Cond);
20252 LHS = DAG.getBitcast(CastVT, LHS);
20253 RHS = DAG.getBitcast(CastVT, RHS);
20254 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20255 return DAG.getBitcast(VT, Select);
20256 }
20257 }
20258}
20259
20260static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20261 MVT VT = Op.getSimpleValueType();
20262 SDValue Vec = Op.getOperand(0);
20263 SDValue Idx = Op.getOperand(1);
20264 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20265 SDLoc dl(Op);
20266
20267 if (!Vec.getSimpleValueType().is128BitVector())
20268 return SDValue();
20269
20270 if (VT.getSizeInBits() == 8) {
20271 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20272 // we're going to zero extend the register or fold the store.
20273 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20274 !X86::mayFoldIntoStore(Op))
20275 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20276 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20277 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20278
20279 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20280 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20281 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20282 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20283 }
20284
20285 if (VT == MVT::f32) {
20286 // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
20287 // the result back to an FR32 register. It's only worth matching if the
20288 // result has a single use which is a store or a bitcast to i32. And in
20289 // the case of a store, it's not worth it if the index is a constant 0,
20290 // because a MOVSSmr can be used instead, which is smaller and faster.
20291 if (!Op.hasOneUse())
20292 return SDValue();
20293 SDNode *User = *Op.getNode()->use_begin();
20294 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20295 (User->getOpcode() != ISD::BITCAST ||
20296 User->getValueType(0) != MVT::i32))
20297 return SDValue();
20298 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20299 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20300 return DAG.getBitcast(MVT::f32, Extract);
20301 }
20302
20303 if (VT == MVT::i32 || VT == MVT::i64)
20304 return Op;
20305
20306 return SDValue();
20307}
20308
20309/// Extract one bit from mask vector, like v16i1 or v8i1.
20310/// AVX-512 feature.
20311static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20312 const X86Subtarget &Subtarget) {
20313 SDValue Vec = Op.getOperand(0);
20314 SDLoc dl(Vec);
20315 MVT VecVT = Vec.getSimpleValueType();
20316 SDValue Idx = Op.getOperand(1);
20317 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20318 MVT EltVT = Op.getSimpleValueType();
20319
20320 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20321 "Unexpected vector type in ExtractBitFromMaskVector");
20322
20323 // A variable index can't be handled in mask registers,
20324 // so extend the vector to VR512/128.
20325 if (!IdxC) {
20326 unsigned NumElts = VecVT.getVectorNumElements();
20327 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20328 // than extending to 128/256-bit.
20329 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20330 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20331 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20332 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20333 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20334 }
20335
20336 unsigned IdxVal = IdxC->getZExtValue();
20337 if (IdxVal == 0) // the operation is legal
20338 return Op;
20339
20340 // Extend to natively supported kshift.
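// For example, extracting bit 3 of a v8i1 mask without DQI widens the mask to
// v16i1, shifts it right by 3 with KSHIFTR, and then extracts element 0.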
20341 unsigned NumElems = VecVT.getVectorNumElements();
20342 MVT WideVecVT = VecVT;
20343 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20344 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20345 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20346 DAG.getUNDEF(WideVecVT), Vec,
20347 DAG.getIntPtrConstant(0, dl));
20348 }
20349
20350 // Use kshiftr instruction to move to the lower element.
20351 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20352 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20353
20354 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20355 DAG.getIntPtrConstant(0, dl));
20356}
20357
20358SDValue
20359X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20360 SelectionDAG &DAG) const {
20361 SDLoc dl(Op);
20362 SDValue Vec = Op.getOperand(0);
20363 MVT VecVT = Vec.getSimpleValueType();
20364 SDValue Idx = Op.getOperand(1);
20365 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20366
20367 if (VecVT.getVectorElementType() == MVT::i1)
20368 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20369
20370 if (!IdxC) {
20371 // It's more profitable to go through memory (1 cycle throughput)
20372 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20373 // The IACA tool was used to get the performance estimates
20374 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20375 //
20376 // example : extractelement <16 x i8> %a, i32 %i
20377 //
20378 // Block Throughput: 3.00 Cycles
20379 // Throughput Bottleneck: Port5
20380 //
20381 // | Num Of | Ports pressure in cycles | |
20382 // | Uops | 0 - DV | 5 | 6 | 7 | |
20383 // ---------------------------------------------
20384 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20385 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20386 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20387 // Total Num Of Uops: 4
20388 //
20389 //
20390 // Block Throughput: 1.00 Cycles
20391 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20392 //
20393 // | | Ports pressure in cycles | |
20394 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20395 // ---------------------------------------------------------
20396 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20397 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20398 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20399 // Total Num Of Uops: 4
20400
20401 return SDValue();
20402 }
20403
20404 unsigned IdxVal = IdxC->getZExtValue();
20405
20406 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
20407 // subvector and then extract the element from that 128-bit vector.
20408 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20409 // Get the 128-bit vector.
20410 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20411 MVT EltVT = VecVT.getVectorElementType();
20412
20413 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20414 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20415
20416 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20417 // this can be done with a mask.
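// For example, for v8i32 (ElemsPerChunk == 4), element 6 lives in the upper
// 128-bit chunk extracted above and is re-indexed as 6 & 3 == 2 within it.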
20418 IdxVal &= ElemsPerChunk - 1;
20419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20420 DAG.getIntPtrConstant(IdxVal, dl));
20421 }
20422
20423 assert(VecVT.is128BitVector() && "Unexpected vector length");
20424
20425 MVT VT = Op.getSimpleValueType();
20426
20427 if (VT == MVT::i16) {
20428 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20429 // we're going to zero extend the register or fold the store (SSE41 only).
20430 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20431 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20432 if (Subtarget.hasFP16())
20433 return Op;
20434
20435 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20436 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20437 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20438 }
20439
20440 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20441 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20442 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20443 }
20444
20445 if (Subtarget.hasSSE41())
20446 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20447 return Res;
20448
20449 // TODO: We only extract a single element from v16i8; we can probably afford
20450 // to be more aggressive here before using the default approach of spilling to
20451 // the stack.
20452 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20453 // Extract either the lowest i32 or any i16, and extract the sub-byte.
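// For example, byte 5 is read by extracting word 2 of the v8i16 bitcast,
// shifting right by (5 % 2) * 8 == 8 bits, and truncating to i8.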
20454 int DWordIdx = IdxVal / 4;
20455 if (DWordIdx == 0) {
20456 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20457 DAG.getBitcast(MVT::v4i32, Vec),
20458 DAG.getIntPtrConstant(DWordIdx, dl));
20459 int ShiftVal = (IdxVal % 4) * 8;
20460 if (ShiftVal != 0)
20461 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20462 DAG.getConstant(ShiftVal, dl, MVT::i8));
20463 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20464 }
20465
20466 int WordIdx = IdxVal / 2;
20467 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20468 DAG.getBitcast(MVT::v8i16, Vec),
20469 DAG.getIntPtrConstant(WordIdx, dl));
20470 int ShiftVal = (IdxVal % 2) * 8;
20471 if (ShiftVal != 0)
20472 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20473 DAG.getConstant(ShiftVal, dl, MVT::i8));
20474 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20475 }
20476
20477 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20478 if (IdxVal == 0)
20479 return Op;
20480
20481 // Shuffle the element to the lowest element, then movss or movsh.
20482 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20483 Mask[0] = static_cast<int>(IdxVal);
20484 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20485 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20486 DAG.getIntPtrConstant(0, dl));
20487 }
20488
20489 if (VT.getSizeInBits() == 64) {
20490 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20491 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20492 // to match extract_elt for f64.
20493 if (IdxVal == 0)
20494 return Op;
20495
20496 // UNPCKHPD the element to the lowest double word, then movsd.
20497 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20498 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20499 int Mask[2] = { 1, -1 };
20500 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20501 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20502 DAG.getIntPtrConstant(0, dl));
20503 }
20504
20505 return SDValue();
20506}
20507
20508/// Insert one bit to mask vector, like v16i1 or v8i1.
20509/// AVX-512 feature.
20510static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20511 const X86Subtarget &Subtarget) {
20512 SDLoc dl(Op);
20513 SDValue Vec = Op.getOperand(0);
20514 SDValue Elt = Op.getOperand(1);
20515 SDValue Idx = Op.getOperand(2);
20516 MVT VecVT = Vec.getSimpleValueType();
20517
20518 if (!isa<ConstantSDNode>(Idx)) {
20519 // Non-constant index: extend the source and destination,
20520 // insert the element, and then truncate the result.
20521 unsigned NumElts = VecVT.getVectorNumElements();
20522 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20523 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20524 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20525 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20526 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20527 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20528 }
20529
20530 // Copy into a k-register, extract to v1i1 and insert_subvector.
20531 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20532 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20533}
20534
20535SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20536 SelectionDAG &DAG) const {
20537 MVT VT = Op.getSimpleValueType();
20538 MVT EltVT = VT.getVectorElementType();
20539 unsigned NumElts = VT.getVectorNumElements();
20540 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20541
20542 if (EltVT == MVT::i1)
20543 return InsertBitToMaskVector(Op, DAG, Subtarget);
20544
20545 SDLoc dl(Op);
20546 SDValue N0 = Op.getOperand(0);
20547 SDValue N1 = Op.getOperand(1);
20548 SDValue N2 = Op.getOperand(2);
20549 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20550
20551 if (!N2C) {
20552 // With variable insertion indices we're usually better off spilling to the
20553 // stack, but AVX512 can use a variable compare+select by comparing against
20554 // all possible vector indices, and FP insertion has less gpr->simd traffic.
20555 if (!(Subtarget.hasBWI() ||
20556 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20557 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20558 return SDValue();
20559
20560 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20561 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20562 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20563 return SDValue();
20564
20565 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20566 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20567 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20568
20569 SmallVector<SDValue, 16> RawIndices;
20570 for (unsigned I = 0; I != NumElts; ++I)
20571 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20572 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20573
20574 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
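// For example, for v4i32 with N2 == 2 the splatted index {2,2,2,2} is compared
// against {0,1,2,3}; only lane 2 matches and takes the splatted element, while
// the remaining lanes keep the corresponding elements of N0.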
20575 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20576 ISD::CondCode::SETEQ);
20577 }
20578
20579 if (N2C->getAPIntValue().uge(NumElts))
20580 return SDValue();
20581 uint64_t IdxVal = N2C->getZExtValue();
20582
20583 bool IsZeroElt = X86::isZeroNode(N1);
20584 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20585
20586 if (IsZeroElt || IsAllOnesElt) {
20587 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20588 // We don't deal with i8 0 since it appears to be handled elsewhere.
20589 if (IsAllOnesElt &&
20590 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20591 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20592 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20593 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20594 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20595 CstVectorElts[IdxVal] = OnesCst;
20596 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20597 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20598 }
20599 // See if we can do this more efficiently with a blend shuffle with a
20600 // rematerializable vector.
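// For example, inserting zero into lane 1 of a v8i16 builds the blend mask
// {0,9,2,3,4,5,6,7}, taking lane 1 from the all-zeros vector and every other
// lane from N0.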
20601 if (Subtarget.hasSSE41() &&
20602 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20603 SmallVector<int, 8> BlendMask;
20604 for (unsigned i = 0; i != NumElts; ++i)
20605 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20606 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20607 : getOnesVector(VT, DAG, dl);
20608 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20609 }
20610 }
20611
20612 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20613 // into that, and then insert the subvector back into the result.
20614 if (VT.is256BitVector() || VT.is512BitVector()) {
20615 // With a 256-bit vector, we can insert into the zero element efficiently
20616 // using a blend if we have AVX or AVX2 and the right data type.
20617 if (VT.is256BitVector() && IdxVal == 0) {
20618 // TODO: It is worthwhile to cast integer to floating point and back
20619 // and incur a domain crossing penalty if that's what we'll end up
20620 // doing anyway after extracting to a 128-bit vector.
20621 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20622 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20623 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20624 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20625 DAG.getTargetConstant(1, dl, MVT::i8));
20626 }
20627 }
20628
20629 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20630 assert(isPowerOf2_32(NumEltsIn128) &&
20631 "Vectors will always have power-of-two number of elements.");
20632
20633 // If we are not inserting into the low 128-bit vector chunk,
20634 // then prefer the broadcast+blend sequence.
20635 // FIXME: relax the profitability check iff all N1 uses are insertions.
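// For example, inserting into lane 5 of a v8i32 splats N1 across the vector
// and blends with mask {0,1,2,3,4,13,6,7}, avoiding an extract/insert of the
// upper 128-bit chunk.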
20636 if (IdxVal >= NumEltsIn128 &&
20637 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20638 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20639 X86::mayFoldLoad(N1, Subtarget)))) {
20640 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20641 SmallVector<int, 8> BlendMask;
20642 for (unsigned i = 0; i != NumElts; ++i)
20643 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20644 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20645 }
20646
20647 // Get the desired 128-bit vector chunk.
20648 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20649
20650 // Insert the element into the desired chunk.
20651 // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
20652 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20653
20654 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20655 DAG.getIntPtrConstant(IdxIn128, dl));
20656
20657 // Insert the changed part back into the bigger vector
20658 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20659 }
20660 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20661
20662 // This will be just movw/movd/movq/movsh/movss/movsd.
20663 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20664 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20665 EltVT == MVT::f16 || EltVT == MVT::i64) {
20666 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20667 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20668 }
20669
20670 // We can't directly insert an i8 or i16 into a vector, so zero extend
20671 // it to i32 first.
20672 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20673 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20674 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20675 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20676 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20677 return DAG.getBitcast(VT, N1);
20678 }
20679 }
20680
20681 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20682 // argument. SSE41 is required for pinsrb.
20683 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20684 unsigned Opc;
20685 if (VT == MVT::v8i16) {
20686 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20687 Opc = X86ISD::PINSRW;
20688 } else {
20689 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20690 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20691 Opc = X86ISD::PINSRB;
20692 }
20693
20694 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20695 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20696 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20697 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20698 }
20699
20700 if (Subtarget.hasSSE41()) {
20701 if (EltVT == MVT::f32) {
20702 // Bits [7:6] of the constant are the source select. This will always be
20703 // zero here. The DAG Combiner may combine an extract_elt index into
20704 // these bits. For example (insert (extract, 3), 2) could be matched by
20705 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20706 // Bits [5:4] of the constant are the destination select. This is the
20707 // value of the incoming immediate.
20708 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20709 // combine either bitwise AND or insert of float 0.0 to set these bits.
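// For example, inserting into destination lane 2 with source lane 0 and no
// zeroing uses the immediate (2 << 4) == 0x20, which is exactly what the
// IdxVal << 4 encoding further down produces.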
20710
20711 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20712 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20713 // If this is an insertion of 32-bits into the low 32-bits of
20714 // a vector, we prefer to generate a blend with immediate rather
20715 // than an insertps. Blends are simpler operations in hardware and so
20716 // will always have equal or better performance than insertps.
20717 // But if optimizing for size and there's a load folding opportunity,
20718 // generate insertps because blendps does not have a 32-bit memory
20719 // operand form.
20720 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20721 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20722 DAG.getTargetConstant(1, dl, MVT::i8));
20723 }
20724 // Create this as a scalar-to-vector.
20725 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20726 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20727 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20728 }
20729
20730 // PINSR* works with constant index.
20731 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20732 return Op;
20733 }
20734
20735 return SDValue();
20736}
20737
20738static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20739 SelectionDAG &DAG) {
20740 SDLoc dl(Op);
20741 MVT OpVT = Op.getSimpleValueType();
20742
20743 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
20744 // further combines.
20745 if (X86::isZeroNode(Op.getOperand(0)))
20746 return getZeroVector(OpVT, Subtarget, DAG, dl);
20747
20748 // If this is a wider than 128-bit vector result, first insert into a 128-bit
20749 // vector and then insert that into the full-width vector.
20750 if (!OpVT.is128BitVector()) {
20751 // Insert into a 128-bit vector.
20752 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20753 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20754 OpVT.getVectorNumElements() / SizeFactor);
20755
20756 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20757
20758 // Insert the 128-bit vector.
20759 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20760 }
20761 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20762 "Expected an SSE type!");
20763
20764 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20765 // tblgen.
20766 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20767 return Op;
20768
20769 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20770 return DAG.getBitcast(
20771 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20772}
20773
20774// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20775// simple superregister reference or explicit instructions to insert
20776// the upper bits of a vector.
20777static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20778 SelectionDAG &DAG) {
20779 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20780
20781 return insert1BitVector(Op, DAG, Subtarget);
20782}
20783
20784static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20785 SelectionDAG &DAG) {
20786 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20787 "Only vXi1 extract_subvectors need custom lowering");
20788
20789 SDLoc dl(Op);
20790 SDValue Vec = Op.getOperand(0);
20791 uint64_t IdxVal = Op.getConstantOperandVal(1);
20792
20793 if (IdxVal == 0) // the operation is legal
20794 return Op;
20795
20796 MVT VecVT = Vec.getSimpleValueType();
20797 unsigned NumElems = VecVT.getVectorNumElements();
20798
20799 // Extend to natively supported kshift.
20800 MVT WideVecVT = VecVT;
20801 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20802 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20803 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20804 DAG.getUNDEF(WideVecVT), Vec,
20805 DAG.getIntPtrConstant(0, dl));
20806 }
20807
20808 // Shift to the LSB.
20809 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20810 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20811
20812 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20813 DAG.getIntPtrConstant(0, dl));
20814}
20815
20816// Returns the appropriate wrapper opcode for a global reference.
20817unsigned X86TargetLowering::getGlobalWrapperKind(
20818 const GlobalValue *GV, const unsigned char OpFlags) const {
20819 // References to absolute symbols are never PC-relative.
20820 if (GV && GV->isAbsoluteSymbolRef())
20821 return X86ISD::Wrapper;
20822
20823 CodeModel::Model M = getTargetMachine().getCodeModel();
20824 if (Subtarget.isPICStyleRIPRel() &&
20825 (M == CodeModel::Small || M == CodeModel::Kernel))
20826 return X86ISD::WrapperRIP;
20827
20828 // In the medium model, functions can always be referenced RIP-relatively,
20829 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20830 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
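// For example, under the medium code model a reference to a Function takes the
// WrapperRIP path here, while a global-variable reference keeps falling through
// (and ends up as a plain Wrapper unless it needs a GOTPCREL reference).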
20831 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20832 return X86ISD::WrapperRIP;
20833
20834 // GOTPCREL references must always use RIP.
20835 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20836 return X86ISD::WrapperRIP;
20837
20838 return X86ISD::Wrapper;
20839}
20840
20841 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20842 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20843 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20844 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20845 // be used to form an addressing mode. These wrapped nodes will be selected
20846 // into MOV32ri.
20847SDValue
20848X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20849 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20850
20851 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20852 // global base reg.
20853 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20854
20855 auto PtrVT = getPointerTy(DAG.getDataLayout());
20856 SDValue Result = DAG.getTargetConstantPool(
20857 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20858 SDLoc DL(CP);
20859 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20860 // With PIC, the address is actually $g + Offset.
20861 if (OpFlag) {
20862 Result =
20863 DAG.getNode(ISD::ADD, DL, PtrVT,
20864 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20865 }
20866
20867 return Result;
20868}
20869
20870SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20871 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20872
20873 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20874 // global base reg.
20875 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20876
20877 auto PtrVT = getPointerTy(DAG.getDataLayout());
20878 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20879 SDLoc DL(JT);
20880 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20881
20882 // With PIC, the address is actually $g + Offset.
20883 if (OpFlag)
20884 Result =
20885 DAG.getNode(ISD::ADD, DL, PtrVT,
20886 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20887
20888 return Result;
20889}
20890
20891SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20892 SelectionDAG &DAG) const {
20893 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20894}
20895
20896SDValue
20897X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20898 // Create the TargetBlockAddressAddress node.
20899 unsigned char OpFlags =
20900 Subtarget.classifyBlockAddressReference();
20901 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20902 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20903 SDLoc dl(Op);
20904 auto PtrVT = getPointerTy(DAG.getDataLayout());
20905 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20906 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20907
20908 // With PIC, the address is actually $g + Offset.
20909 if (isGlobalRelativeToPICBase(OpFlags)) {
20910 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20911 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20912 }
20913
20914 return Result;
20915}
20916
20917/// Creates target global address or external symbol nodes for calls or
20918/// other uses.
20919SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20920 bool ForCall) const {
20921 // Unpack the global address or external symbol.
20922 const SDLoc &dl = SDLoc(Op);
20923 const GlobalValue *GV = nullptr;
20924 int64_t Offset = 0;
20925 const char *ExternalSym = nullptr;
20926 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20927 GV = G->getGlobal();
20928 Offset = G->getOffset();
20929 } else {
20930 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20931 ExternalSym = ES->getSymbol();
20932 }
20933
20934 // Calculate some flags for address lowering.
20935 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20936 unsigned char OpFlags;
20937 if (ForCall)
20938 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20939 else
20940 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20941 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20942 bool NeedsLoad = isGlobalStubReference(OpFlags);
20943
20944 CodeModel::Model M = DAG.getTarget().getCodeModel();
20945 auto PtrVT = getPointerTy(DAG.getDataLayout());
20946 SDValue Result;
20947
20948 if (GV) {
20949 // Create a target global address if this is a global. If possible, fold the
20950 // offset into the global address reference. Otherwise, ADD it on later.
20951 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20952 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20953 // relocation will compute to a negative value, which is invalid.
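// For example, a reference to foo+8 with no extra flags and a suitable code
// model folds the 8 into the TargetGlobalAddress (GlobalOffset becomes 8 and
// Offset becomes 0), whereas foo-1 keeps Offset for the explicit ADD below.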
20954 int64_t GlobalOffset = 0;
20955 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20956 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20957 std::swap(GlobalOffset, Offset);
20958 }
20959 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20960 } else {
20961 // If this is not a global address, this must be an external symbol.
20962 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20963 }
20964
20965 // If this is a direct call, avoid the wrapper if we don't need to do any
20966 // loads or adds. This allows SDAG ISel to match direct calls.
20967 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20968 return Result;
20969
20970 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20971
20972 // With PIC, the address is actually $g + Offset.
20973 if (HasPICReg) {
20974 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20975 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20976 }
20977
20978 // For globals that require a load from a stub to get the address, emit the
20979 // load.
20980 if (NeedsLoad)
20981 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20982 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20983
20984 // If there was a non-zero offset that we didn't fold, create an explicit
20985 // addition for it.
20986 if (Offset != 0)
20987 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20988 DAG.getConstant(Offset, dl, PtrVT));
20989
20990 return Result;
20991}
20992
20993SDValue
20994X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20995 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20996}
20997
20998static SDValue
20999GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
21000 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
21001 unsigned char OperandFlags, bool LocalDynamic = false) {
21002 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21003 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21004 SDLoc dl(GA);
21005 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21006 GA->getValueType(0),
21007 GA->getOffset(),
21008 OperandFlags);
21009
21010 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
21011 : X86ISD::TLSADDR;
21012
21013 if (InGlue) {
21014 SDValue Ops[] = { Chain, TGA, *InGlue };
21015 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21016 } else {
21017 SDValue Ops[] = { Chain, TGA };
21018 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21019 }
21020
21021 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
21022 MFI.setAdjustsStack(true);
21023 MFI.setHasCalls(true);
21024
21025 SDValue Glue = Chain.getValue(1);
21026 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
21027}
21028
21029// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
21030static SDValue
21031LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21032 const EVT PtrVT) {
21033 SDValue InGlue;
21034 SDLoc dl(GA); // ? function entry point might be better
21035 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21036 DAG.getNode(X86ISD::GlobalBaseReg,
21037 SDLoc(), PtrVT), InGlue);
21038 InGlue = Chain.getValue(1);
21039
21040 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21041}
21042
21043// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21044static SDValue
21045LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21046 const EVT PtrVT) {
21047 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21048 X86::RAX, X86II::MO_TLSGD);
21049}
21050
21051// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21052static SDValue
21053LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21054 const EVT PtrVT) {
21055 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21056 X86::EAX, X86II::MO_TLSGD);
21057}
21058
21059static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21060 SelectionDAG &DAG, const EVT PtrVT,
21061 bool Is64Bit, bool Is64BitLP64) {
21062 SDLoc dl(GA);
21063
21064 // Get the start address of the TLS block for this module.
21065 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21066 .getInfo<X86MachineFunctionInfo>();
21067 MFI->incNumLocalDynamicTLSAccesses();
21068
21069 SDValue Base;
21070 if (Is64Bit) {
21071 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21072 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21073 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21074 } else {
21075 SDValue InGlue;
21076 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21077 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21078 InGlue = Chain.getValue(1);
21079 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21080 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21081 }
21082
21083 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21084 // of Base.
21085
21086 // Build x@dtpoff.
21087 unsigned char OperandFlags = X86II::MO_DTPOFF;
21088 unsigned WrapperKind = X86ISD::Wrapper;
21089 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21090 GA->getValueType(0),
21091 GA->getOffset(), OperandFlags);
21092 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21093
21094 // Add x@dtpoff with the base.
21095 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21096}
21097
21098// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21099static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21100 const EVT PtrVT, TLSModel::Model model,
21101 bool is64Bit, bool isPIC) {
21102 SDLoc dl(GA);
21103
21104 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21105 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21106 is64Bit ? 257 : 256));
21107
21108 SDValue ThreadPointer =
21109 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21110 MachinePointerInfo(Ptr));
21111
21112 unsigned char OperandFlags = 0;
21113 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
21114 // initialexec.
21115 unsigned WrapperKind = X86ISD::Wrapper;
21116 if (model == TLSModel::LocalExec) {
21117 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21118 } else if (model == TLSModel::InitialExec) {
21119 if (is64Bit) {
21120 OperandFlags = X86II::MO_GOTTPOFF;
21121 WrapperKind = X86ISD::WrapperRIP;
21122 } else {
21123 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21124 }
21125 } else {
21126 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21126)
;
21127 }
21128
21129 // emit "addl x@ntpoff,%eax" (local exec)
21130 // or "addl x@indntpoff,%eax" (initial exec)
21131 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21132 SDValue TGA =
21133 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21134 GA->getOffset(), OperandFlags);
21135 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21136
21137 if (model == TLSModel::InitialExec) {
21138 if (isPIC && !is64Bit) {
21139 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21140 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21141 Offset);
21142 }
21143
21144 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21145 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21146 }
21147
21148 // The address of the thread local variable is the add of the thread
21149 // pointer with the offset of the variable.
21150 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21151}
21152
21153SDValue
21154X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21155
21156 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21157
21158 if (DAG.getTarget().useEmulatedTLS())
21159 return LowerToTLSEmulatedModel(GA, DAG);
21160
21161 const GlobalValue *GV = GA->getGlobal();
21162 auto PtrVT = getPointerTy(DAG.getDataLayout());
21163 bool PositionIndependent = isPositionIndependent();
21164
21165 if (Subtarget.isTargetELF()) {
21166 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21167 switch (model) {
21168 case TLSModel::GeneralDynamic:
21169 if (Subtarget.is64Bit()) {
21170 if (Subtarget.isTarget64BitLP64())
21171 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21172 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21173 }
21174 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21175 case TLSModel::LocalDynamic:
21176 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21177 Subtarget.isTarget64BitLP64());
21178 case TLSModel::InitialExec:
21179 case TLSModel::LocalExec:
21180 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21181 PositionIndependent);
21182 }
21183 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21183)
;
21184 }
21185
21186 if (Subtarget.isTargetDarwin()) {
21187 // Darwin only has one model of TLS. Lower to that.
21188 unsigned char OpFlag = 0;
21189 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21190 X86ISD::WrapperRIP : X86ISD::Wrapper;
21191
21192 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21193 // global base reg.
21194 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21195 if (PIC32)
21196 OpFlag = X86II::MO_TLVP_PIC_BASE;
21197 else
21198 OpFlag = X86II::MO_TLVP;
21199 SDLoc DL(Op);
21200 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21201 GA->getValueType(0),
21202 GA->getOffset(), OpFlag);
21203 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21204
21205 // With PIC32, the address is actually $g + Offset.
21206 if (PIC32)
21207 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21208 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21209 Offset);
21210
21211 // Lowering the machine isd will make sure everything is in the right
21212 // location.
21213 SDValue Chain = DAG.getEntryNode();
21214 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21215 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21216 SDValue Args[] = { Chain, Offset };
21217 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21218 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21219
21220 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21221 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21222 MFI.setAdjustsStack(true);
21223
21224 // And our return value (tls address) is in the standard call return value
21225 // location.
21226 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21227 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21228 }
21229
21230 if (Subtarget.isOSWindows()) {
21231 // Just use the implicit TLS architecture
21232 // Need to generate something similar to:
21233 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21234 // ; from TEB
21235 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21236 // mov rcx, qword [rdx+rcx*8]
21237 // mov eax, .tls$:tlsvar
21238 // [rax+rcx] contains the address
21239 // Windows 64bit: gs:0x58
21240 // Windows 32bit: fs:__tls_array
21241
21242 SDLoc dl(GA);
21243 SDValue Chain = DAG.getEntryNode();
21244
21245 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21246 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21247 // use its literal value of 0x2C.
21248 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21249 ? Type::getInt8PtrTy(*DAG.getContext(),
21250 256)
21251 : Type::getInt32PtrTy(*DAG.getContext(),
21252 257));
21253
21254 SDValue TlsArray = Subtarget.is64Bit()
21255 ? DAG.getIntPtrConstant(0x58, dl)
21256 : (Subtarget.isTargetWindowsGNU()
21257 ? DAG.getIntPtrConstant(0x2C, dl)
21258 : DAG.getExternalSymbol("_tls_array", PtrVT));
21259
21260 SDValue ThreadPointer =
21261 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21262
21263 SDValue res;
21264 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21265 res = ThreadPointer;
21266 } else {
21267 // Load the _tls_index variable
21268 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21269 if (Subtarget.is64Bit())
21270 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21271 MachinePointerInfo(), MVT::i32);
21272 else
21273 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21274
21275 const DataLayout &DL = DAG.getDataLayout();
21276 SDValue Scale =
21277 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21278 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21279
21280 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21281 }
21282
21283 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21284
21285 // Get the offset of start of .tls section
21286 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21287 GA->getValueType(0),
21288 GA->getOffset(), X86II::MO_SECREL);
21289 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21290
21291 // The address of the thread local variable is the add of the thread
21292 // pointer with the offset of the variable.
21293 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21294 }
21295
21296 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 21296)
;
21297}
21298
21299/// Lower SRA_PARTS and friends, which return two i32 values
21300/// and take a 2 x i32 value to shift plus a shift amount.
21301/// TODO: Can this be moved to general expansion code?
21302static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21303 SDValue Lo, Hi;
21304 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21305 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21306}
21307
21308// Try to use a packed vector operation to handle i64 on 32-bit targets when
21309// AVX512DQ is enabled.
21310static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21311 const X86Subtarget &Subtarget) {
21312 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21313 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21314 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21315 Op.getOpcode() == ISD::UINT_TO_FP) &&
21316 "Unexpected opcode!");
21317 bool IsStrict = Op->isStrictFPOpcode();
21318 unsigned OpNo = IsStrict ? 1 : 0;
21319 SDValue Src = Op.getOperand(OpNo);
21320 MVT SrcVT = Src.getSimpleValueType();
21321 MVT VT = Op.getSimpleValueType();
21322
21323 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21324 (VT != MVT::f32 && VT != MVT::f64))
21325 return SDValue();
21326
21327 // Pack the i64 into a vector, do the operation and extract.
21328
21329 // Using 256-bit to ensure result is 128-bits for f32 case.
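// For example, without VLX the i64 source is placed in element 0 of a v8i64,
// converted to v8f32/v8f64, and element 0 of the result is extracted.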
21330 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21331 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21332 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21333
21334 SDLoc dl(Op);
21335 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21336 if (IsStrict) {
21337 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21338 {Op.getOperand(0), InVec});
21339 SDValue Chain = CvtVec.getValue(1);
21340 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21341 DAG.getIntPtrConstant(0, dl));
21342 return DAG.getMergeValues({Value, Chain}, dl);
21343 }
21344
21345 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21346
21347 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21348 DAG.getIntPtrConstant(0, dl));
21349}
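
As an illustration only (not part of X86ISelLowering.cpp), the same idea can be written with AVX512DQ intrinsics; the helper name and the use of a broadcast are assumptions made for the sketch, and the lowering above prefers 256-bit vectors when VLX is available while this uses the 512-bit form:

#include <immintrin.h>

// Put the scalar i64 into a vector register, use the packed VCVTQQ2PD
// conversion, and read element 0 back out (build with -mavx512dq).
double sitofp_i64_via_vector(long long x) {
  __m512i v = _mm512_set1_epi64(x);                  // stands in for SCALAR_TO_VECTOR
  __m512d c = _mm512_cvtepi64_pd(v);                 // packed i64 -> f64
  return _mm_cvtsd_f64(_mm512_castpd512_pd128(c));   // extract lane 0
}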
21350
21351// Try to use a packed vector operation to handle i64 on 32-bit targets.
21352static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21353 const X86Subtarget &Subtarget) {
21354 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21355 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21356 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21357 Op.getOpcode() == ISD::UINT_TO_FP) &&
21358 "Unexpected opcode!");
21359 bool IsStrict = Op->isStrictFPOpcode();
21360 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21361 MVT SrcVT = Src.getSimpleValueType();
21362 MVT VT = Op.getSimpleValueType();
21363
21364 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21365 return SDValue();
21366
21367 // Pack the i64 into a vector, do the operation and extract.
21368
21369 assert(Subtarget.hasFP16() && "Expected FP16");
21370
21371 SDLoc dl(Op);
21372 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21373 if (IsStrict) {
21374 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21375 {Op.getOperand(0), InVec});
21376 SDValue Chain = CvtVec.getValue(1);
21377 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21378 DAG.getIntPtrConstant(0, dl));
21379 return DAG.getMergeValues({Value, Chain}, dl);
21380 }
21381
21382 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21383
21384 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21385 DAG.getIntPtrConstant(0, dl));
21386}
21387
21388static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21389 const X86Subtarget &Subtarget) {
21390 switch (Opcode) {
21391 case ISD::SINT_TO_FP:
21392 // TODO: Handle wider types with AVX/AVX512.
21393 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21394 return false;
21395 // CVTDQ2PS or (V)CVTDQ2PD
21396 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21397
21398 case ISD::UINT_TO_FP:
21399 // TODO: Handle wider types and i64 elements.
21400 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21401 return false;
21402 // VCVTUDQ2PS or VCVTUDQ2PD
21403 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21404
21405 default:
21406 return false;
21407 }
21408}
21409
21410/// Given a scalar cast operation that is extracted from a vector, try to
21411/// vectorize the cast op followed by extraction. This will avoid an expensive
21412/// round-trip between XMM and GPR.
21413static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21414 const X86Subtarget &Subtarget) {
21415 // TODO: This could be enhanced to handle smaller integer types by peeking
21416 // through an extend.
21417 SDValue Extract = Cast.getOperand(0);
21418 MVT DestVT = Cast.getSimpleValueType();
21419 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21420 !isa<ConstantSDNode>(Extract.getOperand(1)))
21421 return SDValue();
21422
21423 // See if we have a 128-bit vector cast op for this type of cast.
21424 SDValue VecOp = Extract.getOperand(0);
21425 MVT FromVT = VecOp.getSimpleValueType();
21426 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21427 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21428 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21429 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21430 return SDValue();
21431
21432 // If we are extracting from a non-zero element, first shuffle the source
21433 // vector to allow extracting from element zero.
21434 SDLoc DL(Cast);
21435 if (!isNullConstant(Extract.getOperand(1))) {
21436 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21437 Mask[0] = Extract.getConstantOperandVal(1);
21438 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21439 }
21440 // If the source vector is wider than 128-bits, extract the low part. Do not
21441 // create an unnecessarily wide vector cast op.
21442 if (FromVT != Vec128VT)
21443 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21444
21445 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21446 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21447 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21448 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21449 DAG.getIntPtrConstant(0, DL));
21450}
21451
21452/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21453/// try to vectorize the cast ops. This will avoid an expensive round-trip
21454/// between XMM and GPR.
21455static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21456 const X86Subtarget &Subtarget) {
21457 // TODO: Allow FP_TO_UINT.
21458 SDValue CastToInt = CastToFP.getOperand(0);
21459 MVT VT = CastToFP.getSimpleValueType();
21460 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21461 return SDValue();
21462
21463 MVT IntVT = CastToInt.getSimpleValueType();
21464 SDValue X = CastToInt.getOperand(0);
21465 MVT SrcVT = X.getSimpleValueType();
21466 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21467 return SDValue();
21468
21469 // See if we have 128-bit vector cast instructions for this type of cast.
21470 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21471 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21472 IntVT != MVT::i32)
21473 return SDValue();
21474
21475 unsigned SrcSize = SrcVT.getSizeInBits();
21476 unsigned IntSize = IntVT.getSizeInBits();
21477 unsigned VTSize = VT.getSizeInBits();
21478 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21479 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21480 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21481
21482 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21483 unsigned ToIntOpcode =
21484 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21485 unsigned ToFPOpcode =
21486 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21487
21488 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21489 //
21490 // We are not defining the high elements (for example, zero them) because
21491 // that could nullify any performance advantage that we hoped to gain from
21492 // this vector op hack. We do not expect any adverse effects (like denorm
21493 // penalties) with cast ops.
21494 SDLoc DL(CastToFP);
21495 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21496 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21497 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21498 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21499 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21500}
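
A minimal standalone sketch of the pattern this lowering targets, assuming SSE2 and <immintrin.h> (illustration only, not part of the LLVM source; the function names are made up for the example):

#include <immintrin.h>

// Scalar form: (float)(int)x bounces through a GPR (cvttss2si + cvtsi2ss).
float trunc_via_gpr(float x) { return (float)(int)x; }

// Vectorized form: both conversions stay in an XMM register
// (cvttps2dq + cvtdq2ps). Note _mm_set_ss zeroes the upper lanes here,
// whereas the lowering above deliberately leaves them undefined.
float trunc_via_xmm(float x) {
  __m128  v = _mm_set_ss(x);            // scalar placed into a vector
  __m128i i = _mm_cvttps_epi32(v);      // fp_to_sint on the whole vector
  __m128  r = _mm_cvtepi32_ps(i);       // sint_to_fp on the whole vector
  return _mm_cvtss_f32(r);              // extract element 0
}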
21501
21502static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21503 const X86Subtarget &Subtarget) {
21504 SDLoc DL(Op);
21505 bool IsStrict = Op->isStrictFPOpcode();
21506 MVT VT = Op->getSimpleValueType(0);
21507 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21508
21509 if (Subtarget.hasDQI()) {
21510 assert(!Subtarget.hasVLX() && "Unexpected features");
21511
21512 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21513 Src.getSimpleValueType() == MVT::v4i64) &&
21514 "Unsupported custom type");
21515
21516 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21517 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21518 "Unexpected VT!");
21519 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21520
21521 // Need to concat with zero vector for strict fp to avoid spurious
21522 // exceptions.
21523 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21524 : DAG.getUNDEF(MVT::v8i64);
21525 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21526 DAG.getIntPtrConstant(0, DL));
21527 SDValue Res, Chain;
21528 if (IsStrict) {
21529 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21530 {Op->getOperand(0), Src});
21531 Chain = Res.getValue(1);
21532 } else {
21533 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21534 }
21535
21536 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21537 DAG.getIntPtrConstant(0, DL));
21538
21539 if (IsStrict)
21540 return DAG.getMergeValues({Res, Chain}, DL);
21541 return Res;
21542 }
21543
21544 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21545 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21546 if (VT != MVT::v4f32 || IsSigned)
21547 return SDValue();
21548
21549 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21550 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21551 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21552 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21553 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21554 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21555 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21556 SmallVector<SDValue, 4> SignCvts(4);
21557 SmallVector<SDValue, 4> Chains(4);
21558 for (int i = 0; i != 4; ++i) {
21559 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21560 DAG.getIntPtrConstant(i, DL));
21561 if (IsStrict) {
21562 SignCvts[i] =
21563 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21564 {Op.getOperand(0), Elt});
21565 Chains[i] = SignCvts[i].getValue(1);
21566 } else {
21567 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21568 }
21569 }
21570 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21571
21572 SDValue Slow, Chain;
21573 if (IsStrict) {
21574 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21575 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21576 {Chain, SignCvt, SignCvt});
21577 Chain = Slow.getValue(1);
21578 } else {
21579 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21580 }
21581
21582 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21583 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21584
21585 if (IsStrict)
21586 return DAG.getMergeValues({Cvt, Chain}, DL);
21587
21588 return Cvt;
21589}
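
For reference, a scalar C++ model of the unsigned slow path above (standard C++ only, not part of the LLVM source; the helper name is invented for the example). The halving uses "round to odd" so that doubling the converted value still rounds correctly:

#include <cstdint>
#include <cstdio>

// When the top bit is set, halve with round-to-odd (SRL by 1, OR in the low
// bit), convert as signed, then double the result (the FADD in the lowering).
static float u64_to_f32_model(uint64_t u) {
  if ((int64_t)u >= 0)
    return (float)(int64_t)u;              // already fits the signed convert
  uint64_t half = (u >> 1) | (u & 1);      // Sign = (Src >> 1) | (Src & 1)
  float f = (float)(int64_t)half;          // per-element SINT_TO_FP
  return f + f;                            // Slow = FADD SignCvt, SignCvt
}

int main() {
  const uint64_t tests[] = {0u, 1u, 0x8000000000000000ULL,
                            0x8000000000000001ULL, 0xFFFFFFFFFFFFFFFFULL};
  for (uint64_t t : tests)
    std::printf("%llu -> %a (reference %a)\n", (unsigned long long)t,
                (double)u64_to_f32_model(t), (double)(float)t);
  return 0;
}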
21590
21591static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21592 bool IsStrict = Op->isStrictFPOpcode();
21593 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21594 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21595 MVT VT = Op.getSimpleValueType();
21596 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21597 SDLoc dl(Op);
21598
21599 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21600 if (IsStrict)
21601 return DAG.getNode(
21602 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21603 {Chain,
21604 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21605 Rnd});
21606 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21607 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21608}
21609
21610static bool isLegalConversion(MVT VT, bool IsSigned,
21611 const X86Subtarget &Subtarget) {
21612 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21613 return true;
21614 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21615 return true;
21616 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21617 return true;
21618 if (Subtarget.useAVX512Regs()) {
21619 if (VT == MVT::v16i32)
21620 return true;
21621 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21622 return true;
21623 }
21624 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21625 (VT == MVT::v2i64 || VT == MVT::v4i64))
21626 return true;
21627 return false;
21628}
21629
21630SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21631 SelectionDAG &DAG) const {
21632 bool IsStrict = Op->isStrictFPOpcode();
21633 unsigned OpNo = IsStrict ? 1 : 0;
21634 SDValue Src = Op.getOperand(OpNo);
21635 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21636 MVT SrcVT = Src.getSimpleValueType();
21637 MVT VT = Op.getSimpleValueType();
21638 SDLoc dl(Op);
21639
21640 if (isSoftFP16(VT))
21641 return promoteXINT_TO_FP(Op, DAG);
21642 else if (isLegalConversion(SrcVT, true, Subtarget))
21643 return Op;
21644
21645 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21646 return LowerWin64_INT128_TO_FP(Op, DAG);
21647
21648 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21649 return Extract;
21650
21651 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21652 return R;
21653
21654 if (SrcVT.isVector()) {
21655 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21656 // Note: Since v2f64 is a legal type, we don't need to zero extend the
21657 // source for strict FP.
21658 if (IsStrict)
21659 return DAG.getNode(
21660 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21661 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21662 DAG.getUNDEF(SrcVT))});
21663 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21664 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21665 DAG.getUNDEF(SrcVT)));
21666 }
21667 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21668 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21669
21670 return SDValue();
21671 }
21672
21673 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21674 "Unknown SINT_TO_FP to lower!");
21675
21676 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21677
21678 // These are really Legal; return the operand so the caller accepts it as
21679 // Legal.
21680 if (SrcVT == MVT::i32 && UseSSEReg)
21681 return Op;
21682 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21683 return Op;
21684
21685 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21686 return V;
21687 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21688 return V;
21689
21690 // SSE doesn't have an i16 conversion so we need to promote.
21691 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21692 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21693 if (IsStrict)
21694 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21695 {Chain, Ext});
21696
21697 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21698 }
21699
21700 if (VT == MVT::f128 || !Subtarget.hasX87())
21701 return SDValue();
21702
21703 SDValue ValueToStore = Src;
21704 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21705 // Bitcasting to f64 here allows us to do a single 64-bit store from
21706 // an SSE register, avoiding the store forwarding penalty that would come
21707 // with two 32-bit stores.
21708 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21709
21710 unsigned Size = SrcVT.getStoreSize();
21711 Align Alignment(Size);
21712 MachineFunction &MF = DAG.getMachineFunction();
21713 auto PtrVT = getPointerTy(MF.getDataLayout());
21714 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21715 MachinePointerInfo MPI =
21716 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21717 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21718 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21719 std::pair<SDValue, SDValue> Tmp =
21720 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21721
21722 if (IsStrict)
21723 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21724
21725 return Tmp.first;
21726}
21727
21728std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21729 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21730 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21731 // Build the FILD
21732 SDVTList Tys;
21733 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21734 if (useSSE)
21735 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21736 else
21737 Tys = DAG.getVTList(DstVT, MVT::Other);
21738
21739 SDValue FILDOps[] = {Chain, Pointer};
21740 SDValue Result =
21741 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21742 Alignment, MachineMemOperand::MOLoad);
21743 Chain = Result.getValue(1);
21744
21745 if (useSSE) {
21746 MachineFunction &MF = DAG.getMachineFunction();
21747 unsigned SSFISize = DstVT.getStoreSize();
21748 int SSFI =
21749 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21750 auto PtrVT = getPointerTy(MF.getDataLayout());
21751 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21752 Tys = DAG.getVTList(MVT::Other);
21753 SDValue FSTOps[] = {Chain, Result, StackSlot};
21754 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21755 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21756 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21757
21758 Chain =
21759 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21760 Result = DAG.getLoad(
21761 DstVT, DL, Chain, StackSlot,
21762 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21763 Chain = Result.getValue(1);
21764 }
21765
21766 return { Result, Chain };
21767}
21768
21769/// Horizontal vector math instructions may be slower than normal math with
21770/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21771/// implementation, and likely shuffle complexity of the alternate sequence.
21772static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21773 const X86Subtarget &Subtarget) {
21774 bool IsOptimizingSize = DAG.shouldOptForSize();
21775 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21776 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21777}
21778
21779/// 64-bit unsigned integer to double expansion.
21780static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21781 const X86Subtarget &Subtarget) {
21782 // We can't use this algorithm for strict FP: it produces -0.0 instead of
21783 // +0.0 when converting 0 while rounding toward negative infinity. The caller
21784 // will fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
21785 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21786 // This algorithm is not obvious. Here is what we're trying to output:
21787 /*
21788 movq %rax, %xmm0
21789 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21790 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21791 #ifdef __SSE3__
21792 haddpd %xmm0, %xmm0
21793 #else
21794 pshufd $0x4e, %xmm0, %xmm1
21795 addpd %xmm1, %xmm0
21796 #endif
21797 */
21798
21799 SDLoc dl(Op);
21800 LLVMContext *Context = DAG.getContext();
21801
21802 // Build some magic constants.
21803 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21804 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21805 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21806 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21807
21808 SmallVector<Constant*,2> CV1;
21809 CV1.push_back(
21810 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21811 APInt(64, 0x4330000000000000ULL))));
21812 CV1.push_back(
21813 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21814 APInt(64, 0x4530000000000000ULL))));
21815 Constant *C1 = ConstantVector::get(CV1);
21816 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21817
21818 // Load the 64-bit value into an XMM register.
21819 SDValue XR1 =
21820 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21821 SDValue CLod0 = DAG.getLoad(
21822 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21823 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21824 SDValue Unpck1 =
21825 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21826
21827 SDValue CLod1 = DAG.getLoad(
21828 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21829 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21830 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21831 // TODO: Are there any fast-math-flags to propagate here?
21832 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21833 SDValue Result;
21834
21835 if (Subtarget.hasSSE3() &&
21836 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21837 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21838 } else {
21839 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21840 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21841 }
21842 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21843 DAG.getIntPtrConstant(0, dl));
21844 return Result;
21845}
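
For reference only (not part of X86ISelLowering.cpp, helper name invented), a scalar C++ model of the constant trick sketched in the comment above, using std::memcpy for the bitcasts:

#include <cstdint>
#include <cstring>

// Glue the low 32 bits onto the exponent of 2^52 and the high 32 bits onto
// the exponent of 2^84, subtract those powers of two, and add the two halves
// (the haddpd/addpd at the end of the sequence).
static double u64_to_f64_model(uint64_t u) {
  uint64_t lo_bits = 0x4330000000000000ULL | (u & 0xffffffffULL); // 2^52 + lo
  uint64_t hi_bits = 0x4530000000000000ULL | (u >> 32);           // 2^84 + hi*2^32
  double lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof(double));
  std::memcpy(&hi, &hi_bits, sizeof(double));
  lo -= 0x1.0p52;          // c1[0]
  hi -= 0x1.0p84;          // c1[1] == 0x1.0p52 * 0x1.0p32
  return lo + hi;          // the single rounding happens here
}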
21846
21847/// 32-bit unsigned integer to float expansion.
21848static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21849 const X86Subtarget &Subtarget) {
21850 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21851 SDLoc dl(Op);
21852 // FP constant to bias correct the final result.
21853 SDValue Bias = DAG.getConstantFP(
21854 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21855
21856 // Load the 32-bit value into an XMM register.
21857 SDValue Load =
21858 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21859
21860 // Zero out the upper parts of the register.
21861 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21862
21863 // Or the load with the bias.
21864 SDValue Or = DAG.getNode(
21865 ISD::OR, dl, MVT::v2i64,
21866 DAG.getBitcast(MVT::v2i64, Load),
21867 DAG.getBitcast(MVT::v2i64,
21868 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21869 Or =
21870 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21871 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21872
21873 if (Op.getNode()->isStrictFPOpcode()) {
21874 // Subtract the bias.
21875 // TODO: Are there any fast-math-flags to propagate here?
21876 SDValue Chain = Op.getOperand(0);
21877 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21878 {Chain, Or, Bias});
21879
21880 if (Op.getValueType() == Sub.getValueType())
21881 return Sub;
21882
21883 // Handle final rounding.
21884 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21885 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21886
21887 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21888 }
21889
21890 // Subtract the bias.
21891 // TODO: Are there any fast-math-flags to propagate here?
21892 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21893
21894 // Handle final rounding.
21895 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21896}
21897
21898static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21899 const X86Subtarget &Subtarget,
21900 const SDLoc &DL) {
21901 if (Op.getSimpleValueType() != MVT::v2f64)
21902 return SDValue();
21903
21904 bool IsStrict = Op->isStrictFPOpcode();
21905
21906 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21907 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21908
21909 if (Subtarget.hasAVX512()) {
21910 if (!Subtarget.hasVLX()) {
21911 // Let generic type legalization widen this.
21912 if (!IsStrict)
21913 return SDValue();
21914 // Otherwise pad the integer input with 0s and widen the operation.
21915 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21916 DAG.getConstant(0, DL, MVT::v2i32));
21917 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21918 {Op.getOperand(0), N0});
21919 SDValue Chain = Res.getValue(1);
21920 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21921 DAG.getIntPtrConstant(0, DL));
21922 return DAG.getMergeValues({Res, Chain}, DL);
21923 }
21924
21925 // Legalize to v4i32 type.
21926 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21927 DAG.getUNDEF(MVT::v2i32));
21928 if (IsStrict)
21929 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21930 {Op.getOperand(0), N0});
21931 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21932 }
21933
21934 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21935 // This gives us the floating point equivalent of 2^52 + the i32 integer
21936 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21937 // point leaving just our i32 integers in double format.
21938 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21939 SDValue VBias = DAG.getConstantFP(
21940 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21941 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21942 DAG.getBitcast(MVT::v2i64, VBias));
21943 Or = DAG.getBitcast(MVT::v2f64, Or);
21944
21945 if (IsStrict)
21946 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21947 {Op.getOperand(0), Or, VBias});
21948 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21949}
21950
21951static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21952 const X86Subtarget &Subtarget) {
21953 SDLoc DL(Op);
21954 bool IsStrict = Op->isStrictFPOpcode();
21955 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21956 MVT VecIntVT = V.getSimpleValueType();
21957 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21958 "Unsupported custom type");
21959
21960 if (Subtarget.hasAVX512()) {
21961 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21962 assert(!Subtarget.hasVLX() && "Unexpected features");
21963 MVT VT = Op->getSimpleValueType(0);
21964
21965 // v8i32->v8f64 is legal with AVX512 so just return it.
21966 if (VT == MVT::v8f64)
21967 return Op;
21968
21969 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21970 "Unexpected VT!");
21971 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21972 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21973 // Need to concat with zero vector for strict fp to avoid spurious
21974 // exceptions.
21975 SDValue Tmp =
21976 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21977 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21978 DAG.getIntPtrConstant(0, DL));
21979 SDValue Res, Chain;
21980 if (IsStrict) {
21981 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21982 {Op->getOperand(0), V});
21983 Chain = Res.getValue(1);
21984 } else {
21985 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21986 }
21987
21988 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21989 DAG.getIntPtrConstant(0, DL));
21990
21991 if (IsStrict)
21992 return DAG.getMergeValues({Res, Chain}, DL);
21993 return Res;
21994 }
21995
21996 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21997 Op->getSimpleValueType(0) == MVT::v4f64) {
21998 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21999 Constant *Bias = ConstantFP::get(
22000 *DAG.getContext(),
22001 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
22002 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
22003 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
22004 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
22005 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
22006 SDValue VBias = DAG.getMemIntrinsicNode(
22007 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
22008 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
22009 MachineMemOperand::MOLoad);
22010
22011 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
22012 DAG.getBitcast(MVT::v4i64, VBias));
22013 Or = DAG.getBitcast(MVT::v4f64, Or);
22014
22015 if (IsStrict)
22016 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
22017 {Op.getOperand(0), Or, VBias});
22018 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
22019 }
22020
22021 // The algorithm is the following:
22022 // #ifdef __SSE4_1__
22023 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22024 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22025 // (uint4) 0x53000000, 0xaa);
22026 // #else
22027 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22028 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22029 // #endif
22030 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22031 // return (float4) lo + fhi;
22032
22033 bool Is128 = VecIntVT == MVT::v4i32;
22034 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22035 // If we convert to something other than the supported type, e.g., to v4f64,
22036 // abort early.
22037 if (VecFloatVT != Op->getSimpleValueType(0))
22038 return SDValue();
22039
22040 // In the #ifdef/#else code, we have in common:
22041 // - The vector of constants:
22042 // -- 0x4b000000
22043 // -- 0x53000000
22044 // - A shift:
22045 // -- v >> 16
22046
22047 // Create the splat vector for 0x4b000000.
22048 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22049 // Create the splat vector for 0x53000000.
22050 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22051
22052 // Create the right shift.
22053 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22054 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22055
22056 SDValue Low, High;
22057 if (Subtarget.hasSSE41()) {
22058 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22059 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22060 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22061 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22062 // Low will be bitcasted right away, so do not bother bitcasting back to its
22063 // original type.
22064 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22065 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22066 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22067 // (uint4) 0x53000000, 0xaa);
22068 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22069 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22070 // High will be bitcasted right away, so do not bother bitcasting back to
22071 // its original type.
22072 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22073 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22074 } else {
22075 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22076 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22077 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22078 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22079
22080 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22081 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22082 }
22083
22084 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22085 SDValue VecCstFSub = DAG.getConstantFP(
22086 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22087
22088 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22089 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22090 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22091 // enabled. See PR24512.
22092 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22093 // TODO: Are there any fast-math-flags to propagate here?
22094 // (float4) lo;
22095 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22096 // return (float4) lo + fhi;
22097 if (IsStrict) {
22098 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22099 {Op.getOperand(0), HighBitcast, VecCstFSub});
22100 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22101 {FHigh.getValue(1), LowBitcast, FHigh});
22102 }
22103
22104 SDValue FHigh =
22105 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22106 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22107}
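
A scalar C++ model of the algorithm in the comment block above (illustration only, standard C++, not part of the LLVM source; the helper name is hypothetical):

#include <cstdint>
#include <cstring>

// The low 16 bits ride on the exponent of 2^23 (0x4b000000) and the high
// 16 bits on the exponent of 2^39 (0x53000000); subtracting (2^39 + 2^23)
// from the high half cancels both biases once the halves are added back.
static float u32_to_f32_model(uint32_t v) {
  uint32_t lo_bits = (v & 0xffffu) | 0x4b000000u;   // 2^23 + low16
  uint32_t hi_bits = (v >> 16)     | 0x53000000u;   // 2^39 + high16 * 2^16
  float lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof(float));
  std::memcpy(&hi, &hi_bits, sizeof(float));
  float fhi = hi - (0x1.0p39f + 0x1.0p23f);         // VecCstFSub (0x53000080)
  return lo + fhi;                                  // lo's 2^23 bias cancels here
}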
22108
22109static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22110 const X86Subtarget &Subtarget) {
22111 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22112 SDValue N0 = Op.getOperand(OpNo);
22113 MVT SrcVT = N0.getSimpleValueType();
22114 SDLoc dl(Op);
22115
22116 switch (SrcVT.SimpleTy) {
22117 default:
22118 llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22118)
;
22119 case MVT::v2i32:
22120 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22121 case MVT::v4i32:
22122 case MVT::v8i32:
22123 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22124 case MVT::v2i64:
22125 case MVT::v4i64:
22126 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22127 }
22128}
22129
22130SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22131 SelectionDAG &DAG) const {
22132 bool IsStrict = Op->isStrictFPOpcode();
22133 unsigned OpNo = IsStrict ? 1 : 0;
22134 SDValue Src = Op.getOperand(OpNo);
22135 SDLoc dl(Op);
22136 auto PtrVT = getPointerTy(DAG.getDataLayout());
22137 MVT SrcVT = Src.getSimpleValueType();
22138 MVT DstVT = Op->getSimpleValueType(0);
22139 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22140
22141 // Bail out when we don't have native conversion instructions.
22142 if (DstVT == MVT::f128)
22143 return SDValue();
22144
22145 if (isSoftFP16(DstVT))
22146 return promoteXINT_TO_FP(Op, DAG);
22147 else if (isLegalConversion(SrcVT, false, Subtarget))
22148 return Op;
22149
22150 if (DstVT.isVector())
22151 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22152
22153 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22154 return LowerWin64_INT128_TO_FP(Op, DAG);
22155
22156 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22157 return Extract;
22158
22159 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22160 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22161 // Conversions from unsigned i32 to f32/f64 are legal,
22162 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22163 return Op;
22164 }
22165
22166 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22167 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22168 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22169 if (IsStrict)
22170 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22171 {Chain, Src});
22172 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22173 }
22174
22175 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22176 return V;
22177 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22178 return V;
22179
22180 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22181 // infinity. It produces -0.0, so disable under strictfp.
22182 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22183 !IsStrict)
22184 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22185 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22186 // negative infinity, so disable it under strictfp and use FILD instead.
22187 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22188 !IsStrict)
22189 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22190 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22191 (DstVT == MVT::f32 || DstVT == MVT::f64))
22192 return SDValue();
22193
22194 // Make a 64-bit buffer, and use it to build an FILD.
22195 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22196 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22197 Align SlotAlign(8);
22198 MachinePointerInfo MPI =
22199 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22200 if (SrcVT == MVT::i32) {
22201 SDValue OffsetSlot =
22202 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22203 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22204 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22205 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22206 std::pair<SDValue, SDValue> Tmp =
22207 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22208 if (IsStrict)
22209 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22210
22211 return Tmp.first;
22212 }
22213
22214 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22215 SDValue ValueToStore = Src;
22216 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22217 // Bitcasting to f64 here allows us to do a single 64-bit store from
22218 // an SSE register, avoiding the store forwarding penalty that would come
22219 // with two 32-bit stores.
22220 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22221 }
22222 SDValue Store =
22223 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22224 // For i64 source, we need to add the appropriate power of 2 if the input
22225 // was negative. We must be careful to do the computation in x87 extended
22226 // precision, not in SSE.
22227 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22228 SDValue Ops[] = { Store, StackSlot };
22229 SDValue Fild =
22230 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22231 SlotAlign, MachineMemOperand::MOLoad);
22232 Chain = Fild.getValue(1);
22233
22234
22235 // Check whether the sign bit is set.
22236 SDValue SignSet = DAG.getSetCC(
22237 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22238 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22239
22240 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22241 APInt FF(64, 0x5F80000000000000ULL);
22242 SDValue FudgePtr = DAG.getConstantPool(
22243 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22244 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22245
22246 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22247 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22248 SDValue Four = DAG.getIntPtrConstant(4, dl);
22249 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22250 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22251
22252 // Load the value out, extending it from f32 to f80.
22253 SDValue Fudge = DAG.getExtLoad(
22254 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22255 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22256 CPAlignment);
22257 Chain = Fudge.getValue(1);
22258 // Extend everything to 80 bits to force it to be done on x87.
22259 // TODO: Are there any fast-math-flags to propagate here?
22260 if (IsStrict) {
22261 unsigned Opc = ISD::STRICT_FADD;
22262 // Windows needs the precision control changed to 80bits around this add.
22263 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22264 Opc = X86ISD::STRICT_FP80_ADD;
22265
22266 SDValue Add =
22267 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22268 // STRICT_FP_ROUND can't handle equal types.
22269 if (DstVT == MVT::f80)
22270 return Add;
22271 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22272 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22273 }
22274 unsigned Opc = ISD::FADD;
22275 // Windows needs the precision control changed to 80bits around this add.
22276 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22277 Opc = X86ISD::FP80_ADD;
22278
22279 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22280 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22281 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22282}
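
For reference, a scalar model of the x87 tail of this function (illustration only, not part of the LLVM source; it assumes long double is the 80-bit x87 type, as on x86 Linux, and the helper name is invented):

#include <cstdint>

// FILD loads the 64 bits as a signed integer into an 80-bit register
// (exact), and the 2^64 "fudge" constant (f32 bit pattern 0x5F800000) is
// added back when the sign bit was set, before the final rounding.
static double u64_to_f64_x87_model(uint64_t u) {
  long double x = (long double)(int64_t)u;   // FILD: exact for all 64 bits
  if ((int64_t)u < 0)                        // SignSet = (i64)u < 0
    x += 0x1.0p64L;                          // Fudge loaded from the pool
  return (double)x;                          // final FP_ROUND to DstVT
}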
22283
22284// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22285// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22286// just return an SDValue().
22287// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22288// to i16, i32 or i64, and we lower it to a legal sequence and return the
22289// result.
22290SDValue
22291X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22292 bool IsSigned, SDValue &Chain) const {
22293 bool IsStrict = Op->isStrictFPOpcode();
22294 SDLoc DL(Op);
22295
22296 EVT DstTy = Op.getValueType();
22297 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22298 EVT TheVT = Value.getValueType();
22299 auto PtrVT = getPointerTy(DAG.getDataLayout());
22300
22301 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22302 // f16 must be promoted before using the lowering in this routine.
22303 // fp128 does not use this lowering.
22304 return SDValue();
22305 }
22306
22307 // If using FIST to compute an unsigned i64, we'll need some fixup
22308 // to handle values above the maximum signed i64. A FIST is always
22309 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22310 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22311
22312 // FIXME: This does not generate an invalid exception if the input does not
22313 // fit in i32. PR44019
22314 if (!IsSigned && DstTy != MVT::i64) {
22315 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22316 // The low 32 bits of the fist result will have the correct uint32 result.
22317 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22318 DstTy = MVT::i64;
22319 }
22320
22321 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22322 DstTy.getSimpleVT() >= MVT::i16 &&
22323 "Unknown FP_TO_INT to lower!");
22324
22325 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22326 // stack slot.
22327 MachineFunction &MF = DAG.getMachineFunction();
22328 unsigned MemSize = DstTy.getStoreSize();
22329 int SSFI =
22330 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22331 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22332
22333 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22334
22335 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22336
22337 if (UnsignedFixup) {
22338 //
22339 // Conversion to unsigned i64 is implemented with a select,
22340 // depending on whether the source value fits in the range
22341 // of a signed i64. Let Thresh be the FP equivalent of
22342 // 0x8000000000000000ULL.
22343 //
22344 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22345 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
22346 // FistSrc = (Value - FltOfs);
22347 // Fist-to-mem64 FistSrc
22348 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22349 // to XOR'ing the high 32 bits with Adjust.
22350 //
22351 // Being a power of 2, Thresh is exactly representable in all FP formats.
22352 // For X87 we'd like to use the smallest FP type for this constant, but
22353 // for DAG type consistency we have to match the FP operand type.
22354
22355 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22356 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22357 bool LosesInfo = false;
22358 if (TheVT == MVT::f64)
22359 // The rounding mode is irrelevant as the conversion should be exact.
22360 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22361 &LosesInfo);
22362 else if (TheVT == MVT::f80)
22363 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22364 APFloat::rmNearestTiesToEven, &LosesInfo);
22365
22366 assert(Status == APFloat::opOK && !LosesInfo &&
22367 "FP conversion should have been exact");
22368
22369 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22370
22371 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22372 *DAG.getContext(), TheVT);
22373 SDValue Cmp;
22374 if (IsStrict) {
22375 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22376 /*IsSignaling*/ true);
22377 Chain = Cmp.getValue(1);
22378 } else {
22379 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22380 }
22381
22382 // Our preferred lowering of
22383 //
22384 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22385 //
22386 // is
22387 //
22388 // (Value >= Thresh) << 63
22389 //
22390 // but since we can get here after LegalOperations, DAGCombine might do the
22391 // wrong thing if we create a select. So, directly create the preferred
22392 // version.
22393 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22394 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22395 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22396
22397 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22398 DAG.getConstantFP(0.0, DL, TheVT));
22399
22400 if (IsStrict) {
22401 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22402 { Chain, Value, FltOfs });
22403 Chain = Value.getValue(1);
22404 } else
22405 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22406 }
22407
22408 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22409
22410 // FIXME This causes a redundant load/store if the SSE-class value is already
22411 // in memory, such as if it is on the callstack.
22412 if (isScalarFPTypeInSSEReg(TheVT)) {
22413 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22414 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22415 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22416 SDValue Ops[] = { Chain, StackSlot };
22417
22418 unsigned FLDSize = TheVT.getStoreSize();
22419 assert(FLDSize <= MemSize && "Stack slot not big enough");
22420 MachineMemOperand *MMO = MF.getMachineMemOperand(
22421 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22422 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22423 Chain = Value.getValue(1);
22424 }
22425
22426 // Build the FP_TO_INT*_IN_MEM
22427 MachineMemOperand *MMO = MF.getMachineMemOperand(
22428 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22429 SDValue Ops[] = { Chain, Value, StackSlot };
22430 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22431 DAG.getVTList(MVT::Other),
22432 Ops, DstTy, MMO);
22433
22434 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22435 Chain = Res.getValue(1);
22436
22437 // If we need an unsigned fixup, XOR the result with adjust.
22438 if (UnsignedFixup)
22439 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22440
22441 return Res;
22442}
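
A scalar C++ model of the UnsignedFixup sequence above (illustration only, not part of the LLVM source; only in-range inputs are modeled and the helper name is invented):

#include <cstdint>

// Inputs at or above 2^63 are shifted into signed range before the
// FIST-style signed conversion, and the high bit is restored afterwards by
// XORing Adjust into the integer result. Out-of-range or NaN inputs are
// undefined here, as in C++.
static uint64_t f64_to_u64_model(double value) {
  const double Thresh = 0x1.0p63;                        // exactly representable
  uint64_t Adjust = value >= Thresh ? 0x8000000000000000ULL : 0;
  double FltOfs   = value >= Thresh ? Thresh : 0.0;      // the select on Cmp
  int64_t Fist    = (int64_t)(value - FltOfs);           // FP_TO_INT*_IN_MEM
  return (uint64_t)Fist ^ Adjust;                        // XOR the sign back in
}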
22443
22444static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22445 const X86Subtarget &Subtarget) {
22446 MVT VT = Op.getSimpleValueType();
22447 SDValue In = Op.getOperand(0);
22448 MVT InVT = In.getSimpleValueType();
22449 SDLoc dl(Op);
22450 unsigned Opc = Op.getOpcode();
22451
22452 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22453 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22454 "Unexpected extension opcode");
22455 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22456 "Expected same number of elements");
22457 assert((VT.getVectorElementType() == MVT::i16 ||
22458 VT.getVectorElementType() == MVT::i32 ||
22459 VT.getVectorElementType() == MVT::i64) &&
22460 "Unexpected element type");
22461 assert((InVT.getVectorElementType() == MVT::i8 ||
22462 InVT.getVectorElementType() == MVT::i16 ||
22463 InVT.getVectorElementType() == MVT::i32) &&
22464 "Unexpected element type");
22465
22466 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22467
22468 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22469 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22470 return splitVectorIntUnary(Op, DAG);
22471 }
22472
22473 if (Subtarget.hasInt256())
22474 return Op;
22475
22476 // Optimize vectors in AVX mode:
22477 //
22478 // v8i16 -> v8i32
22479 // Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
22480 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22481 // Concat upper and lower parts.
22482 //
22483 // v4i32 -> v4i64
22484 // Use vpmovzxdq for 2 lower elements v4i32 -> v2i64.
22485 // Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
22486 // Concat upper and lower parts.
22487 //
22488 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22489 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22490
22491 // Short-circuit if we can determine that each 128-bit half is the same value.
22492 // Otherwise, this is difficult to match and optimize.
22493 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22494 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22495 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22496
22497 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22498 SDValue Undef = DAG.getUNDEF(InVT);
22499 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22500 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22501 OpHi = DAG.getBitcast(HalfVT, OpHi);
22502
22503 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22504}
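
// Editor's sketch (intrinsics, not part of this file): the AVX1 zero-extend shape
// described in the comments above for v8i16 -> v8i32, assuming a hypothetical helper
// name and compilation with -mavx (SSE4.1 provides PMOVZXWD).
#include <immintrin.h>
static __m256i zext_v8i16_to_v8i32(__m128i In) {
  __m128i Zero = _mm_setzero_si128();
  __m128i OpLo = _mm_cvtepu16_epi32(In);        // vpmovzxwd: 4 lower elements
  __m128i OpHi = _mm_unpackhi_epi16(In, Zero);  // vpunpckhwd with zero: 4 upper elements
  return _mm256_set_m128i(OpHi, OpLo);          // concat upper and lower parts
}
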
22505
22506// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22507static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22508 const SDLoc &dl, SelectionDAG &DAG) {
22509 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22510 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22511 DAG.getIntPtrConstant(0, dl));
22512 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22513 DAG.getIntPtrConstant(8, dl));
22514 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22515 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22516 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22517 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22518}
22519
22520static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22521 const X86Subtarget &Subtarget,
22522 SelectionDAG &DAG) {
22523 MVT VT = Op->getSimpleValueType(0);
22524 SDValue In = Op->getOperand(0);
22525 MVT InVT = In.getSimpleValueType();
22526 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22527 SDLoc DL(Op);
22528 unsigned NumElts = VT.getVectorNumElements();
22529
22530 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22531 // avoids a constant pool load.
22532 if (VT.getVectorElementType() != MVT::i8) {
22533 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22534 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22535 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22536 }
22537
22538 // Extend VT if BWI is not supported.
22539 MVT ExtVT = VT;
22540 if (!Subtarget.hasBWI()) {
22541 // If v16i32 is to be avoided, we'll need to split and concatenate.
22542 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22543 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22544
22545 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22546 }
22547
22548 // Widen to 512-bits if VLX is not supported.
22549 MVT WideVT = ExtVT;
22550 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22551 NumElts *= 512 / ExtVT.getSizeInBits();
22552 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22553 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22554 In, DAG.getIntPtrConstant(0, DL));
22555 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22556 NumElts);
22557 }
22558
22559 SDValue One = DAG.getConstant(1, DL, WideVT);
22560 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22561
22562 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22563
22564 // Truncate if we had to extend above.
22565 if (VT != ExtVT) {
22566 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22567 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22568 }
22569
22570 // Extract back to 128/256-bit if we widened.
22571 if (WideVT != VT)
22572 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22573 DAG.getIntPtrConstant(0, DL));
22574
22575 return SelectedVal;
22576}
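
// Editor's sketch (scalar model, not part of this file): the non-vXi8 path above,
// which zero-extends an i1 mask lane as "sign_extend then logical shift right by
// (bits - 1)" instead of materializing a constant vector of ones.
#include <cstdint>
static uint32_t zext_mask_lane_to_i32(bool MaskBit) {
  int32_t Sext = MaskBit ? -1 : 0;             // SIGN_EXTEND i1 -> i32
  return static_cast<uint32_t>(Sext) >> 31;    // SRL by ScalarSizeInBits - 1
}
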
22577
22578static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22579 SelectionDAG &DAG) {
22580 SDValue In = Op.getOperand(0);
22581 MVT SVT = In.getSimpleValueType();
22582
22583 if (SVT.getVectorElementType() == MVT::i1)
22584 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22585
22586 assert(Subtarget.hasAVX() && "Expected AVX support");
22587 return LowerAVXExtend(Op, DAG, Subtarget);
22588}
22589
22590/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22591/// It makes use of the fact that vectors with enough leading sign/zero bits
22592/// prevent the PACKSS/PACKUS from saturating the results.
22593/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22594/// within each 128-bit lane.
22595static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22596 const SDLoc &DL, SelectionDAG &DAG,
22597 const X86Subtarget &Subtarget) {
22598 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22599 "Unexpected PACK opcode");
22600 assert(DstVT.isVector() && "VT not a vector?");
22601
22602 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22603 if (!Subtarget.hasSSE2())
22604 return SDValue();
22605
22606 EVT SrcVT = In.getValueType();
22607
22608 // No truncation required, we might get here due to recursive calls.
22609 if (SrcVT == DstVT)
22610 return In;
22611
22612 // We only support vector truncation to 64bits or greater from a
22613 // 128bits or greater source.
22614 unsigned DstSizeInBits = DstVT.getSizeInBits();
22615 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22616 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22617 return SDValue();
22618
22619 unsigned NumElems = SrcVT.getVectorNumElements();
22620 if (!isPowerOf2_32(NumElems))
22621 return SDValue();
22622
22623 LLVMContext &Ctx = *DAG.getContext();
22624 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22625 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22626
22627 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22628
22629 // Pack to the largest type possible:
22630 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22631 EVT InVT = MVT::i16, OutVT = MVT::i8;
22632 if (SrcVT.getScalarSizeInBits() > 16 &&
22633 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22634 InVT = MVT::i32;
22635 OutVT = MVT::i16;
22636 }
22637
22638 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22639 if (SrcVT.is128BitVector()) {
22640 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22641 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22642 In = DAG.getBitcast(InVT, In);
22643 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22644 Res = extractSubVector(Res, 0, DAG, DL, 64);
22645 return DAG.getBitcast(DstVT, Res);
22646 }
22647
22648 // Split lower/upper subvectors.
22649 SDValue Lo, Hi;
22650 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22651
22652 unsigned SubSizeInBits = SrcSizeInBits / 2;
22653 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22654 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22655
22656 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22657 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22658 Lo = DAG.getBitcast(InVT, Lo);
22659 Hi = DAG.getBitcast(InVT, Hi);
22660 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22661 return DAG.getBitcast(DstVT, Res);
22662 }
22663
22664 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22665 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22666 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22667 Lo = DAG.getBitcast(InVT, Lo);
22668 Hi = DAG.getBitcast(InVT, Hi);
22669 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22670
22671 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22672 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22673 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22674 SmallVector<int, 64> Mask;
22675 int Scale = 64 / OutVT.getScalarSizeInBits();
22676 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22677 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22678
22679 if (DstVT.is256BitVector())
22680 return DAG.getBitcast(DstVT, Res);
22681
22682 // If 512bit -> 128bit truncate another stage.
22683 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22684 Res = DAG.getBitcast(PackedVT, Res);
22685 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22686 }
22687
22688 // Recursively pack lower/upper subvectors, concat result and pack again.
22689 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22690 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22691 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22692 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22693
22694 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22695 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22696 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22697}
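
// Editor's sketch (intrinsics, not part of this file): why the callers above must
// guarantee enough leading sign bits. PACKSSWB saturates each i16 lane to [-128, 127],
// so it only acts as a truncation when the inputs already fit in the narrow type.
#include <emmintrin.h>
static __m128i pack_trunc_2x_v8i16(__m128i Lo, __m128i Hi) {
  // Assumes every element of Lo/Hi is already in [-128, 127]; otherwise the
  // result is saturated rather than truncated.
  return _mm_packs_epi16(Lo, Hi);              // PACKSSWB: two v8i16 -> one v16i8
}
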
22698
22699static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22700 const X86Subtarget &Subtarget) {
22701
22702 SDLoc DL(Op);
22703 MVT VT = Op.getSimpleValueType();
22704 SDValue In = Op.getOperand(0);
22705 MVT InVT = In.getSimpleValueType();
22706
22707 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22708
22709 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22710 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22711 if (InVT.getScalarSizeInBits() <= 16) {
22712 if (Subtarget.hasBWI()) {
22713 // legal, will go to VPMOVB2M, VPMOVW2M
22714 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22715 // We need to shift to get the lsb into sign position.
22716 // Shift packed bytes not supported natively, bitcast to word
22717 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22718 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22719 DAG.getBitcast(ExtVT, In),
22720 DAG.getConstant(ShiftInx, DL, ExtVT));
22721 In = DAG.getBitcast(InVT, In);
22722 }
22723 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22724 In, ISD::SETGT);
22725 }
22726 // Use TESTD/Q, extended vector to packed dword/qword.
22727 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22728 "Unexpected vector type.");
22729 unsigned NumElts = InVT.getVectorNumElements();
22730 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22731 // We need to change to a wider element type that we have support for.
22732 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22733 // For 16 element vectors we extend to v16i32 unless we are explicitly
22734 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22735 // we need to split into two 8 element vectors which we can extend to v8i32,
22736 // truncate and concat the results. There's an additional complication if
22737 // the original type is v16i8. In that case we can't split the v16i8
22738 // directly, so we need to shuffle high elements to low and use
22739 // sign_extend_vector_inreg.
22740 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22741 SDValue Lo, Hi;
22742 if (InVT == MVT::v16i8) {
22743 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22744 Hi = DAG.getVectorShuffle(
22745 InVT, DL, In, In,
22746 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22747 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22748 } else {
22749 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22750 Lo = extract128BitVector(In, 0, DAG, DL);
22751 Hi = extract128BitVector(In, 8, DAG, DL);
22752 }
22753 // We're split now, just emit two truncates and a concat. The two
22754 // truncates will trigger legalization to come back to this function.
22755 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22756 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22757 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22758 }
22759 // We either have 8 elements or we're allowed to use 512-bit vectors.
22760 // If we have VLX, we want to use the narrowest vector that can get the
22761 // job done so we use vXi32.
22762 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22763 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22764 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22765 InVT = ExtVT;
22766 ShiftInx = InVT.getScalarSizeInBits() - 1;
22767 }
22768
22769 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22770 // We need to shift to get the lsb into sign position.
22771 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22772 DAG.getConstant(ShiftInx, DL, InVT));
22773 }
22774 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22775 if (Subtarget.hasDQI())
22776 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22777 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22778}
22779
22780SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22781 SDLoc DL(Op);
22782 MVT VT = Op.getSimpleValueType();
22783 SDValue In = Op.getOperand(0);
22784 MVT InVT = In.getSimpleValueType();
22785 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22786
22787 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22788 "Invalid TRUNCATE operation");
22789
22790 // If we're called by the type legalizer, handle a few cases.
22791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22792 if (!TLI.isTypeLegal(InVT)) {
22793 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22794 VT.is128BitVector()) {
22795 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22796 "Unexpected subtarget!");
22797 // The default behavior is to truncate one step, concatenate, and then
22798 // truncate the remainder. We'd rather produce two 64-bit results and
22799 // concatenate those.
22800 SDValue Lo, Hi;
22801 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22802
22803 EVT LoVT, HiVT;
22804 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22805
22806 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22807 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22808 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22809 }
22810
22811 // Otherwise let default legalization handle it.
22812 return SDValue();
22813 }
22814
22815 if (VT.getVectorElementType() == MVT::i1)
22816 return LowerTruncateVecI1(Op, DAG, Subtarget);
22817
22818 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22819 if (Subtarget.hasAVX512()) {
22820 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22821 assert(VT == MVT::v32i8 && "Unexpected VT!");
22822 return splitVectorIntUnary(Op, DAG);
22823 }
22824
22825 // Word to byte only under BWI. Otherwise we have to promote to v16i32
22826 // and then truncate that. But we should only do that if we haven't been
22827 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22828 // handled by isel patterns.
22829 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22830 Subtarget.canExtendTo512DQ())
22831 return Op;
22832 }
22833
22834 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22835 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22836
22837 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22838 // that extend all the way to the packed/truncated value.
22839 // Pre-SSE41 we can only use PACKUSWB.
22840 KnownBits Known = DAG.computeKnownBits(In);
22841 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22842 if (SDValue V =
22843 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22844 return V;
22845
22846 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22847 // extend all the way to the packed/truncated value.
22848 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22849 if (SDValue V =
22850 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22851 return V;
22852
22853 // Handle truncation of V256 to V128 using shuffles.
22854 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22855
22856 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22857 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22858 if (Subtarget.hasInt256()) {
22859 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22860 In = DAG.getBitcast(MVT::v8i32, In);
22861 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22862 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22863 DAG.getIntPtrConstant(0, DL));
22864 }
22865
22866 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22867 DAG.getIntPtrConstant(0, DL));
22868 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22869 DAG.getIntPtrConstant(2, DL));
22870 static const int ShufMask[] = {0, 2, 4, 6};
22871 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22872 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22873 }
22874
22875 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22876 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22877 if (Subtarget.hasInt256()) {
22878 // The PSHUFB mask:
22879 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22880 -1, -1, -1, -1, -1, -1, -1, -1,
22881 16, 17, 20, 21, 24, 25, 28, 29,
22882 -1, -1, -1, -1, -1, -1, -1, -1 };
22883 In = DAG.getBitcast(MVT::v32i8, In);
22884 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22885 In = DAG.getBitcast(MVT::v4i64, In);
22886
22887 static const int ShufMask2[] = {0, 2, -1, -1};
22888 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22889 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22890 DAG.getIntPtrConstant(0, DL));
22891 return DAG.getBitcast(MVT::v8i16, In);
22892 }
22893
22894 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22895 DAG.getIntPtrConstant(0, DL));
22896 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22897 DAG.getIntPtrConstant(4, DL));
22898
22899 // The PSHUFB mask:
22900 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22901
22902 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22903 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22904
22905 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22906 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22907
22908 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22909 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22910
22911 // The MOVLHPS Mask:
22912 static const int ShufMask2[] = {0, 1, 4, 5};
22913 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22914 return DAG.getBitcast(MVT::v8i16, res);
22915 }
22916
22917 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22918 // Use an AND to zero the upper bits for PACKUS.
22919 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22920
22921 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22922 DAG.getIntPtrConstant(0, DL));
22923 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22924 DAG.getIntPtrConstant(8, DL));
22925 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22926 }
22927
22928 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22928)
;
22929}
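
// Editor's sketch (intrinsics, not part of this file): the final v16i16 -> v16i8 case
// above, masking the upper byte of each word so PACKUSWB cannot saturate.
#include <emmintrin.h>
static __m128i trunc_v16i16_to_v16i8(__m128i InLo, __m128i InHi) {
  __m128i Mask = _mm_set1_epi16(255);          // zero the upper bits for PACKUS
  InLo = _mm_and_si128(InLo, Mask);
  InHi = _mm_and_si128(InHi, Mask);
  return _mm_packus_epi16(InLo, InHi);         // PACKUSWB on the two 128-bit halves
}
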
22930
22931// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22932// behaves on out of range inputs to generate optimized conversions.
22933static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22934 SelectionDAG &DAG,
22935 const X86Subtarget &Subtarget) {
22936 MVT SrcVT = Src.getSimpleValueType();
22937 unsigned DstBits = VT.getScalarSizeInBits();
22938 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22939
22940 // Calculate the converted result for values in the range 0 to
22941 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22942 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22943 SDValue Big =
22944 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22945 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22946 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22947
22948 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22949 // and only if the value was out of range. So we can use that
22950 // as our indicator that we should use "Big" instead of "Small".
22951 //
22952 // Use "Small" if "IsOverflown" has all bits cleared
22953 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22954
22955 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22956 // use the slightly slower blendv select instead.
22957 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22958 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22959 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22960 }
22961
22962 SDValue IsOverflown =
22963 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22964 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22965 return DAG.getNode(ISD::OR, dl, VT, Small,
22966 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22967}
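
// Editor's sketch (intrinsics, not part of this file): the 128-bit form of the
// CVTTP2SI trick above. The signed conversion yields 0x80000000 for out-of-range
// inputs, so its sign bit doubles as the "use Big" mask.
#include <emmintrin.h>
static __m128i fptoui_v4f32_sse2(__m128 Src) {
  __m128 Offset = _mm_set1_ps(2147483648.0f);                  // 2^31
  __m128i Small = _mm_cvttps_epi32(Src);                       // valid for [0, 2^31)
  __m128i Big = _mm_cvttps_epi32(_mm_sub_ps(Src, Offset));     // valid for [2^31, 2^32)
  __m128i IsOverflown = _mm_srai_epi32(Small, 31);             // all ones iff Small overflowed
  return _mm_or_si128(Small, _mm_and_si128(Big, IsOverflown)); // Small or (0x80000000 | Big)
}
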
22968
22969SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22970 bool IsStrict = Op->isStrictFPOpcode();
22971 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22972 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22973 MVT VT = Op->getSimpleValueType(0);
22974 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22975 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22976 MVT SrcVT = Src.getSimpleValueType();
22977 SDLoc dl(Op);
22978
22979 SDValue Res;
22980 if (isSoftFP16(SrcVT)) {
22981 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22982 if (IsStrict)
22983 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22984 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22985 {NVT, MVT::Other}, {Chain, Src})});
22986 return DAG.getNode(Op.getOpcode(), dl, VT,
22987 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22988 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22989 return Op;
22990 }
22991
22992 if (VT.isVector()) {
22993 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22994 MVT ResVT = MVT::v4i32;
22995 MVT TruncVT = MVT::v4i1;
22996 unsigned Opc;
22997 if (IsStrict)
22998 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22999 else
23000 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23001
23002 if (!IsSigned && !Subtarget.hasVLX()) {
23003 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
23004 // Widen to 512-bits.
23005 ResVT = MVT::v8i32;
23006 TruncVT = MVT::v8i1;
23007 Opc = Op.getOpcode();
23008 // Need to concat with zero vector for strict fp to avoid spurious
23009 // exceptions.
23010 // TODO: Should we just do this for non-strict as well?
23011 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
23012 : DAG.getUNDEF(MVT::v8f64);
23013 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
23014 DAG.getIntPtrConstant(0, dl));
23015 }
23016 if (IsStrict) {
23017 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
23018 Chain = Res.getValue(1);
23019 } else {
23020 Res = DAG.getNode(Opc, dl, ResVT, Src);
23021 }
23022
23023 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
23024 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
23025 DAG.getIntPtrConstant(0, dl));
23026 if (IsStrict)
23027 return DAG.getMergeValues({Res, Chain}, dl);
23028 return Res;
23029 }
23030
23031 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23032 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23033 return Op;
23034
23035 MVT ResVT = VT;
23036 MVT EleVT = VT.getVectorElementType();
23037 if (EleVT != MVT::i64)
23038 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23039
23040 if (SrcVT != MVT::v8f16) {
23041 SDValue Tmp =
23042 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23043 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23044 Ops[0] = Src;
23045 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23046 }
23047
23048 if (IsStrict) {
23049 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23050 : X86ISD::STRICT_CVTTP2UI,
23051 dl, {ResVT, MVT::Other}, {Chain, Src});
23052 Chain = Res.getValue(1);
23053 } else {
23054 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23055 ResVT, Src);
23056 }
23057
23058 // TODO: Need to add exception check code for strict FP.
23059 if (EleVT.getSizeInBits() < 16) {
23060 ResVT = MVT::getVectorVT(EleVT, 8);
23061 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23062 }
23063
23064 if (ResVT != VT)
23065 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23066 DAG.getIntPtrConstant(0, dl));
23067
23068 if (IsStrict)
23069 return DAG.getMergeValues({Res, Chain}, dl);
23070 return Res;
23071 }
23072
23073 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23074 if (VT.getVectorElementType() == MVT::i16) {
23075 assert((SrcVT.getVectorElementType() == MVT::f32 ||
23076 SrcVT.getVectorElementType() == MVT::f64) &&
23077 "Expected f32/f64 vector!");
23078 MVT NVT = VT.changeVectorElementType(MVT::i32);
23079 if (IsStrict) {
23080 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23081 : ISD::STRICT_FP_TO_UINT,
23082 dl, {NVT, MVT::Other}, {Chain, Src});
23083 Chain = Res.getValue(1);
23084 } else {
23085 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23086 NVT, Src);
23087 }
23088
23089 // TODO: Need to add exception check code for strict FP.
23090 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23091
23092 if (IsStrict)
23093 return DAG.getMergeValues({Res, Chain}, dl);
23094 return Res;
23095 }
23096
23097 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23098 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23099 assert(!IsSigned && "Expected unsigned conversion!");
23100 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23101 return Op;
23102 }
23103
23104 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23105 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23106 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23107 Subtarget.useAVX512Regs()) {
23108 assert(!IsSigned && "Expected unsigned conversion!");
23109 assert(!Subtarget.hasVLX() && "Unexpected features!");
23110 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23111 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23112 // Need to concat with zero vector for strict fp to avoid spurious
23113 // exceptions.
23114 // TODO: Should we just do this for non-strict as well?
23115 SDValue Tmp =
23116 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23117 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23118 DAG.getIntPtrConstant(0, dl));
23119
23120 if (IsStrict) {
23121 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23122 {Chain, Src});
23123 Chain = Res.getValue(1);
23124 } else {
23125 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23126 }
23127
23128 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23129 DAG.getIntPtrConstant(0, dl));
23130
23131 if (IsStrict)
23132 return DAG.getMergeValues({Res, Chain}, dl);
23133 return Res;
23134 }
23135
23136 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23137 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23138 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23139 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23140 assert(!Subtarget.hasVLX() && "Unexpected features!");
23141 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23142 // Need to concat with zero vector for strict fp to avoid spurious
23143 // exceptions.
23144 // TODO: Should we just do this for non-strict as well?
23145 SDValue Tmp =
23146 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23147 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23148 DAG.getIntPtrConstant(0, dl));
23149
23150 if (IsStrict) {
23151 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23152 {Chain, Src});
23153 Chain = Res.getValue(1);
23154 } else {
23155 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23156 }
23157
23158 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23159 DAG.getIntPtrConstant(0, dl));
23160
23161 if (IsStrict)
23162 return DAG.getMergeValues({Res, Chain}, dl);
23163 return Res;
23164 }
23165
23166 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23167 if (!Subtarget.hasVLX()) {
23168 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
23169 // legalizer and then widened again by vector op legalization.
23170 if (!IsStrict)
23171 return SDValue();
23172
23173 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23174 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23175 {Src, Zero, Zero, Zero});
23176 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23177 {Chain, Tmp});
23178 SDValue Chain = Tmp.getValue(1);
23179 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23180 DAG.getIntPtrConstant(0, dl));
23181 return DAG.getMergeValues({Tmp, Chain}, dl);
23182 }
23183
23184 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23185 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23186 DAG.getUNDEF(MVT::v2f32));
23187 if (IsStrict) {
23188 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23189 : X86ISD::STRICT_CVTTP2UI;
23190 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23191 }
23192 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23193 return DAG.getNode(Opc, dl, VT, Tmp);
23194 }
23195
23196 // Generate optimized instructions for pre AVX512 unsigned conversions from
23197 // vXf32 to vXi32.
23198 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23199 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23200 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23201 assert(!IsSigned && "Expected unsigned conversion!");
23202 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23203 }
23204
23205 return SDValue();
23206 }
23207
23208 assert(!VT.isVector());
23209
23210 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23211
23212 if (!IsSigned && UseSSEReg) {
23213 // Conversions from f32/f64 with AVX512 should be legal.
23214 if (Subtarget.hasAVX512())
23215 return Op;
23216
23217 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23218 // behaves on out of range inputs to generate optimized conversions.
23219 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23220 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23221 unsigned DstBits = VT.getScalarSizeInBits();
23222 APInt UIntLimit = APInt::getSignMask(DstBits);
23223 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23224 DAG.getConstant(UIntLimit, dl, VT));
23225 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23226
23227 // Calculate the converted result for values in the range:
23228 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23229 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23230 SDValue Small =
23231 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23232 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23233 SDValue Big = DAG.getNode(
23234 X86ISD::CVTTS2SI, dl, VT,
23235 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23236 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23237
23238 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23239 // and only if the value was out of range. So we can use that
23240 // as our indicator that we should use "Big" instead of "Small".
23241 //
23242 // Use "Small" if "IsOverflown" has all bits cleared
23243 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23244 SDValue IsOverflown = DAG.getNode(
23245 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23246 return DAG.getNode(ISD::OR, dl, VT, Small,
23247 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23248 }
23249
23250 // Use default expansion for i64.
23251 if (VT == MVT::i64)
23252 return SDValue();
23253
23254 assert(VT == MVT::i32 && "Unexpected VT!");
23255
23256 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23257 // FIXME: This does not generate an invalid exception if the input does not
23258 // fit in i32. PR44019
23259 if (Subtarget.is64Bit()) {
23260 if (IsStrict) {
23261 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23262 {Chain, Src});
23263 Chain = Res.getValue(1);
23264 } else
23265 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23266
23267 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23268 if (IsStrict)
23269 return DAG.getMergeValues({Res, Chain}, dl);
23270 return Res;
23271 }
23272
23273 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23274 // use fisttp which will be handled later.
23275 if (!Subtarget.hasSSE3())
23276 return SDValue();
23277 }
23278
23279 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23280 // FIXME: This does not generate an invalid exception if the input does not
23281 // fit in i16. PR44019
23282 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23283 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23284 if (IsStrict) {
23285 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23286 {Chain, Src});
23287 Chain = Res.getValue(1);
23288 } else
23289 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23290
23291 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23292 if (IsStrict)
23293 return DAG.getMergeValues({Res, Chain}, dl);
23294 return Res;
23295 }
23296
23297 // If this is a FP_TO_SINT using SSEReg we're done.
23298 if (UseSSEReg && IsSigned)
23299 return Op;
23300
23301 // fp128 needs to use a libcall.
23302 if (SrcVT == MVT::f128) {
23303 RTLIB::Libcall LC;
23304 if (IsSigned)
23305 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23306 else
23307 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23308
23309 MakeLibCallOptions CallOptions;
23310 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23311 SDLoc(Op), Chain);
23312
23313 if (IsStrict)
23314 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23315
23316 return Tmp.first;
23317 }
23318
23319 // Fall back to X87.
23320 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23321 if (IsStrict)
23322 return DAG.getMergeValues({V, Chain}, dl);
23323 return V;
23324 }
23325
23326 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23326)
;
23327}
23328
23329SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23330 SelectionDAG &DAG) const {
23331 SDValue Src = Op.getOperand(0);
23332 MVT SrcVT = Src.getSimpleValueType();
23333
23334 if (SrcVT == MVT::f16)
23335 return SDValue();
23336
23337 // If the source is in an SSE register, the node is Legal.
23338 if (isScalarFPTypeInSSEReg(SrcVT))
23339 return Op;
23340
23341 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23342}
23343
23344SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23345 SelectionDAG &DAG) const {
23346 EVT DstVT = N->getValueType(0);
23347 SDValue Src = N->getOperand(0);
23348 EVT SrcVT = Src.getValueType();
23349
23350 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23351 // f16 must be promoted before using the lowering in this routine.
23352 // fp128 does not use this lowering.
23353 return SDValue();
23354 }
23355
23356 SDLoc DL(N);
23357 SDValue Chain = DAG.getEntryNode();
23358
23359 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23360
23361 // If we're converting from SSE, the stack slot needs to hold both types.
23362 // Otherwise it only needs to hold the DstVT.
23363 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23364 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23365 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23366 MachinePointerInfo MPI =
23367 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23368
23369 if (UseSSE) {
23370 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23371 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23372 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23373 SDValue Ops[] = { Chain, StackPtr };
23374
23375 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23376 /*Align*/ std::nullopt,
23377 MachineMemOperand::MOLoad);
23378 Chain = Src.getValue(1);
23379 }
23380
23381 SDValue StoreOps[] = { Chain, Src, StackPtr };
23382 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23383 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23384 MachineMemOperand::MOStore);
23385
23386 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23387}
23388
23389SDValue
23390X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23391 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23392 // but making use of X86 specifics to produce better instruction sequences.
23393 SDNode *Node = Op.getNode();
23394 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23395 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23396 SDLoc dl(SDValue(Node, 0));
23397 SDValue Src = Node->getOperand(0);
23398
23399 // There are three types involved here: SrcVT is the source floating point
23400 // type, DstVT is the type of the result, and TmpVT is the result of the
23401 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23402 // DstVT).
23403 EVT SrcVT = Src.getValueType();
23404 EVT DstVT = Node->getValueType(0);
23405 EVT TmpVT = DstVT;
23406
23407 // This code is only for floats and doubles. Fall back to generic code for
23408 // anything else.
23409 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23410 return SDValue();
23411
23412 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23413 unsigned SatWidth = SatVT.getScalarSizeInBits();
23414 unsigned DstWidth = DstVT.getScalarSizeInBits();
23415 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23416 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23417 "Expected saturation width smaller than result width");
23418
23419 // Promote result of FP_TO_*INT to at least 32 bits.
23420 if (TmpWidth < 32) {
23421 TmpVT = MVT::i32;
23422 TmpWidth = 32;
23423 }
23424
23425 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23426 // us to use a native signed conversion instead.
23427 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23428 TmpVT = MVT::i64;
23429 TmpWidth = 64;
23430 }
23431
23432 // If the saturation width is smaller than the size of the temporary result,
23433 // we can always use signed conversion, which is native.
23434 if (SatWidth < TmpWidth)
23435 FpToIntOpcode = ISD::FP_TO_SINT;
23436
23437 // Determine minimum and maximum integer values and their corresponding
23438 // floating-point values.
23439 APInt MinInt, MaxInt;
23440 if (IsSigned) {
23441 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23442 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23443 } else {
23444 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23445 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23446 }
23447
23448 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23449 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23450
23451 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23452 MinInt, IsSigned, APFloat::rmTowardZero);
23453 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23454 MaxInt, IsSigned, APFloat::rmTowardZero);
23455 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23456 && !(MaxStatus & APFloat::opStatus::opInexact);
23457
23458 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23459 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23460
23461 // If the integer bounds are exactly representable as floats, emit a
23462 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23463 if (AreExactFloatBounds) {
23464 if (DstVT != TmpVT) {
23465 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23466 SDValue MinClamped = DAG.getNode(
23467 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23468 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23469 SDValue BothClamped = DAG.getNode(
23470 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23471 // Convert clamped value to integer.
23472 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23473
23474 // NaN will become INDVAL, with the top bit set and the rest zero.
23475 // Truncation will discard the top bit, resulting in zero.
23476 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23477 }
23478
23479 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23480 SDValue MinClamped = DAG.getNode(
23481 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23482 // Clamp by MaxFloat from above. NaN cannot occur.
23483 SDValue BothClamped = DAG.getNode(
23484 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23485 // Convert clamped value to integer.
23486 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23487
23488 if (!IsSigned) {
23489 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23490 // which is zero.
23491 return FpToInt;
23492 }
23493
23494 // Otherwise, select zero if Src is NaN.
23495 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23496 return DAG.getSelectCC(
23497 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23498 }
23499
23500 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23501 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23502
23503 // Result of direct conversion, which may be selected away.
23504 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23505
23506 if (DstVT != TmpVT) {
23507 // NaN will become INDVAL, with the top bit set and the rest zero.
23508 // Truncation will discard the top bit, resulting in zero.
23509 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23510 }
23511
23512 SDValue Select = FpToInt;
23513 // For signed conversions where we saturate to the same size as the
23514 // result type of the fptoi instructions, INDVAL coincides with integer
23515 // minimum, so we don't need to explicitly check it.
23516 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23517 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23518 // MinInt if Src is NaN.
23519 Select = DAG.getSelectCC(
23520 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23521 }
23522
23523 // If Src OGT MaxFloat, select MaxInt.
23524 Select = DAG.getSelectCC(
23525 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23526
23527 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23528 // is already zero. The promoted case was already handled above.
23529 if (!IsSigned || DstVT != TmpVT) {
23530 return Select;
23531 }
23532
23533 // Otherwise, select 0 if Src is NaN.
23534 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23535 return DAG.getSelectCC(
23536 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23537}
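// A standalone illustrative sketch (not part of this file): the same
// clamp-then-convert idea for a scalar double -> int32_t saturating conversion,
// with NaN mapped to zero as in the signed path above. Both i32 bounds are
// exactly representable in double, mirroring the AreExactFloatBounds case.
// The helper name fptosi_sat_i32 is hypothetical.
#include <cmath>
#include <cstdint>
#include <limits>

static int32_t fptosi_sat_i32(double X) {
  const double MinFloat = static_cast<double>(std::numeric_limits<int32_t>::min());
  const double MaxFloat = static_cast<double>(std::numeric_limits<int32_t>::max());
  if (std::isnan(X))
    return 0;                                   // NaN -> 0
  if (X < MinFloat)
    return std::numeric_limits<int32_t>::min(); // saturate from below
  if (X > MaxFloat)
    return std::numeric_limits<int32_t>::max(); // saturate from above
  return static_cast<int32_t>(X);               // in range, converts like FP_TO_SINT
}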
23538
23539SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23540 bool IsStrict = Op->isStrictFPOpcode();
23541
23542 SDLoc DL(Op);
23543 MVT VT = Op.getSimpleValueType();
23544 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23545 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23546 MVT SVT = In.getSimpleValueType();
23547
23548 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23549 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23550 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23551 !Subtarget.getTargetTriple().isOSDarwin()))
23552 return SDValue();
23553
23554 if (SVT == MVT::f16) {
23555 if (Subtarget.hasFP16())
23556 return Op;
23557
23558 if (VT != MVT::f32) {
23559 if (IsStrict)
23560 return DAG.getNode(
23561 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23562 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23563 {MVT::f32, MVT::Other}, {Chain, In})});
23564
23565 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23566 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23567 }
23568
23569 if (!Subtarget.hasF16C()) {
23570 if (!Subtarget.getTargetTriple().isOSDarwin())
23571 return SDValue();
23572
23573       assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23574
23575 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23576 TargetLowering::CallLoweringInfo CLI(DAG);
23577 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23578
23579 In = DAG.getBitcast(MVT::i16, In);
23580 TargetLowering::ArgListTy Args;
23581 TargetLowering::ArgListEntry Entry;
23582 Entry.Node = In;
23583 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23584 Entry.IsSExt = false;
23585 Entry.IsZExt = true;
23586 Args.push_back(Entry);
23587
23588 SDValue Callee = DAG.getExternalSymbol(
23589 getLibcallName(RTLIB::FPEXT_F16_F32),
23590 getPointerTy(DAG.getDataLayout()));
23591 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23592 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23593 std::move(Args));
23594
23595 SDValue Res;
23596 std::tie(Res,Chain) = LowerCallTo(CLI);
23597 if (IsStrict)
23598 Res = DAG.getMergeValues({Res, Chain}, DL);
23599
23600 return Res;
23601 }
23602
23603 In = DAG.getBitcast(MVT::i16, In);
23604 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23605 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23606 DAG.getIntPtrConstant(0, DL));
23607 SDValue Res;
23608 if (IsStrict) {
23609 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23610 {Chain, In});
23611 Chain = Res.getValue(1);
23612 } else {
23613 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23614 DAG.getTargetConstant(4, DL, MVT::i32));
23615 }
23616 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23617 DAG.getIntPtrConstant(0, DL));
23618 if (IsStrict)
23619 return DAG.getMergeValues({Res, Chain}, DL);
23620 return Res;
23621 }
23622
23623 if (!SVT.isVector())
23624 return Op;
23625
23626 if (SVT.getVectorElementType() == MVT::f16) {
23627     assert(Subtarget.hasF16C() && "Unexpected features!");
23628 if (SVT == MVT::v2f16)
23629 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23630 DAG.getUNDEF(MVT::v2f16));
23631 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23632 DAG.getUNDEF(MVT::v4f16));
23633 if (IsStrict)
23634 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23635 {Op->getOperand(0), Res});
23636 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23637 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23638 return Op;
23639 }
23640
23641   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23642
23643 SDValue Res =
23644 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23645 if (IsStrict)
23646 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23647 {Op->getOperand(0), Res});
23648 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23649}
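// A standalone illustrative sketch (not part of this file): the scalar f16 -> f32
// path above, spelled with the F16C intrinsics -- place the half bits in lane 0,
// run VCVTPH2PS, and take lane 0 of the result. The helper name fp16_to_fp32 is
// hypothetical and assumes a host compiled with F16C support (e.g. -mf16c).
#include <immintrin.h>
#include <cstdint>

static float fp16_to_fp32(uint16_t HalfBits) {
  __m128i V = _mm_cvtsi32_si128(HalfBits); // half bits in lane 0, upper lanes zero
  __m128 F = _mm_cvtph_ps(V);              // VCVTPH2PS: low 4 halves -> 4 floats
  return _mm_cvtss_f32(F);                 // extract lane 0
}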
23650
23651SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23652 bool IsStrict = Op->isStrictFPOpcode();
23653
23654 SDLoc DL(Op);
23655 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23656 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23657 MVT VT = Op.getSimpleValueType();
23658 MVT SVT = In.getSimpleValueType();
23659
23660 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23661 return SDValue();
23662
23663 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23664 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23665 if (!Subtarget.getTargetTriple().isOSDarwin())
23666 return SDValue();
23667
23668 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23669 TargetLowering::CallLoweringInfo CLI(DAG);
23670 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23671
23672 TargetLowering::ArgListTy Args;
23673 TargetLowering::ArgListEntry Entry;
23674 Entry.Node = In;
23675 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23676 Entry.IsSExt = false;
23677 Entry.IsZExt = true;
23678 Args.push_back(Entry);
23679
23680 SDValue Callee = DAG.getExternalSymbol(
23681 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23682 : RTLIB::FPROUND_F32_F16),
23683 getPointerTy(DAG.getDataLayout()));
23684 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23685 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23686 std::move(Args));
23687
23688 SDValue Res;
23689 std::tie(Res, Chain) = LowerCallTo(CLI);
23690
23691 Res = DAG.getBitcast(MVT::f16, Res);
23692
23693 if (IsStrict)
23694 Res = DAG.getMergeValues({Res, Chain}, DL);
23695
23696 return Res;
23697 }
23698
23699 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23700 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23701 return SDValue();
23702
23703 if (VT.isVector())
23704 return Op;
23705
23706 SDValue Res;
23707 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23708 MVT::i32);
23709 if (IsStrict) {
23710 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23711 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23712 DAG.getIntPtrConstant(0, DL));
23713 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23714 {Chain, Res, Rnd});
23715 Chain = Res.getValue(1);
23716 } else {
23717 // FIXME: Should we use zeros for upper elements for non-strict?
23718 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23719 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23720 }
23721
23722 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23723 DAG.getIntPtrConstant(0, DL));
23724 Res = DAG.getBitcast(MVT::f16, Res);
23725
23726 if (IsStrict)
23727 return DAG.getMergeValues({Res, Chain}, DL);
23728
23729 return Res;
23730 }
23731
23732 return Op;
23733}
23734
23735static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23736 bool IsStrict = Op->isStrictFPOpcode();
23737 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23738   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23739          "Unexpected VT!");
23740
23741 SDLoc dl(Op);
23742 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23743 DAG.getConstant(0, dl, MVT::v8i16), Src,
23744 DAG.getIntPtrConstant(0, dl));
23745
23746 SDValue Chain;
23747 if (IsStrict) {
23748 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23749 {Op.getOperand(0), Res});
23750 Chain = Res.getValue(1);
23751 } else {
23752 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23753 }
23754
23755 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23756 DAG.getIntPtrConstant(0, dl));
23757
23758 if (IsStrict)
23759 return DAG.getMergeValues({Res, Chain}, dl);
23760
23761 return Res;
23762}
23763
23764static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23765 bool IsStrict = Op->isStrictFPOpcode();
23766 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23767   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23768          "Unexpected VT!");
23769
23770 SDLoc dl(Op);
23771 SDValue Res, Chain;
23772 if (IsStrict) {
23773 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23774 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23775 DAG.getIntPtrConstant(0, dl));
23776 Res = DAG.getNode(
23777 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23778 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23779 Chain = Res.getValue(1);
23780 } else {
23781 // FIXME: Should we use zeros for upper elements for non-strict?
23782 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23783 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23784 DAG.getTargetConstant(4, dl, MVT::i32));
23785 }
23786
23787 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23788 DAG.getIntPtrConstant(0, dl));
23789
23790 if (IsStrict)
23791 return DAG.getMergeValues({Res, Chain}, dl);
23792
23793 return Res;
23794}
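// A standalone illustrative sketch (not part of this file): the reverse direction
// used by LowerFP_TO_FP16 above, via VCVTPS2PH with the current-rounding immediate
// (the same value 4 that appears as the target constant in the code). The helper
// name fp32_to_fp16 is hypothetical and assumes F16C support.
#include <immintrin.h>
#include <cstdint>

static uint16_t fp32_to_fp16(float F) {
  __m128 V = _mm_set_ss(F);                               // value in lane 0
  __m128i H = _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION);  // 4 floats -> 4 halves
  return static_cast<uint16_t>(_mm_extract_epi16(H, 0));  // half bits from lane 0
}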
23795
23796SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23797 SelectionDAG &DAG) const {
23798 SDLoc DL(Op);
23799 MakeLibCallOptions CallOptions;
23800 RTLIB::Libcall LC =
23801 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23802 SDValue Res =
23803 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23804 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23805 DAG.getBitcast(MVT::i32, Res));
23806}
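// A standalone illustrative sketch (not part of this file): the f32 -> bf16
// rounding the FPROUND libcall above is asked to perform can be pictured as
// round-to-nearest-even on the f32 bit pattern followed by keeping the top
// 16 bits. NaN quieting is ignored here; the helper name f32_to_bf16_bits is
// hypothetical.
#include <cstdint>
#include <cstring>

static uint16_t f32_to_bf16_bits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Bias = 0x7FFFu + ((Bits >> 16) & 1u);    // ties-to-even rounding bias
  return static_cast<uint16_t>((Bits + Bias) >> 16);
}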
23807
23808/// Depending on uarch and/or optimizing for size, we might prefer to use a
23809/// vector operation in place of the typical scalar operation.
23810static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23811 const X86Subtarget &Subtarget) {
23812 // If both operands have other uses, this is probably not profitable.
23813 SDValue LHS = Op.getOperand(0);
23814 SDValue RHS = Op.getOperand(1);
23815 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23816 return Op;
23817
23818 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23819 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23820 if (IsFP && !Subtarget.hasSSE3())
23821 return Op;
23822 if (!IsFP && !Subtarget.hasSSSE3())
23823 return Op;
23824
23825 // Extract from a common vector.
23826 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23827 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23828 LHS.getOperand(0) != RHS.getOperand(0) ||
23829 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23830 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23831 !shouldUseHorizontalOp(true, DAG, Subtarget))
23832 return Op;
23833
23834 // Allow commuted 'hadd' ops.
23835 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23836 unsigned HOpcode;
23837 switch (Op.getOpcode()) {
23838 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23839 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23840 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23841 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23842 default:
23843     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23844 }
23845 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23846 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23847 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23848 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23849 std::swap(LExtIndex, RExtIndex);
23850
23851 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23852 return Op;
23853
23854 SDValue X = LHS.getOperand(0);
23855 EVT VecVT = X.getValueType();
23856 unsigned BitWidth = VecVT.getSizeInBits();
23857 unsigned NumLanes = BitWidth / 128;
23858 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23859   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23860          "Not expecting illegal vector widths here");
23861
23862 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23863 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23864 SDLoc DL(Op);
23865 if (BitWidth == 256 || BitWidth == 512) {
23866 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23867 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23868 LExtIndex %= NumEltsPerLane;
23869 }
23870
23871 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23872 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23873 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23874 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23875 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23876 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23877 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23878}
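// A standalone illustrative sketch (not part of this file): the rewrite shown in
// the comments above, spelled with intrinsics -- x[0] + x[1] of a 4 x f32 vector
// is lane 0 of HADDPS(x, x). The helper name sum_lo2 is hypothetical and assumes
// SSE3 support (e.g. -msse3).
#include <pmmintrin.h>

static float sum_lo2(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X);  // lanes: {x0+x1, x2+x3, x0+x1, x2+x3}
  return _mm_cvtss_f32(H);       // lane 0 == x[0] + x[1]
}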
23879
23880/// Depending on uarch and/or optimizing for size, we might prefer to use a
23881/// vector operation in place of the typical scalar operation.
23882SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23883   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23884          "Only expecting float/double");
23885 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23886}
23887
23888/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23889/// This mode isn't supported in hardware on X86. But as long as we aren't
23890/// compiling with trapping math, we can emulate this with
23891/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23892static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23893 SDValue N0 = Op.getOperand(0);
23894 SDLoc dl(Op);
23895 MVT VT = Op.getSimpleValueType();
23896
23897 // N0 += copysign(nextafter(0.5, 0.0), N0)
23898 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23899 bool Ignored;
23900 APFloat Point5Pred = APFloat(0.5f);
23901 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23902 Point5Pred.next(/*nextDown*/true);
23903
23904 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23905 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23906 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23907
23908 // Truncate the result to remove fraction.
23909 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23910}
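// A standalone illustrative sketch (not part of this file): the emulation
// described in the comment above, for scalar double and assuming non-trapping
// math. The helper name round_half_away_from_zero is hypothetical.
#include <cmath>

static double round_half_away_from_zero(double X) {
  double Pred = std::nextafter(0.5, 0.0);         // largest double below 0.5
  return std::trunc(X + std::copysign(Pred, X));  // trunc(X + copysign(Pred, X))
}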
23911
23912/// The only differences between FABS and FNEG are the mask and the logic op.
23913/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23914static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23915   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23916          "Wrong opcode for lowering FABS or FNEG.");
23917
23918 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23919
23920 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23921 // into an FNABS. We'll lower the FABS after that if it is still in use.
23922 if (IsFABS)
23923 for (SDNode *User : Op->uses())
23924 if (User->getOpcode() == ISD::FNEG)
23925 return Op;
23926
23927 SDLoc dl(Op);
23928 MVT VT = Op.getSimpleValueType();
23929
23930 bool IsF128 = (VT == MVT::f128);
23931   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23932          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23933          "Unexpected type in LowerFABSorFNEG");
23934
23935 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23936 // decide if we should generate a 16-byte constant mask when we only need 4 or
23937 // 8 bytes for the scalar case.
23938
23939 // There are no scalar bitwise logical SSE/AVX instructions, so we
23940 // generate a 16-byte vector constant and logic op even for the scalar case.
23941 // Using a 16-byte mask allows folding the load of the mask with
23942 // the logic op, so it can save (~4 bytes) on code size.
23943 bool IsFakeVector = !VT.isVector() && !IsF128;
23944 MVT LogicVT = VT;
23945 if (IsFakeVector)
23946 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23947 : (VT == MVT::f32) ? MVT::v4f32
23948 : MVT::v8f16;
23949
23950 unsigned EltBits = VT.getScalarSizeInBits();
23951 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23952 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23953 APInt::getSignMask(EltBits);
23954 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23955 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23956
23957 SDValue Op0 = Op.getOperand(0);
23958 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23959 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23960 IsFNABS ? X86ISD::FOR :
23961 X86ISD::FXOR;
23962 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23963
23964 if (VT.isVector() || IsF128)
23965 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23966
23967 // For the scalar case extend to a 128-bit vector, perform the logic op,
23968 // and extract the scalar result back out.
23969 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23970 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23971 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23972 DAG.getIntPtrConstant(0, dl));
23973}
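// A standalone illustrative sketch (not part of this file): the two masks used
// above, applied to a scalar f64 bit pattern -- FABS clears the sign bit with
// 0x7FF..., FNEG flips it with 0x800.... The helper names are hypothetical.
#include <cstdint>
#include <cstring>

static double fabs_bits(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B &= 0x7FFFFFFFFFFFFFFFull;      // AND with the sign-cleared mask (FAND)
  std::memcpy(&X, &B, sizeof(B));
  return X;
}

static double fneg_bits(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B ^= 0x8000000000000000ull;      // XOR with the sign mask (FXOR)
  std::memcpy(&X, &B, sizeof(B));
  return X;
}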
23974
23975static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23976 SDValue Mag = Op.getOperand(0);
23977 SDValue Sign = Op.getOperand(1);
23978 SDLoc dl(Op);
23979
23980 // If the sign operand is smaller, extend it first.
23981 MVT VT = Op.getSimpleValueType();
23982 if (Sign.getSimpleValueType().bitsLT(VT))
23983 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23984
23985 // And if it is bigger, shrink it first.
23986 if (Sign.getSimpleValueType().bitsGT(VT))
23987 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23988 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23989
23990 // At this point the operands and the result should have the same
23991 // type, and that won't be f80 since that is not custom lowered.
23992 bool IsF128 = (VT == MVT::f128);
23993   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23994          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23995          "Unexpected type in LowerFCOPYSIGN");
23996
23997 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23998
23999 // Perform all scalar logic operations as 16-byte vectors because there are no
24000 // scalar FP logic instructions in SSE.
24001 // TODO: This isn't necessary. If we used scalar types, we might avoid some
24002 // unnecessary splats, but we might miss load folding opportunities. Should
24003 // this decision be based on OptimizeForSize?
24004 bool IsFakeVector = !VT.isVector() && !IsF128;
24005 MVT LogicVT = VT;
24006 if (IsFakeVector)
24007 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24008 : (VT == MVT::f32) ? MVT::v4f32
24009 : MVT::v8f16;
24010
24011 // The mask constants are automatically splatted for vector types.
24012 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24013 SDValue SignMask = DAG.getConstantFP(
24014 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
24015 SDValue MagMask = DAG.getConstantFP(
24016 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
24017
24018 // First, clear all bits but the sign bit from the second operand (sign).
24019 if (IsFakeVector)
24020 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
24021 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
24022
24023 // Next, clear the sign bit from the first operand (magnitude).
24024 // TODO: If we had general constant folding for FP logic ops, this check
24025 // wouldn't be necessary.
24026 SDValue MagBits;
24027 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
24028 APFloat APF = Op0CN->getValueAPF();
24029 APF.clearSign();
24030 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24031 } else {
24032 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24033 if (IsFakeVector)
24034 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24035 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24036 }
24037
24038 // OR the magnitude value with the sign bit.
24039 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24040 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24041 DAG.getIntPtrConstant(0, dl));
24042}
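// A standalone illustrative sketch (not part of this file): the AND/AND/OR
// sequence above, applied to scalar f64 bit patterns. The helper name
// copysign_bits is hypothetical.
#include <cstdint>
#include <cstring>

static double copysign_bits(double Mag, double Sign) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint64_t R = (M & 0x7FFFFFFFFFFFFFFFull)   // clear the magnitude's sign bit
             | (S & 0x8000000000000000ull);  // keep only the sign operand's sign bit
  std::memcpy(&Mag, &R, sizeof(R));
  return Mag;
}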
24043
24044static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24045 SDValue N0 = Op.getOperand(0);
24046 SDLoc dl(Op);
24047 MVT VT = Op.getSimpleValueType();
24048
24049 MVT OpVT = N0.getSimpleValueType();
24050   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24051          "Unexpected type for FGETSIGN");
24052
24053 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24054 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24055 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24056 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24057 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24058 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24059 return Res;
24060}
24061
24062/// Helper for attempting to create a X86ISD::BT node.
24063static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24064 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24065 // instruction. Since the shift amount is in-range-or-undefined, we know
24066 // that doing a bittest on the i32 value is ok. We extend to i32 because
24067 // the encoding for the i16 version is larger than the i32 version.
24068 // Also promote i16 to i32 for performance / code size reason.
24069 if (Src.getValueType().getScalarSizeInBits() < 32)
24070 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24071
24072 // No legal type found, give up.
24073 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24074 return SDValue();
24075
24076 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24077 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24078 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24079 // known to be zero.
24080 if (Src.getValueType() == MVT::i64 &&
24081 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24082 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24083
24084 // If the operand types disagree, extend the shift amount to match. Since
24085 // BT ignores high bits (like shifts) we can use anyextend.
24086 if (Src.getValueType() != BitNo.getValueType()) {
24087 // Peek through a mask/modulo operation.
24088 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24089 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24090 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24091 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24092 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24093 BitNo.getOperand(0)),
24094 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24095 BitNo.getOperand(1)));
24096 else
24097 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24098 }
24099
24100 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24101}
24102
24103/// Helper for creating a X86ISD::SETCC node.
24104static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24105 SelectionDAG &DAG) {
24106 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24107 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24108}
24109
24110/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24111/// recognizable memcmp expansion.
24112static bool isOrXorXorTree(SDValue X, bool Root = true) {
24113 if (X.getOpcode() == ISD::OR)
24114 return isOrXorXorTree(X.getOperand(0), false) &&
24115 isOrXorXorTree(X.getOperand(1), false);
24116 if (Root)
24117 return false;
24118 return X.getOpcode() == ISD::XOR;
24119}
24120
24121/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24122/// expansion.
24123template <typename F>
24124static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24125 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24126 SDValue Op0 = X.getOperand(0);
24127 SDValue Op1 = X.getOperand(1);
24128 if (X.getOpcode() == ISD::OR) {
24129 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24130 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24131 if (VecVT != CmpVT)
24132 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24133 if (HasPT)
24134 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24135 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24136 }
24137 if (X.getOpcode() == ISD::XOR) {
24138 SDValue A = SToV(Op0);
24139 SDValue B = SToV(Op1);
24140 if (VecVT != CmpVT)
24141 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24142 if (HasPT)
24143 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24144 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24145 }
24146   llvm_unreachable("Impossible");
24147}
24148
24149/// Try to map a 128-bit or larger integer comparison to vector instructions
24150/// before type legalization splits it up into chunks.
24151static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24152 ISD::CondCode CC,
24153 const SDLoc &DL,
24154 SelectionDAG &DAG,
24155 const X86Subtarget &Subtarget) {
24156   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24157
24158 // We're looking for an oversized integer equality comparison.
24159 EVT OpVT = X.getValueType();
24160 unsigned OpSize = OpVT.getSizeInBits();
24161 if (!OpVT.isScalarInteger() || OpSize < 128)
24162 return SDValue();
24163
24164 // Ignore a comparison with zero because that gets special treatment in
24165 // EmitTest(). But make an exception for the special case of a pair of
24166 // logically-combined vector-sized operands compared to zero. This pattern may
24167 // be generated by the memcmp expansion pass with oversized integer compares
24168 // (see PR33325).
24169 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24170 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24171 return SDValue();
24172
24173 // Don't perform this combine if constructing the vector will be expensive.
24174 auto IsVectorBitCastCheap = [](SDValue X) {
24175 X = peekThroughBitcasts(X);
24176 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24177 X.getOpcode() == ISD::LOAD;
24178 };
24179 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24180 !IsOrXorXorTreeCCZero)
24181 return SDValue();
24182
24183 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24184 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24185 // Otherwise use PCMPEQ (plus AND) and mask testing.
24186 bool NoImplicitFloatOps =
24187 DAG.getMachineFunction().getFunction().hasFnAttribute(
24188 Attribute::NoImplicitFloat);
24189 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24190 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24191 (OpSize == 256 && Subtarget.hasAVX()) ||
24192 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24193 bool HasPT = Subtarget.hasSSE41();
24194
24195 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
24196 // vector registers are essentially free. (Technically, widening registers
24197 // prevents load folding, but the tradeoff is worth it.)
24198 bool PreferKOT = Subtarget.preferMaskRegisters();
24199 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24200
24201 EVT VecVT = MVT::v16i8;
24202 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24203 if (OpSize == 256) {
24204 VecVT = MVT::v32i8;
24205 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24206 }
24207 EVT CastVT = VecVT;
24208 bool NeedsAVX512FCast = false;
24209 if (OpSize == 512 || NeedZExt) {
24210 if (Subtarget.hasBWI()) {
24211 VecVT = MVT::v64i8;
24212 CmpVT = MVT::v64i1;
24213 if (OpSize == 512)
24214 CastVT = VecVT;
24215 } else {
24216 VecVT = MVT::v16i32;
24217 CmpVT = MVT::v16i1;
24218 CastVT = OpSize == 512 ? VecVT
24219 : OpSize == 256 ? MVT::v8i32
24220 : MVT::v4i32;
24221 NeedsAVX512FCast = true;
24222 }
24223 }
24224
24225 auto ScalarToVector = [&](SDValue X) -> SDValue {
24226 bool TmpZext = false;
24227 EVT TmpCastVT = CastVT;
24228 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24229 SDValue OrigX = X.getOperand(0);
24230 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24231 if (OrigSize < OpSize) {
24232 if (OrigSize == 128) {
24233 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24234 X = OrigX;
24235 TmpZext = true;
24236 } else if (OrigSize == 256) {
24237 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24238 X = OrigX;
24239 TmpZext = true;
24240 }
24241 }
24242 }
24243 X = DAG.getBitcast(TmpCastVT, X);
24244 if (!NeedZExt && !TmpZext)
24245 return X;
24246 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24247 DAG.getConstant(0, DL, VecVT), X,
24248 DAG.getVectorIdxConstant(0, DL));
24249 };
24250
24251 SDValue Cmp;
24252 if (IsOrXorXorTreeCCZero) {
24253 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24254 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24255 // Use 2 vector equality compares and 'and' the results before doing a
24256 // MOVMSK.
24257 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24258 } else {
24259 SDValue VecX = ScalarToVector(X);
24260 SDValue VecY = ScalarToVector(Y);
24261 if (VecVT != CmpVT) {
24262 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24263 } else if (HasPT) {
24264 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24265 } else {
24266 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24267 }
24268 }
24269 // AVX512 should emit a setcc that will lower to kortest.
24270 if (VecVT != CmpVT) {
24271 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24272 : CmpVT == MVT::v32i1 ? MVT::i32
24273 : MVT::i16;
24274 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24275 DAG.getConstant(0, DL, KRegVT), CC);
24276 }
24277 if (HasPT) {
24278 SDValue BCCmp =
24279 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24280 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24281 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24282 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24283 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24284 }
24285 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24286 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24287 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24288     assert(Cmp.getValueType() == MVT::v16i8 &&
24289            "Non 128-bit vector on pre-SSE41 target");
24290 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24291 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24292 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24293 }
24294
24295 return SDValue();
24296}
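// A standalone illustrative sketch (not part of this file): the pre-SSE4.1 tail
// of the combine above -- a 16-byte equality test becomes PCMPEQB + PMOVMSKB
// compared against 0xFFFF. The helper name equal16 is hypothetical and assumes
// SSE2.
#include <emmintrin.h>

static bool equal16(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);      // 0xFF in every byte that matches
  return _mm_movemask_epi8(Eq) == 0xFFFF;   // all 16 bytes matched
}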
24297
24298/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24299/// style scalarized (associative) reduction patterns. Partial reductions
24300/// are supported when the pointer SrcMask is non-null.
24301/// TODO - move this to SelectionDAG?
24302static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24303 SmallVectorImpl<SDValue> &SrcOps,
24304 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24305 SmallVector<SDValue, 8> Opnds;
24306 DenseMap<SDValue, APInt> SrcOpMap;
24307 EVT VT = MVT::Other;
24308
24309 // Recognize a special case where a vector is casted into wide integer to
24310 // test all 0s.
24311   assert(Op.getOpcode() == unsigned(BinOp) &&
24312          "Unexpected bit reduction opcode");
24313 Opnds.push_back(Op.getOperand(0));
24314 Opnds.push_back(Op.getOperand(1));
24315
24316 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24317 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24318 // BFS traverse all BinOp operands.
24319 if (I->getOpcode() == unsigned(BinOp)) {
24320 Opnds.push_back(I->getOperand(0));
24321 Opnds.push_back(I->getOperand(1));
24322 // Re-evaluate the number of nodes to be traversed.
24323 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24324 continue;
24325 }
24326
24327 // Quit if a non-EXTRACT_VECTOR_ELT
24328 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24329 return false;
24330
24331 // Quit if without a constant index.
24332 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24333 if (!Idx)
24334 return false;
24335
24336 SDValue Src = I->getOperand(0);
24337 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24338 if (M == SrcOpMap.end()) {
24339 VT = Src.getValueType();
24340 // Quit if not the same type.
24341 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24342 return false;
24343 unsigned NumElts = VT.getVectorNumElements();
24344 APInt EltCount = APInt::getZero(NumElts);
24345 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24346 SrcOps.push_back(Src);
24347 }
24348
24349 // Quit if element already used.
24350 unsigned CIdx = Idx->getZExtValue();
24351 if (M->second[CIdx])
24352 return false;
24353 M->second.setBit(CIdx);
24354 }
24355
24356 if (SrcMask) {
24357 // Collect the source partial masks.
24358 for (SDValue &SrcOp : SrcOps)
24359 SrcMask->push_back(SrcOpMap[SrcOp]);
24360 } else {
24361 // Quit if not all elements are used.
24362 for (const auto &I : SrcOpMap)
24363 if (!I.second.isAllOnes())
24364 return false;
24365 }
24366
24367 return true;
24368}
24369
24370// Helper function for comparing all bits of two vectors.
24371static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24372 ISD::CondCode CC, const APInt &OriginalMask,
24373 const X86Subtarget &Subtarget,
24374 SelectionDAG &DAG, X86::CondCode &X86CC) {
24375 EVT VT = LHS.getValueType();
24376 unsigned ScalarSize = VT.getScalarSizeInBits();
24377 if (OriginalMask.getBitWidth() != ScalarSize) {
24378     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24379 return SDValue();
24380 }
24381
24382   // Quit if not convertible to legal scalar or 128/256-bit vector.
24383 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24384 return SDValue();
24385
24386 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24387 if (VT.isFloatingPoint())
24388 return SDValue();
24389
24390   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24391 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24392
24393 APInt Mask = OriginalMask;
24394
24395 auto MaskBits = [&](SDValue Src) {
24396 if (Mask.isAllOnes())
24397 return Src;
24398 EVT SrcVT = Src.getValueType();
24399 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24400 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24401 };
24402
24403 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24404 if (VT.getSizeInBits() < 128) {
24405 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24406 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24407 if (IntVT != MVT::i64)
24408 return SDValue();
24409 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24410 MVT::i32, MVT::i32);
24411 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24412 MVT::i32, MVT::i32);
24413 SDValue Lo =
24414 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24415 SDValue Hi =
24416 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24417 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24418 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24419 DAG.getConstant(0, DL, MVT::i32));
24420 }
24421 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24422 DAG.getBitcast(IntVT, MaskBits(LHS)),
24423 DAG.getBitcast(IntVT, MaskBits(RHS)));
24424 }
24425
24426 // Without PTEST, a masked v2i64 or-reduction is not faster than
24427 // scalarization.
24428 bool UseKORTEST = Subtarget.useAVX512Regs();
24429 bool UsePTEST = Subtarget.hasSSE41();
24430 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24431 return SDValue();
24432
24433 // Split down to 128/256/512-bit vector.
24434 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24435
24436 // If the input vector has vector elements wider than the target test size,
24437 // then cast to <X x i64> so it will safely split.
24438 if (ScalarSize > TestSize) {
24439 if (!Mask.isAllOnes())
24440 return SDValue();
24441 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24442 LHS = DAG.getBitcast(VT, LHS);
24443 RHS = DAG.getBitcast(VT, RHS);
24444 Mask = APInt::getAllOnes(64);
24445 }
24446
24447 if (VT.getSizeInBits() > TestSize) {
24448 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24449 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24450 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24451 while (VT.getSizeInBits() > TestSize) {
24452 auto Split = DAG.SplitVector(LHS, DL);
24453 VT = Split.first.getValueType();
24454 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24455 }
24456 RHS = DAG.getAllOnesConstant(DL, VT);
24457 } else if (!UsePTEST && !KnownRHS.isZero()) {
24458 // MOVMSK Special Case:
24459 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24460 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24461 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24462 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24463 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24464 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24465 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24466 V = DAG.getSExtOrTrunc(V, DL, VT);
24467 while (VT.getSizeInBits() > TestSize) {
24468 auto Split = DAG.SplitVector(V, DL);
24469 VT = Split.first.getValueType();
24470 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24471 }
24472 V = DAG.getNOT(DL, V, VT);
24473 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24474 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24475 DAG.getConstant(0, DL, MVT::i32));
24476 } else {
24477 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24478 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24479 while (VT.getSizeInBits() > TestSize) {
24480 auto Split = DAG.SplitVector(V, DL);
24481 VT = Split.first.getValueType();
24482 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24483 }
24484 LHS = V;
24485 RHS = DAG.getConstant(0, DL, VT);
24486 }
24487 }
24488
24489 if (UseKORTEST && VT.is512BitVector()) {
24490 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24491 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24492 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24493 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24494 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24495 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24496 }
24497
24498 if (UsePTEST) {
24499 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24500 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24501 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24502 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24503 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24504 }
24505
24506   assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24507 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24508 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24509 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24510 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24511 V = DAG.getNOT(DL, V, MaskVT);
24512 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24513 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24514 DAG.getConstant(0, DL, MVT::i32));
24515}
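// A standalone illustrative sketch (not part of this file): the UsePTEST branch
// above -- XOR the operands and let PTEST set ZF when the difference is all-zero.
// The helper name equal16_ptest is hypothetical and assumes SSE4.1.
#include <smmintrin.h>

static bool equal16_ptest(__m128i A, __m128i B) {
  __m128i Diff = _mm_xor_si128(A, B);        // all-zero iff A == B
  return _mm_testz_si128(Diff, Diff) != 0;   // PTEST: ZF <- ((Diff & Diff) == 0)
}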
24516
24517 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
24518// to CMP(MOVMSK(PCMPEQB(X,Y))).
24519static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24520 ISD::CondCode CC, const SDLoc &DL,
24521 const X86Subtarget &Subtarget,
24522 SelectionDAG &DAG,
24523 X86::CondCode &X86CC) {
24524   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24525
24526 bool CmpNull = isNullConstant(RHS);
24527 bool CmpAllOnes = isAllOnesConstant(RHS);
24528 if (!CmpNull && !CmpAllOnes)
24529 return SDValue();
24530
24531 SDValue Op = LHS;
24532 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24533 return SDValue();
24534
24535 // Check whether we're masking/truncating an OR-reduction result, in which
24536 // case track the masked bits.
24537 // TODO: Add CmpAllOnes support.
24538 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24539 if (CmpNull) {
24540 switch (Op.getOpcode()) {
24541 case ISD::TRUNCATE: {
24542 SDValue Src = Op.getOperand(0);
24543 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24544 Op.getScalarValueSizeInBits());
24545 Op = Src;
24546 break;
24547 }
24548 case ISD::AND: {
24549 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24550 Mask = Cst->getAPIntValue();
24551 Op = Op.getOperand(0);
24552 }
24553 break;
24554 }
24555 }
24556 }
24557
24558 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24559
24560 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24561 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24562 SmallVector<SDValue, 8> VecIns;
24563 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24564 EVT VT = VecIns[0].getValueType();
24565     assert(llvm::all_of(VecIns,
24566                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
24567            "Reduction source vector mismatch");
24568
24569 // Quit if not splittable to scalar/128/256/512-bit vector.
24570 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24571 return SDValue();
24572
24573 // If more than one full vector is evaluated, AND/OR them first before
24574 // PTEST.
24575 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24576 Slot += 2, e += 1) {
24577 // Each iteration will AND/OR 2 nodes and append the result until there is
24578 // only 1 node left, i.e. the final value of all vectors.
24579 SDValue LHS = VecIns[Slot];
24580 SDValue RHS = VecIns[Slot + 1];
24581 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24582 }
24583
24584 return LowerVectorAllEqual(DL, VecIns.back(),
24585 CmpNull ? DAG.getConstant(0, DL, VT)
24586 : DAG.getAllOnesConstant(DL, VT),
24587 CC, Mask, Subtarget, DAG, X86CC);
24588 }
24589
24590 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24591 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24592 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24593 ISD::NodeType BinOp;
24594 if (SDValue Match =
24595 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24596 EVT MatchVT = Match.getValueType();
24597 return LowerVectorAllEqual(DL, Match,
24598 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24599 : DAG.getAllOnesConstant(DL, MatchVT),
24600 CC, Mask, Subtarget, DAG, X86CC);
24601 }
24602 }
24603
24604 if (Mask.isAllOnes()) {
24605    assert(!Op.getValueType().isVector() &&
24606           "Illegal vector type for reduction pattern");
24607 SDValue Src = peekThroughBitcasts(Op);
24608 if (Src.getValueType().isFixedLengthVector() &&
24609 Src.getValueType().getScalarType() == MVT::i1) {
24610 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24611 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24612 if (Src.getOpcode() == ISD::SETCC) {
24613 SDValue LHS = Src.getOperand(0);
24614 SDValue RHS = Src.getOperand(1);
24615 EVT LHSVT = LHS.getValueType();
24616 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24617 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24618 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24619 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24620 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24621 X86CC);
24622 }
24623 }
24624 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24625 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24626 // Peek through truncation, mask the LSB and compare against zero/LSB.
24627 if (Src.getOpcode() == ISD::TRUNCATE) {
24628 SDValue Inner = Src.getOperand(0);
24629 EVT InnerVT = Inner.getValueType();
24630 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24631 unsigned BW = InnerVT.getScalarSizeInBits();
24632 APInt SrcMask = APInt(BW, 1);
24633 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24634 return LowerVectorAllEqual(DL, Inner,
24635 DAG.getConstant(Cmp, DL, InnerVT), CC,
24636 SrcMask, Subtarget, DAG, X86CC);
24637 }
24638 }
24639 }
24640 }
24641
24642 return SDValue();
24643}
24644
24645/// Return true if \c Op has a use that doesn't just read flags.
24646static bool hasNonFlagsUse(SDValue Op) {
24647 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24648 ++UI) {
24649 SDNode *User = *UI;
24650 unsigned UOpNo = UI.getOperandNo();
24651 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24652      // Look past the truncate.
24653 UOpNo = User->use_begin().getOperandNo();
24654 User = *User->use_begin();
24655 }
24656
24657 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24658 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24659 return true;
24660 }
24661 return false;
24662}
24663
24664// Transform to an x86-specific ALU node with flags if there is a chance of
24665// using an RMW op or only the flags are used. Otherwise, leave
24666// the node alone and emit a 'cmp' or 'test' instruction.
24667static bool isProfitableToUseFlagOp(SDValue Op) {
24668 for (SDNode *U : Op->uses())
24669 if (U->getOpcode() != ISD::CopyToReg &&
24670 U->getOpcode() != ISD::SETCC &&
24671 U->getOpcode() != ISD::STORE)
24672 return false;
24673
24674 return true;
24675}
24676
24677/// Emit nodes that will be selected as "test Op0,Op0", or something
24678/// equivalent.
24679static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24680 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24681 // CF and OF aren't always set the way we want. Determine which
24682 // of these we need.
24683 bool NeedCF = false;
24684 bool NeedOF = false;
24685 switch (X86CC) {
24686 default: break;
24687 case X86::COND_A: case X86::COND_AE:
24688 case X86::COND_B: case X86::COND_BE:
24689 NeedCF = true;
24690 break;
24691 case X86::COND_G: case X86::COND_GE:
24692 case X86::COND_L: case X86::COND_LE:
24693 case X86::COND_O: case X86::COND_NO: {
24694 // Check if we really need to set the
24695 // Overflow flag. If NoSignedWrap is present
24696 // that is not actually needed.
24697 switch (Op->getOpcode()) {
24698 case ISD::ADD:
24699 case ISD::SUB:
24700 case ISD::MUL:
24701 case ISD::SHL:
24702 if (Op.getNode()->getFlags().hasNoSignedWrap())
24703 break;
24704 [[fallthrough]];
24705 default:
24706 NeedOF = true;
24707 break;
24708 }
24709 break;
24710 }
24711 }
24712 // See if we can use the EFLAGS value from the operand instead of
24713 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24714 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24715 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24716 // Emit a CMP with 0, which is the TEST pattern.
24717 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24718 DAG.getConstant(0, dl, Op.getValueType()));
24719 }
24720 unsigned Opcode = 0;
24721 unsigned NumOperands = 0;
24722
24723 SDValue ArithOp = Op;
24724
24725 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24726 // which may be the result of a CAST. We use the variable 'Op', which is the
24727 // non-casted variable when we check for possible users.
24728 switch (ArithOp.getOpcode()) {
24729 case ISD::AND:
24730 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24731 // because a TEST instruction will be better.
24732 if (!hasNonFlagsUse(Op))
24733 break;
24734
24735 [[fallthrough]];
24736 case ISD::ADD:
24737 case ISD::SUB:
24738 case ISD::OR:
24739 case ISD::XOR:
24740 if (!isProfitableToUseFlagOp(Op))
24741 break;
24742
24743 // Otherwise use a regular EFLAGS-setting instruction.
24744 switch (ArithOp.getOpcode()) {
24745    default: llvm_unreachable("unexpected operator!");
24746 case ISD::ADD: Opcode = X86ISD::ADD; break;
24747 case ISD::SUB: Opcode = X86ISD::SUB; break;
24748 case ISD::XOR: Opcode = X86ISD::XOR; break;
24749 case ISD::AND: Opcode = X86ISD::AND; break;
24750 case ISD::OR: Opcode = X86ISD::OR; break;
24751 }
24752
24753 NumOperands = 2;
24754 break;
24755 case X86ISD::ADD:
24756 case X86ISD::SUB:
24757 case X86ISD::OR:
24758 case X86ISD::XOR:
24759 case X86ISD::AND:
24760 return SDValue(Op.getNode(), 1);
24761 case ISD::SSUBO:
24762 case ISD::USUBO: {
24763    // USUBO/SSUBO will become an X86ISD::SUB, and we can use its Z flag.
24764 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24765 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24766 Op->getOperand(1)).getValue(1);
24767 }
24768 default:
24769 break;
24770 }
24771
24772 if (Opcode == 0) {
24773 // Emit a CMP with 0, which is the TEST pattern.
24774 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24775 DAG.getConstant(0, dl, Op.getValueType()));
24776 }
24777 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24778 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24779
24780 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24781 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24782 return SDValue(New.getNode(), 1);
24783}
24784
24785/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24786/// equivalent.
24787static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24788 const SDLoc &dl, SelectionDAG &DAG,
24789 const X86Subtarget &Subtarget) {
24790 if (isNullConstant(Op1))
24791 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24792
24793 EVT CmpVT = Op0.getValueType();
24794
24795  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24796          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24797
24798 // Only promote the compare up to I32 if it is a 16 bit operation
24799 // with an immediate. 16 bit immediates are to be avoided.
24800 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24801 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24802 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24803 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24804 // Don't do this if the immediate can fit in 8-bits.
24805 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24806 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24807 unsigned ExtendOp =
24808 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24809 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24810 // For equality comparisons try to use SIGN_EXTEND if the input was
24811 // truncate from something with enough sign bits.
24812 if (Op0.getOpcode() == ISD::TRUNCATE) {
24813 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24814 ExtendOp = ISD::SIGN_EXTEND;
24815 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24816 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24817 ExtendOp = ISD::SIGN_EXTEND;
24818 }
24819 }
24820
24821 CmpVT = MVT::i32;
24822 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24823 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24824 }
24825 }
24826
24827 // Try to shrink i64 compares if the input has enough zero bits.
24828 // FIXME: Do this for non-constant compares for constant on LHS?
24829 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24830 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24831 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24832 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24833 CmpVT = MVT::i32;
24834 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24835 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24836 }
24837
24838 // 0-x == y --> x+y == 0
24839 // 0-x != y --> x+y != 0
24840 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24841 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24842 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24843 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24844 return Add.getValue(1);
24845 }
24846
24847 // x == 0-y --> x+y == 0
24848 // x != 0-y --> x+y != 0
24849 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24850 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24851 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24852 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24853 return Add.getValue(1);
24854 }
24855
24856 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24857 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24858 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24859 return Sub.getValue(1);
24860}
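
// Illustrative sketch (not part of the original source): a scalar,
// two's-complement model of the "0-x == y --> x+y == 0" rewrite performed
// above. The helper name is hypothetical; it only demonstrates the identity.
static bool negCompareIdentityHolds(unsigned X, unsigned Y) {
  bool Original = (0u - X) == Y;  // compare 0-x against y
  bool Rewritten = (X + Y) == 0u; // compare x+y against 0
  return Original == Rewritten;   // holds for every X and Y (mod 2^32)
}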
24861
24862/// Check if replacement of SQRT with RSQRT should be disabled.
24863bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24864 EVT VT = Op.getValueType();
24865
24866 // We don't need to replace SQRT with RSQRT for half type.
24867 if (VT.getScalarType() == MVT::f16)
24868 return true;
24869
24870 // We never want to use both SQRT and RSQRT instructions for the same input.
24871 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24872 return false;
24873
24874 if (VT.isVector())
24875 return Subtarget.hasFastVectorFSQRT();
24876 return Subtarget.hasFastScalarFSQRT();
24877}
24878
24879/// The minimum architected relative accuracy is 2^-12. We need one
24880/// Newton-Raphson step to have a good float result (24 bits of precision).
24881SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24882 SelectionDAG &DAG, int Enabled,
24883 int &RefinementSteps,
24884 bool &UseOneConstNR,
24885 bool Reciprocal) const {
24886 SDLoc DL(Op);
24887 EVT VT = Op.getValueType();
24888
24889 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24890 // It is likely not profitable to do this for f64 because a double-precision
24891 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24892 // instructions: convert to single, rsqrtss, convert back to double, refine
24893 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24894 // along with FMA, this could be a throughput win.
24895 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24896 // after legalize types.
24897 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24898 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24899 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24900 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24901 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24902 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24903 RefinementSteps = 1;
24904
24905 UseOneConstNR = false;
24906    // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
24907 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24908 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24909 if (RefinementSteps == 0 && !Reciprocal)
24910 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24911 return Estimate;
24912 }
24913
24914 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24915 Subtarget.hasFP16()) {
24916    assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24917 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24918 RefinementSteps = 0;
24919
24920 if (VT == MVT::f16) {
24921 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24922 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24923 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24924 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24925 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24926 }
24927
24928 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24929 }
24930 return SDValue();
24931}
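
// Illustrative sketch (not part of the original source): one Newton-Raphson
// refinement step for a reciprocal square-root estimate, as referenced in the
// comment above. Starting from a ~2^-12 accurate estimate E of 1/sqrt(A), one
// step is roughly enough for full float precision. The helper name is
// hypothetical.
static float refineRsqrtOnce(float A, float E) {
  // Iteration for f(x) = 1/x^2 - A:  x1 = x0 * (1.5 - 0.5 * A * x0 * x0)
  return E * (1.5f - 0.5f * A * E * E);
}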
24932
24933/// The minimum architected relative accuracy is 2^-12. We need one
24934/// Newton-Raphson step to have a good float result (24 bits of precision).
24935SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24936 int Enabled,
24937 int &RefinementSteps) const {
24938 SDLoc DL(Op);
24939 EVT VT = Op.getValueType();
24940
24941 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24942 // It is likely not profitable to do this for f64 because a double-precision
24943 // reciprocal estimate with refinement on x86 prior to FMA requires
24944 // 15 instructions: convert to single, rcpss, convert back to double, refine
24945 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24946 // along with FMA, this could be a throughput win.
24947
24948 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24949 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24950 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24951 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24952 // Enable estimate codegen with 1 refinement step for vector division.
24953 // Scalar division estimates are disabled because they break too much
24954 // real-world code. These defaults are intended to match GCC behavior.
24955 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24956 return SDValue();
24957
24958 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24959 RefinementSteps = 1;
24960
24961    // There is no FRCP for 512-bit vectors, but there is RCP14.
24962 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24963 return DAG.getNode(Opcode, DL, VT, Op);
24964 }
24965
24966 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24967 Subtarget.hasFP16()) {
24968 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24969 RefinementSteps = 0;
24970
24971 if (VT == MVT::f16) {
24972 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24973 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24974 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24975 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24976 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24977 }
24978
24979 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24980 }
24981 return SDValue();
24982}
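
// Illustrative sketch (not part of the original source): the matching
// Newton-Raphson step for a reciprocal estimate, which is what a nonzero
// RefinementSteps value requests above. The helper name is hypothetical.
static float refineRecipOnce(float A, float E) {
  // Iteration for f(x) = 1/x - A:  x1 = x0 * (2 - A * x0)
  return E * (2.0f - A * E);
}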
24983
24984/// If we have at least two divisions that use the same divisor, convert to
24985/// multiplication by a reciprocal. This may need to be adjusted for a given
24986/// CPU if a division's cost is not at least twice the cost of a multiplication.
24987/// This is because we still need one division to calculate the reciprocal and
24988/// then we need two multiplies by that reciprocal as replacements for the
24989/// original divisions.
24990unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24991 return 2;
24992}
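
// Illustrative sketch (not part of the original source): the shape of the
// transform described above. Two divisions by the same divisor become one
// division (to form the reciprocal) plus two multiplies, which is why the
// threshold returned above is 2. Valid only under reassociation-friendly
// fast-math assumptions; the helper name is hypothetical.
static void divideTwoBySameDivisor(float X, float Y, float D,
                                   float &A, float &B) {
  // Before: A = X / D;  B = Y / D;   (two divisions)
  float Recip = 1.0f / D; // one division
  A = X * Recip;          // two multiplies replace the original divisions
  B = Y * Recip;
}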
24993
24994SDValue
24995X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24996 SelectionDAG &DAG,
24997 SmallVectorImpl<SDNode *> &Created) const {
24998 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24999 if (isIntDivCheap(N->getValueType(0), Attr))
25000 return SDValue(N,0); // Lower SDIV as SDIV
25001
25002  assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
25003         "Unexpected divisor!");
25004
25005 // Only perform this transform if CMOV is supported otherwise the select
25006 // below will become a branch.
25007 if (!Subtarget.canUseCMOV())
25008 return SDValue();
25009
25010 // fold (sdiv X, pow2)
25011 EVT VT = N->getValueType(0);
25012 // FIXME: Support i8.
25013 if (VT != MVT::i16 && VT != MVT::i32 &&
25014 !(Subtarget.is64Bit() && VT == MVT::i64))
25015 return SDValue();
25016
25017 unsigned Lg2 = Divisor.countr_zero();
25018
25019 // If the divisor is 2 or -2, the default expansion is better.
25020 if (Lg2 == 1)
25021 return SDValue();
25022
25023 SDLoc DL(N);
25024 SDValue N0 = N->getOperand(0);
25025 SDValue Zero = DAG.getConstant(0, DL, VT);
25026 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
25027 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
25028
25029 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
25030 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25031 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25032 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25033
25034 Created.push_back(Cmp.getNode());
25035 Created.push_back(Add.getNode());
25036 Created.push_back(CMov.getNode());
25037
25038 // Divide by pow2.
25039 SDValue SRA =
25040 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25041
25042 // If we're dividing by a positive value, we're done. Otherwise, we must
25043 // negate the result.
25044 if (Divisor.isNonNegative())
25045 return SRA;
25046
25047 Created.push_back(SRA.getNode());
25048 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25049}
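
// Illustrative sketch (not part of the original source): the scalar form of
// the expansion built above for a positive power-of-two divisor. Assumes '>>'
// on a negative int is an arithmetic shift (true for the x86 targets this
// file serves); the helper name is hypothetical.
static int sdivByPow2(int X, unsigned Lg2) {
  int Bias = (1 << Lg2) - 1;      // Pow2MinusOne above
  int Adj = X < 0 ? X + Bias : X; // the CMOV selects X or X+Bias
  return Adj >> Lg2;              // SRA then rounds toward zero, matching sdiv
}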
25050
25051/// Result of 'and' is compared against zero. Change to a BT node if possible.
25052/// Returns the BT node and the condition code needed to use it.
25053static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25054 SelectionDAG &DAG, X86::CondCode &X86CC) {
25055  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25056 SDValue Op0 = And.getOperand(0);
25057 SDValue Op1 = And.getOperand(1);
25058 if (Op0.getOpcode() == ISD::TRUNCATE)
25059 Op0 = Op0.getOperand(0);
25060 if (Op1.getOpcode() == ISD::TRUNCATE)
25061 Op1 = Op1.getOperand(0);
25062
25063 SDValue Src, BitNo;
25064 if (Op1.getOpcode() == ISD::SHL)
25065 std::swap(Op0, Op1);
25066 if (Op0.getOpcode() == ISD::SHL) {
25067 if (isOneConstant(Op0.getOperand(0))) {
25068 // If we looked past a truncate, check that it's only truncating away
25069 // known zeros.
25070 unsigned BitWidth = Op0.getValueSizeInBits();
25071 unsigned AndBitWidth = And.getValueSizeInBits();
25072 if (BitWidth > AndBitWidth) {
25073 KnownBits Known = DAG.computeKnownBits(Op0);
25074 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25075 return SDValue();
25076 }
25077 Src = Op1;
25078 BitNo = Op0.getOperand(1);
25079 }
25080 } else if (Op1.getOpcode() == ISD::Constant) {
25081 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25082 uint64_t AndRHSVal = AndRHS->getZExtValue();
25083 SDValue AndLHS = Op0;
25084
25085 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25086 Src = AndLHS.getOperand(0);
25087 BitNo = AndLHS.getOperand(1);
25088 } else {
25089 // Use BT if the immediate can't be encoded in a TEST instruction or we
25090      // are optimizing for size and the immediate won't fit in a byte.
25091 bool OptForSize = DAG.shouldOptForSize();
25092 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25093 isPowerOf2_64(AndRHSVal)) {
25094 Src = AndLHS;
25095 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25096 Src.getValueType());
25097 }
25098 }
25099 }
25100
25101 // No patterns found, give up.
25102 if (!Src.getNode())
25103 return SDValue();
25104
25105 // Remove any bit flip.
25106 if (isBitwiseNot(Src)) {
25107 Src = Src.getOperand(0);
25108 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25109 }
25110
25111 // Attempt to create the X86ISD::BT node.
25112 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25113 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25114 return BT;
25115 }
25116
25117 return SDValue();
25118}
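
// Illustrative sketch (not part of the original source): the scalar
// equivalences that make the BT lowering above valid; both forms test bit N
// of Src. The helper name is hypothetical.
static bool bitTestFormsAgree(unsigned Src, unsigned N) {
  bool ViaShl = (Src & (1u << N)) != 0; // and(X, shl(1, N)) != 0
  bool ViaSrl = ((Src >> N) & 1u) != 0; // and(srl(X, N), 1) != 0
  return ViaShl == ViaSrl;              // always true for N < 32
}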
25119
25120// Check if pre-AVX condcode can be performed by a single FCMP op.
25121static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25122 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25123}
25124
25125/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25126/// CMPs.
25127static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25128 SDValue &Op1, bool &IsAlwaysSignaling) {
25129 unsigned SSECC;
25130 bool Swap = false;
25131
25132 // SSE Condition code mapping:
25133 // 0 - EQ
25134 // 1 - LT
25135 // 2 - LE
25136 // 3 - UNORD
25137 // 4 - NEQ
25138 // 5 - NLT
25139 // 6 - NLE
25140 // 7 - ORD
25141 switch (SetCCOpcode) {
25142  default: llvm_unreachable("Unexpected SETCC condition");
25143 case ISD::SETOEQ:
25144 case ISD::SETEQ: SSECC = 0; break;
25145 case ISD::SETOGT:
25146 case ISD::SETGT: Swap = true; [[fallthrough]];
25147 case ISD::SETLT:
25148 case ISD::SETOLT: SSECC = 1; break;
25149 case ISD::SETOGE:
25150 case ISD::SETGE: Swap = true; [[fallthrough]];
25151 case ISD::SETLE:
25152 case ISD::SETOLE: SSECC = 2; break;
25153 case ISD::SETUO: SSECC = 3; break;
25154 case ISD::SETUNE:
25155 case ISD::SETNE: SSECC = 4; break;
25156 case ISD::SETULE: Swap = true; [[fallthrough]];
25157 case ISD::SETUGE: SSECC = 5; break;
25158 case ISD::SETULT: Swap = true; [[fallthrough]];
25159 case ISD::SETUGT: SSECC = 6; break;
25160 case ISD::SETO: SSECC = 7; break;
25161 case ISD::SETUEQ: SSECC = 8; break;
25162 case ISD::SETONE: SSECC = 12; break;
25163 }
25164 if (Swap)
25165 std::swap(Op0, Op1);
25166
25167 switch (SetCCOpcode) {
25168 default:
25169 IsAlwaysSignaling = true;
25170 break;
25171 case ISD::SETEQ:
25172 case ISD::SETOEQ:
25173 case ISD::SETUEQ:
25174 case ISD::SETNE:
25175 case ISD::SETONE:
25176 case ISD::SETUNE:
25177 case ISD::SETO:
25178 case ISD::SETUO:
25179 IsAlwaysSignaling = false;
25180 break;
25181 }
25182
25183 return SSECC;
25184}
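
// Illustrative sketch (not part of the original source): a scalar model of
// the eight base SSE immediates listed in the mapping comment above (8 and 12
// additionally select UEQ/ONE and require the AVX encodings). The helper name
// is hypothetical; NaN is detected via self-comparison.
static bool ssePredicate(unsigned Imm, float A, float B) {
  bool Unordered = (A != A) || (B != B); // true if either operand is NaN
  switch (Imm) {
  case 0: return A == B;      // EQ    (false on NaN)
  case 1: return A < B;       // LT
  case 2: return A <= B;      // LE
  case 3: return Unordered;   // UNORD
  case 4: return !(A == B);   // NEQ   (true on NaN)
  case 5: return !(A < B);    // NLT
  case 6: return !(A <= B);   // NLE
  default: return !Unordered; // ORD
  }
}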
25185
25186/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25187/// concatenate the result back.
25188static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25189 ISD::CondCode Cond, SelectionDAG &DAG,
25190 const SDLoc &dl) {
25191  assert(VT.isInteger() && VT == LHS.getValueType() &&
25192         VT == RHS.getValueType() && "Unsupported VTs!");
25193
25194 SDValue CC = DAG.getCondCode(Cond);
25195
25196 // Extract the LHS Lo/Hi vectors
25197 SDValue LHS1, LHS2;
25198 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25199
25200 // Extract the RHS Lo/Hi vectors
25201 SDValue RHS1, RHS2;
25202 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25203
25204 // Issue the operation on the smaller types and concatenate the result back
25205 EVT LoVT, HiVT;
25206 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25207 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25208 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25209 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25210}
25211
25212static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25213
25214 SDValue Op0 = Op.getOperand(0);
25215 SDValue Op1 = Op.getOperand(1);
25216 SDValue CC = Op.getOperand(2);
25217 MVT VT = Op.getSimpleValueType();
25218 SDLoc dl(Op);
25219
25220  assert(VT.getVectorElementType() == MVT::i1 &&
25221         "Cannot set masked compare for this operation");
25222
25223 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25224
25225 // Prefer SETGT over SETLT.
25226 if (SetCCOpcode == ISD::SETLT) {
25227 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25228 std::swap(Op0, Op1);
25229 }
25230
25231 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25232}
25233
25234/// Given a buildvector constant, return a new vector constant with each element
25235/// incremented or decremented. If incrementing or decrementing would result in
25236/// unsigned overflow or underflow or this is not a simple vector constant,
25237/// return an empty value.
25238static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25239 bool NSW) {
25240 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25241 if (!BV || !V.getValueType().isSimple())
25242 return SDValue();
25243
25244 MVT VT = V.getSimpleValueType();
25245 MVT EltVT = VT.getVectorElementType();
25246 unsigned NumElts = VT.getVectorNumElements();
25247 SmallVector<SDValue, 8> NewVecC;
25248 SDLoc DL(V);
25249 for (unsigned i = 0; i < NumElts; ++i) {
25250 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25251 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25252 return SDValue();
25253
25254 // Avoid overflow/underflow.
25255 const APInt &EltC = Elt->getAPIntValue();
25256 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25257 return SDValue();
25258 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25259 (!IsInc && EltC.isMinSignedValue())))
25260 return SDValue();
25261
25262 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25263 }
25264
25265 return DAG.getBuildVector(VT, DL, NewVecC);
25266}
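
// Illustrative sketch (not part of the original source): why the overflow
// checks above matter. For unsigned elements, X > C is the same predicate as
// X >= C+1 only when C+1 does not wrap, so a max-valued constant must not be
// incremented. The helper name is hypothetical.
static bool ugtViaConstantBump(unsigned X, unsigned C) {
  if (C == ~0u)
    return false;     // X > UINT_MAX can never be true; no rewrite possible
  return X >= C + 1u; // equals X > C whenever the increment does not wrap
}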
25267
25268/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25269/// Op0 u<= Op1:
25270/// t = psubus Op0, Op1
25271/// pcmpeq t, <0..0>
25272static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25273 ISD::CondCode Cond, const SDLoc &dl,
25274 const X86Subtarget &Subtarget,
25275 SelectionDAG &DAG) {
25276 if (!Subtarget.hasSSE2())
25277 return SDValue();
25278
25279 MVT VET = VT.getVectorElementType();
25280 if (VET != MVT::i8 && VET != MVT::i16)
25281 return SDValue();
25282
25283 switch (Cond) {
25284 default:
25285 return SDValue();
25286 case ISD::SETULT: {
25287 // If the comparison is against a constant we can turn this into a
25288 // setule. With psubus, setule does not require a swap. This is
25289 // beneficial because the constant in the register is no longer
25290    // clobbered as the destination, so it can be hoisted out of a loop.
25291 // Only do this pre-AVX since vpcmp* is no longer destructive.
25292 if (Subtarget.hasAVX())
25293 return SDValue();
25294 SDValue ULEOp1 =
25295 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25296 if (!ULEOp1)
25297 return SDValue();
25298 Op1 = ULEOp1;
25299 break;
25300 }
25301 case ISD::SETUGT: {
25302 // If the comparison is against a constant, we can turn this into a setuge.
25303 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25304 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25305 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25306 SDValue UGEOp1 =
25307 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25308 if (!UGEOp1)
25309 return SDValue();
25310 Op1 = Op0;
25311 Op0 = UGEOp1;
25312 break;
25313 }
25314 // Psubus is better than flip-sign because it requires no inversion.
25315 case ISD::SETUGE:
25316 std::swap(Op0, Op1);
25317 break;
25318 case ISD::SETULE:
25319 break;
25320 }
25321
25322 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25323 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25324 DAG.getConstant(0, dl, VT));
25325}
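
// Illustrative sketch (not part of the original source): the per-element
// identity behind the PSUBUS+PCMPEQ sequence above. For unsigned bytes,
// A u<= B exactly when the saturating difference A -sat B is zero. The helper
// name is hypothetical.
static bool uleViaSaturatingSub(unsigned char A, unsigned char B) {
  unsigned char Sat = A > B ? (unsigned char)(A - B) : (unsigned char)0;
  return Sat == 0; // true iff A u<= B
}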
25326
25327static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25328 SelectionDAG &DAG) {
25329 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25330 Op.getOpcode() == ISD::STRICT_FSETCCS;
25331 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25332 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25333 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25334 MVT VT = Op->getSimpleValueType(0);
25335 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25336 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25337 SDLoc dl(Op);
25338
25339 if (isFP) {
25340 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25341    assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25342 if (isSoftFP16(EltVT, Subtarget))
25343 return SDValue();
25344
25345 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25346 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25347
25348 // If we have a strict compare with a vXi1 result and the input is 128/256
25349 // bits we can't use a masked compare unless we have VLX. If we use a wider
25350 // compare like we do for non-strict, we might trigger spurious exceptions
25351    // from the upper elements. Instead emit an AVX compare and convert to a mask.
25352 unsigned Opc;
25353 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25354 (!IsStrict || Subtarget.hasVLX() ||
25355 Op0.getSimpleValueType().is512BitVector())) {
25356#ifndef NDEBUG
25357 unsigned Num = VT.getVectorNumElements();
25358      assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25359#endif
25360 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25361 } else {
25362 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25363 // The SSE/AVX packed FP comparison nodes are defined with a
25364 // floating-point vector result that matches the operand type. This allows
25365 // them to work with an SSE1 target (integer vector types are not legal).
25366 VT = Op0.getSimpleValueType();
25367 }
25368
25369 SDValue Cmp;
25370 bool IsAlwaysSignaling;
25371 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25372 if (!Subtarget.hasAVX()) {
25373 // TODO: We could use following steps to handle a quiet compare with
25374 // signaling encodings.
25375 // 1. Get ordered masks from a quiet ISD::SETO
25376 // 2. Use the masks to mask potential unordered elements in operand A, B
25377 // 3. Get the compare results of masked A, B
25378 // 4. Calculating final result using the mask and result from 3
25379 // But currently, we just fall back to scalar operations.
25380 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25381 return SDValue();
25382
25383 // Insert an extra signaling instruction to raise exception.
25384 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25385 SDValue SignalCmp = DAG.getNode(
25386 Opc, dl, {VT, MVT::Other},
25387 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25388 // FIXME: It seems we need to update the flags of all new strict nodes.
25389 // Otherwise, mayRaiseFPException in MI will return false due to
25390 // NoFPExcept = false by default. However, I didn't find it in other
25391 // patches.
25392 SignalCmp->setFlags(Op->getFlags());
25393 Chain = SignalCmp.getValue(1);
25394 }
25395
25396 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25397 // emit two comparisons and a logic op to tie them together.
25398 if (!cheapX86FSETCC_SSE(Cond)) {
25399 // LLVM predicate is SETUEQ or SETONE.
25400 unsigned CC0, CC1;
25401 unsigned CombineOpc;
25402 if (Cond == ISD::SETUEQ) {
25403 CC0 = 3; // UNORD
25404 CC1 = 0; // EQ
25405 CombineOpc = X86ISD::FOR;
25406 } else {
25407          assert(Cond == ISD::SETONE);
25408 CC0 = 7; // ORD
25409 CC1 = 4; // NEQ
25410 CombineOpc = X86ISD::FAND;
25411 }
25412
25413 SDValue Cmp0, Cmp1;
25414 if (IsStrict) {
25415 Cmp0 = DAG.getNode(
25416 Opc, dl, {VT, MVT::Other},
25417 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25418 Cmp1 = DAG.getNode(
25419 Opc, dl, {VT, MVT::Other},
25420 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25421 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25422 Cmp1.getValue(1));
25423 } else {
25424 Cmp0 = DAG.getNode(
25425 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25426 Cmp1 = DAG.getNode(
25427 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25428 }
25429 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25430 } else {
25431 if (IsStrict) {
25432 Cmp = DAG.getNode(
25433 Opc, dl, {VT, MVT::Other},
25434 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25435 Chain = Cmp.getValue(1);
25436 } else
25437 Cmp = DAG.getNode(
25438 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25439 }
25440 } else {
25441 // Handle all other FP comparisons here.
25442 if (IsStrict) {
25443 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25444 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25445 Cmp = DAG.getNode(
25446 Opc, dl, {VT, MVT::Other},
25447 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25448 Chain = Cmp.getValue(1);
25449 } else
25450 Cmp = DAG.getNode(
25451 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25452 }
25453
25454 if (VT.getFixedSizeInBits() >
25455 Op.getSimpleValueType().getFixedSizeInBits()) {
25456 // We emitted a compare with an XMM/YMM result. Finish converting to a
25457 // mask register using a vptestm.
25458 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25459 Cmp = DAG.getBitcast(CastVT, Cmp);
25460 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25461 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25462 } else {
25463 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25464 // the result type of SETCC. The bitcast is expected to be optimized
25465 // away during combining/isel.
25466 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25467 }
25468
25469 if (IsStrict)
25470 return DAG.getMergeValues({Cmp, Chain}, dl);
25471
25472 return Cmp;
25473 }
25474
25475  assert(!IsStrict && "Strict SETCC only handles FP operands.");
25476
25477 MVT VTOp0 = Op0.getSimpleValueType();
25478 (void)VTOp0;
25479  assert(VTOp0 == Op1.getSimpleValueType() &&
25480         "Expected operands with same type!");
25481  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25482         "Invalid number of packed elements for source and destination!");
25483
25484 // The non-AVX512 code below works under the assumption that source and
25485 // destination types are the same.
25486  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25487         "Value types for source and destination must be the same!");
25488
25489 // The result is boolean, but operands are int/float
25490 if (VT.getVectorElementType() == MVT::i1) {
25491 // In AVX-512 architecture setcc returns mask with i1 elements,
25492 // But there is no compare instruction for i8 and i16 elements in KNL.
25493    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25494           "Unexpected operand type");
25495 return LowerIntVSETCC_AVX512(Op, DAG);
25496 }
25497
25498 // Lower using XOP integer comparisons.
25499 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25500 // Translate compare code to XOP PCOM compare mode.
25501 unsigned CmpMode = 0;
25502 switch (Cond) {
25503    default: llvm_unreachable("Unexpected SETCC condition");
25504 case ISD::SETULT:
25505 case ISD::SETLT: CmpMode = 0x00; break;
25506 case ISD::SETULE:
25507 case ISD::SETLE: CmpMode = 0x01; break;
25508 case ISD::SETUGT:
25509 case ISD::SETGT: CmpMode = 0x02; break;
25510 case ISD::SETUGE:
25511 case ISD::SETGE: CmpMode = 0x03; break;
25512 case ISD::SETEQ: CmpMode = 0x04; break;
25513 case ISD::SETNE: CmpMode = 0x05; break;
25514 }
25515
25516 // Are we comparing unsigned or signed integers?
25517 unsigned Opc =
25518 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25519
25520 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25521 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25522 }
25523
25524 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25525 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25526 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25527 SDValue BC0 = peekThroughBitcasts(Op0);
25528 if (BC0.getOpcode() == ISD::AND) {
25529 APInt UndefElts;
25530 SmallVector<APInt, 64> EltBits;
25531 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25532 VT.getScalarSizeInBits(), UndefElts,
25533 EltBits, false, false)) {
25534 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25535 Cond = ISD::SETEQ;
25536 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25537 }
25538 }
25539 }
25540 }
25541
25542 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25543 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25544 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25545 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25546 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25547 unsigned BitWidth = VT.getScalarSizeInBits();
25548 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25549
25550 SDValue Result = Op0.getOperand(0);
25551 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25552 DAG.getConstant(ShiftAmt, dl, VT));
25553 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25554 DAG.getConstant(BitWidth - 1, dl, VT));
25555 return Result;
25556 }
25557 }
25558
25559 // Break 256-bit integer vector compare into smaller ones.
25560 if (VT.is256BitVector() && !Subtarget.hasInt256())
25561 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25562
25563 // Break 512-bit integer vector compare into smaller ones.
25564 // TODO: Try harder to use VPCMPx + VPMOV2x?
25565 if (VT.is512BitVector())
25566 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25567
25568 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25569 // not-of-PCMPEQ:
25570 // X != INT_MIN --> X >s INT_MIN
25571 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25572 // +X != 0 --> +X >s 0
25573 APInt ConstValue;
25574 if (Cond == ISD::SETNE &&
25575 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25576 if (ConstValue.isMinSignedValue())
25577 Cond = ISD::SETGT;
25578 else if (ConstValue.isMaxSignedValue())
25579 Cond = ISD::SETLT;
25580 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25581 Cond = ISD::SETGT;
25582 }
25583
25584 // If both operands are known non-negative, then an unsigned compare is the
25585 // same as a signed compare and there's no need to flip signbits.
25586 // TODO: We could check for more general simplifications here since we're
25587 // computing known bits.
25588 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25589 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25590
25591 // Special case: Use min/max operations for unsigned compares.
25592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25593 if (ISD::isUnsignedIntSetCC(Cond) &&
25594 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25595 TLI.isOperationLegal(ISD::UMIN, VT)) {
25596 // If we have a constant operand, increment/decrement it and change the
25597 // condition to avoid an invert.
25598 if (Cond == ISD::SETUGT) {
25599 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25600 if (SDValue UGTOp1 =
25601 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25602 Op1 = UGTOp1;
25603 Cond = ISD::SETUGE;
25604 }
25605 }
25606 if (Cond == ISD::SETULT) {
25607 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25608 if (SDValue ULTOp1 =
25609 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25610 Op1 = ULTOp1;
25611 Cond = ISD::SETULE;
25612 }
25613 }
25614 bool Invert = false;
25615 unsigned Opc;
25616 switch (Cond) {
25617    default: llvm_unreachable("Unexpected condition code");
25618 case ISD::SETUGT: Invert = true; [[fallthrough]];
25619 case ISD::SETULE: Opc = ISD::UMIN; break;
25620 case ISD::SETULT: Invert = true; [[fallthrough]];
25621 case ISD::SETUGE: Opc = ISD::UMAX; break;
25622 }
25623
25624 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25625 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25626
25627 // If the logical-not of the result is required, perform that now.
25628 if (Invert)
25629 Result = DAG.getNOT(dl, Result, VT);
25630
25631 return Result;
25632 }
25633
25634 // Try to use SUBUS and PCMPEQ.
25635 if (FlipSigns)
25636 if (SDValue V =
25637 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25638 return V;
25639
25640 // We are handling one of the integer comparisons here. Since SSE only has
25641 // GT and EQ comparisons for integer, swapping operands and multiple
25642 // operations may be required for some comparisons.
25643 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25644 : X86ISD::PCMPGT;
25645 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25646 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25647 bool Invert = Cond == ISD::SETNE ||
25648 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25649
25650 if (Swap)
25651 std::swap(Op0, Op1);
25652
25653 // Check that the operation in question is available (most are plain SSE2,
25654 // but PCMPGTQ and PCMPEQQ have different requirements).
25655 if (VT == MVT::v2i64) {
25656 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25657      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25658
25659 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25660 // the odd elements over the even elements.
25661 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25662 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25663 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25664
25665 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25666 static const int MaskHi[] = { 1, 1, 3, 3 };
25667 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25668
25669 return DAG.getBitcast(VT, Result);
25670 }
25671
25672 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25673 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25674 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25675
25676 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25677 static const int MaskHi[] = { 1, 1, 3, 3 };
25678 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25679
25680 return DAG.getBitcast(VT, Result);
25681 }
25682
25683 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25684 // bits of the inputs before performing those operations. The lower
25685 // compare is always unsigned.
25686 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25687 : 0x0000000080000000ULL,
25688 dl, MVT::v2i64);
25689
25690 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25691 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25692
25693 // Cast everything to the right type.
25694 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25695 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25696
25697 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25698 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25699 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25700
25701 // Create masks for only the low parts/high parts of the 64 bit integers.
25702 static const int MaskHi[] = { 1, 1, 3, 3 };
25703 static const int MaskLo[] = { 0, 0, 2, 2 };
25704 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25705 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25706 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25707
25708 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25709 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25710
25711 if (Invert)
25712 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25713
25714 return DAG.getBitcast(VT, Result);
25715 }
25716
25717 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25718 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25719 // pcmpeqd + pshufd + pand.
25720      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25721
25722 // First cast everything to the right type.
25723 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25724 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25725
25726 // Do the compare.
25727 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25728
25729 // Make sure the lower and upper halves are both all-ones.
25730 static const int Mask[] = { 1, 0, 3, 2 };
25731 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25732 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25733
25734 if (Invert)
25735 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25736
25737 return DAG.getBitcast(VT, Result);
25738 }
25739 }
25740
25741 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25742 // bits of the inputs before performing those operations.
25743 if (FlipSigns) {
25744 MVT EltVT = VT.getVectorElementType();
25745 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25746 VT);
25747 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25748 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25749 }
25750
25751 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25752
25753 // If the logical-not of the result is required, perform that now.
25754 if (Invert)
25755 Result = DAG.getNOT(dl, Result, VT);
25756
25757 return Result;
25758}
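
// Illustrative sketch (not part of the original source): the scalar form of
// the PCMPGTQ emulation used above for pre-SSE4.2 v2i64 compares,
//   (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)),
// where the high halves compare signed and the low halves compare unsigned.
// The helper name is hypothetical; '>>' on a negative value is assumed to be
// an arithmetic shift.
static bool sgt64ViaHalves(long long A, long long B) {
  int HiA = (int)(A >> 32), HiB = (int)(B >> 32); // signed high halves
  unsigned LoA = (unsigned)A, LoB = (unsigned)B;  // unsigned low halves
  return HiA > HiB || (HiA == HiB && LoA > LoB);
}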
25759
25760// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25761static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25762 const SDLoc &dl, SelectionDAG &DAG,
25763 const X86Subtarget &Subtarget,
25764 SDValue &X86CC) {
25765  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25766
25767 // Must be a bitcast from vXi1.
25768 if (Op0.getOpcode() != ISD::BITCAST)
25769 return SDValue();
25770
25771 Op0 = Op0.getOperand(0);
25772 MVT VT = Op0.getSimpleValueType();
25773 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25774 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25775 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25776 return SDValue();
25777
25778 X86::CondCode X86Cond;
25779 if (isNullConstant(Op1)) {
25780 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25781 } else if (isAllOnesConstant(Op1)) {
25782 // C flag is set for all ones.
25783 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25784 } else
25785 return SDValue();
25786
25787  // If the input is an AND, we can combine its operands into the KTEST.
25788 bool KTestable = false;
25789 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25790 KTestable = true;
25791 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25792 KTestable = true;
25793 if (!isNullConstant(Op1))
25794 KTestable = false;
25795 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25796 SDValue LHS = Op0.getOperand(0);
25797 SDValue RHS = Op0.getOperand(1);
25798 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25799 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25800 }
25801
25802  // If the input is an OR, we can combine its operands into the KORTEST.
25803 SDValue LHS = Op0;
25804 SDValue RHS = Op0;
25805 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25806 LHS = Op0.getOperand(0);
25807 RHS = Op0.getOperand(1);
25808 }
25809
25810 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25811 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25812}
25813
25814/// Emit flags for the given setcc condition and operands. Also returns the
25815/// corresponding X86 condition code constant in X86CC.
25816SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25817 ISD::CondCode CC, const SDLoc &dl,
25818 SelectionDAG &DAG,
25819 SDValue &X86CC) const {
25820 // Equality Combines.
25821 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25822 X86::CondCode X86CondCode;
25823
25824 // Optimize to BT if possible.
25825 // Lower (X & (1 << N)) == 0 to BT(X, N).
25826 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25827 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25828 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25829 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25830 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25831 return BT;
25832 }
25833 }
25834
25835     // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0.
25836 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25837 X86CondCode)) {
25838 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25839 return CmpZ;
25840 }
25841
25842 // Try to lower using KORTEST or KTEST.
25843 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25844 return Test;
25845
25846 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25847 // of these.
25848 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25849 // If the input is a setcc, then reuse the input setcc or use a new one
25850 // with the inverted condition.
25851 if (Op0.getOpcode() == X86ISD::SETCC) {
25852 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25853
25854 X86CC = Op0.getOperand(0);
25855 if (Invert) {
25856 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25857 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25858 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25859 }
25860
25861 return Op0.getOperand(1);
25862 }
25863 }
25864
25865     // Try to use the carry flag from the add in place of a separate CMP for:
25866 // (seteq (add X, -1), -1). Similar for setne.
25867 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25868 Op0.getOperand(1) == Op1) {
25869 if (isProfitableToUseFlagOp(Op0)) {
25870 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25871
25872 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25873 Op0.getOperand(1));
25874 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25875 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25876 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25877 return SDValue(New.getNode(), 1);
25878 }
25879 }
25880 }
25881
25882 X86::CondCode CondCode =
25883 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25884   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25885
25886 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25887 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25888 return EFLAGS;
25889}
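
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The BT lowering performed at the top of emitFlagsForSetcc relies on a
// simple identity: masking with (1 << N) and shifting-then-masking with 1
// both test whether bit N is set, which is exactly the value the x86 BT
// instruction leaves in CF. A minimal scalar check of that identity
// (hypothetical helper name, used only for this sketch):
#include <cassert>
#include <cstdint>

static bool bitIsSet(uint32_t X, unsigned N) {   // N < 32 assumed
  bool ViaMask  = (X & (1u << N)) != 0;          // (X & (1 << N)) != 0
  bool ViaShift = ((X >> N) & 1u) != 0;          // ((X >>u N) & 1) != 0
  assert(ViaMask == ViaShift && "both forms test the same bit");
  return ViaMask;                                // == CF after BT X, N
}
// ----------------------------------------------------------------------------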
25890
25891SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25892
25893 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25894 Op.getOpcode() == ISD::STRICT_FSETCCS;
25895 MVT VT = Op->getSimpleValueType(0);
25896
25897 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25898
25899   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25900 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25901 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25902 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25903 SDLoc dl(Op);
25904 ISD::CondCode CC =
25905 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25906
25907 if (isSoftFP16(Op0.getValueType()))
25908 return SDValue();
25909
25910 // Handle f128 first, since one possible outcome is a normal integer
25911 // comparison which gets handled by emitFlagsForSetcc.
25912 if (Op0.getValueType() == MVT::f128) {
25913 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25914 Op.getOpcode() == ISD::STRICT_FSETCCS);
25915
25916 // If softenSetCCOperands returned a scalar, use it.
25917 if (!Op1.getNode()) {
25918       assert(Op0.getValueType() == Op.getValueType() &&
25919              "Unexpected setcc expansion!");
25920 if (IsStrict)
25921 return DAG.getMergeValues({Op0, Chain}, dl);
25922 return Op0;
25923 }
25924 }
25925
25926 if (Op0.getSimpleValueType().isInteger()) {
25927 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
25928 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
25929 // this may translate to less uops depending on uarch implementation. The
25930 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25931 // canonicalize to that CondCode.
25932 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25933     // encoding size - so it must either already be an i8 or i32 immediate, or it
25934     // shrinks down to that. We don't do this for any i64s to avoid additional
25935 // constant materializations.
25936 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
25937 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25938 const APInt &Op1Val = Op1C->getAPIntValue();
25939 if (!Op1Val.isZero()) {
25940 // Ensure the constant+1 doesn't overflow.
25941 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25942 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25943 APInt Op1ValPlusOne = Op1Val + 1;
25944 if (Op1ValPlusOne.isSignedIntN(32) &&
25945 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25946 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25947 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25948 : ISD::CondCode::SETUGE;
25949 }
25950 }
25951 }
25952 }
25953
25954 SDValue X86CC;
25955 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25956 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25957 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25958 }
25959
25960 // Handle floating point.
25961 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25962 if (CondCode == X86::COND_INVALID)
25963 return SDValue();
25964
25965 SDValue EFLAGS;
25966 if (IsStrict) {
25967 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25968 EFLAGS =
25969 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25970 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25971 Chain = EFLAGS.getValue(1);
25972 } else {
25973 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25974 }
25975
25976 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25977 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25978 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25979}
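
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The canonicalization in LowerSETCC rewrites "x > c" as "x >= c + 1" (and
// "x >u c" as "x >=u c + 1") whenever c + 1 cannot overflow, because there is
// no integer strictly between c and c + 1. A scalar statement of the claim,
// with a hypothetical helper name used only for this sketch:
#include <cstdint>
#include <limits>

static bool sgtEqualsSgePlusOne(int32_t X, int32_t C) {
  if (C == std::numeric_limits<int32_t>::max())
    return true;                    // the rewrite is simply skipped here
  return (X > C) == (X >= C + 1);   // holds for every X once C + 1 is safe
}
// The code above additionally requires the new immediate to stay in the same
// i8/i32 encoding class so the rewrite never costs an extra constant.
// ----------------------------------------------------------------------------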
25980
25981SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25982 SDValue LHS = Op.getOperand(0);
25983 SDValue RHS = Op.getOperand(1);
25984 SDValue Carry = Op.getOperand(2);
25985 SDValue Cond = Op.getOperand(3);
25986 SDLoc DL(Op);
25987
25988   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25989 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25990
25991 // Recreate the carry if needed.
25992 EVT CarryVT = Carry.getValueType();
25993 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25994 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25995
25996 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25997 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25998 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25999}
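
// --- Illustrative sketch, not part of the LLVM source ----------------------
// SETCCCARRY is the "upper half" of a widened compare: the low halves have
// already been subtracted, and the borrow they produced is fed into an SBB on
// the high halves; the borrow out of that SBB decides the whole comparison.
// A plain C++ model of the borrow chain for a 64-bit unsigned "less than"
// built from 32-bit pieces (the helper name is hypothetical):
#include <cstdint>

static bool wideULT(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  unsigned BorrowLo = ALo < BLo ? 1 : 0;           // SUB lo, lo   -> CF
  // SBB hi, hi: subtract BHi plus the incoming borrow; a borrow out of this
  // step means the full 64-bit A is below the full 64-bit B.
  uint64_t HiDiff = (uint64_t)AHi - (uint64_t)BHi - BorrowLo;
  return (HiDiff >> 32) != 0;                      // borrow out  == final CF
}
// ----------------------------------------------------------------------------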
26000
26001// This function returns three things: the arithmetic computation itself
26002// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
26003// flag and the condition code define the case in which the arithmetic
26004// computation overflows.
26005static std::pair<SDValue, SDValue>
26006getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
26007   assert(Op.getResNo() == 0 && "Unexpected result number!");
26008 SDValue Value, Overflow;
26009 SDValue LHS = Op.getOperand(0);
26010 SDValue RHS = Op.getOperand(1);
26011 unsigned BaseOp = 0;
26012 SDLoc DL(Op);
26013 switch (Op.getOpcode()) {
26014   default: llvm_unreachable("Unknown ovf instruction!");
26015 case ISD::SADDO:
26016 BaseOp = X86ISD::ADD;
26017 Cond = X86::COND_O;
26018 break;
26019 case ISD::UADDO:
26020 BaseOp = X86ISD::ADD;
26021 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
26022 break;
26023 case ISD::SSUBO:
26024 BaseOp = X86ISD::SUB;
26025 Cond = X86::COND_O;
26026 break;
26027 case ISD::USUBO:
26028 BaseOp = X86ISD::SUB;
26029 Cond = X86::COND_B;
26030 break;
26031 case ISD::SMULO:
26032 BaseOp = X86ISD::SMUL;
26033 Cond = X86::COND_O;
26034 break;
26035 case ISD::UMULO:
26036 BaseOp = X86ISD::UMUL;
26037 Cond = X86::COND_O;
26038 break;
26039 }
26040
26041 if (BaseOp) {
26042 // Also sets EFLAGS.
26043 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26044 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26045 Overflow = Value.getValue(1);
26046 }
26047
26048 return std::make_pair(Value, Overflow);
26049}
26050
26051static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26052   // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
26053 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26054 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26055 // has only one use.
26056 SDLoc DL(Op);
26057 X86::CondCode Cond;
26058 SDValue Value, Overflow;
26059 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26060
26061 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26062   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26063 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26064}
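
// --- Illustrative sketch, not part of the LLVM source ----------------------
// LowerXALUO reuses the flag the ADD/SUB/MUL already computes instead of
// re-deriving the overflow test. The conditions getX86XALUOOp picks map to
// these scalar facts (unsigned wraparound is well defined in C++, so the
// sketch is free of undefined behaviour; helper names are hypothetical):
#include <cstdint>

static bool uaddOverflows(uint32_t A, uint32_t B) {
  // UADDO -> COND_B: the carry flag is set exactly when the sum wrapped,
  // i.e. when the truncated result is smaller than one of the inputs.
  return (uint32_t)(A + B) < A;
}

static bool saddOverflows(int32_t A, int32_t B) {
  // SADDO -> COND_O: signed overflow happens when both inputs share a sign
  // and the wrapped sum has the opposite sign.
  uint32_t UA = (uint32_t)A, UB = (uint32_t)B, Sum = UA + UB;
  return ((~(UA ^ UB) & (UA ^ Sum)) >> 31) != 0;
}
// Note the COND_E special case above for "x + 1": an increment wraps exactly
// when the result is zero, so ZF can stand in for the carry there.
// ----------------------------------------------------------------------------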
26065
26066/// Return true if opcode is a X86 logical comparison.
26067static bool isX86LogicalCmp(SDValue Op) {
26068 unsigned Opc = Op.getOpcode();
26069 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26070 Opc == X86ISD::FCMP)
26071 return true;
26072 if (Op.getResNo() == 1 &&
26073 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26074 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26075 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26076 return true;
26077
26078 return false;
26079}
26080
26081static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26082 if (V.getOpcode() != ISD::TRUNCATE)
26083 return false;
26084
26085 SDValue VOp0 = V.getOperand(0);
26086 unsigned InBits = VOp0.getValueSizeInBits();
26087 unsigned Bits = V.getValueSizeInBits();
26088 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26089}
26090
26091SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26092 bool AddTest = true;
26093 SDValue Cond = Op.getOperand(0);
26094 SDValue Op1 = Op.getOperand(1);
26095 SDValue Op2 = Op.getOperand(2);
26096 SDLoc DL(Op);
26097 MVT VT = Op1.getSimpleValueType();
26098 SDValue CC;
26099
26100 if (isSoftFP16(VT)) {
26101 MVT NVT = VT.changeTypeToInteger();
26102 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26103 DAG.getBitcast(NVT, Op1),
26104 DAG.getBitcast(NVT, Op2)));
26105 }
26106
26107 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26108   // are available, or into VBLENDV if AVX is available.
26109 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26110 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26111 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26112 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26113 bool IsAlwaysSignaling;
26114 unsigned SSECC =
26115 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26116 CondOp0, CondOp1, IsAlwaysSignaling);
26117
26118 if (Subtarget.hasAVX512()) {
26119 SDValue Cmp =
26120 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26121 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26122       assert(!VT.isVector() && "Not a scalar type?");
26123 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26124 }
26125
26126 if (SSECC < 8 || Subtarget.hasAVX()) {
26127 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26128 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26129
26130 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26131 // of 3 logic instructions for size savings and potentially speed.
26132 // Unfortunately, there is no scalar form of VBLENDV.
26133
26134 // If either operand is a +0.0 constant, don't try this. We can expect to
26135 // optimize away at least one of the logic instructions later in that
26136 // case, so that sequence would be faster than a variable blend.
26137
26138 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26139 // uses XMM0 as the selection register. That may need just as many
26140 // instructions as the AND/ANDN/OR sequence due to register moves, so
26141 // don't bother.
26142 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26143 !isNullFPConstant(Op2)) {
26144 // Convert to vectors, do a VSELECT, and convert back to scalar.
26145 // All of the conversions should be optimized away.
26146 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26147 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26148 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26149 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26150
26151 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26152 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26153
26154 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26155
26156 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26157 VSel, DAG.getIntPtrConstant(0, DL));
26158 }
26159 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26160 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26161 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26162 }
26163 }
26164
26165 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26166 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26167 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26168 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26169 }
26170
26171 if (Cond.getOpcode() == ISD::SETCC &&
26172 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26173 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26174 Cond = NewCond;
26175 // If the condition was updated, it's possible that the operands of the
26176 // select were also updated (for example, EmitTest has a RAUW). Refresh
26177 // the local references to the select operands in case they got stale.
26178 Op1 = Op.getOperand(1);
26179 Op2 = Op.getOperand(2);
26180 }
26181 }
26182
26183 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26184 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26185 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26186 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26187 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26188 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26189 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26190 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26191 if (Cond.getOpcode() == X86ISD::SETCC &&
26192 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26193 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26194 SDValue Cmp = Cond.getOperand(1);
26195 SDValue CmpOp0 = Cmp.getOperand(0);
26196 unsigned CondCode = Cond.getConstantOperandVal(0);
26197
26198 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26199 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
26200     // handling to keep the CMP with 0. This should be removed by
26201 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26202 // cttz_zero_undef.
26203 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26204 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26205 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26206 };
26207 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26208 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26209 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26210 // Keep Cmp.
26211 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26212 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26213 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26214 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26215
26216 // 'X - 1' sets the carry flag if X == 0.
26217 // '0 - X' sets the carry flag if X != 0.
26218 // Convert the carry flag to a -1/0 mask with sbb:
26219 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26220 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26221 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26222 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
26223 SDValue Sub;
26224 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26225 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26226 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26227 } else {
26228 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26229 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26230 }
26231 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26232 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26233 Sub.getValue(1));
26234 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26235 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26236 CmpOp0.getOpcode() == ISD::AND &&
26237 isOneConstant(CmpOp0.getOperand(1))) {
26238 SDValue Src1, Src2;
26239       // True if Op2 is an XOR or OR operator and one of its operands
26240       // equals Op1:
26241       //   (a, a op b) || (b, a op b)
26242 auto isOrXorPattern = [&]() {
26243 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26244 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26245 Src1 =
26246 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26247 Src2 = Op1;
26248 return true;
26249 }
26250 return false;
26251 };
26252
26253 if (isOrXorPattern()) {
26254 SDValue Neg;
26255 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26256         // We need a mask of all zeros or all ones with the same size as
26257         // the other operands.
26258 if (CmpSz > VT.getSizeInBits())
26259 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26260 else if (CmpSz < VT.getSizeInBits())
26261 Neg = DAG.getNode(ISD::AND, DL, VT,
26262 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26263 DAG.getConstant(1, DL, VT));
26264 else
26265 Neg = CmpOp0;
26266 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26267 Neg); // -(and (x, 0x1))
26268 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26269 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26270 }
26271 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26272 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26273 ((CondCode == X86::COND_S) || // smin(x, 0)
26274 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26275 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26276 //
26277 // If the comparison is testing for a positive value, we have to invert
26278 // the sign bit mask, so only do that transform if the target has a
26279 // bitwise 'and not' instruction (the invert is free).
26280 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26281 unsigned ShCt = VT.getSizeInBits() - 1;
26282 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26283 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26284 if (CondCode == X86::COND_G)
26285 Shift = DAG.getNOT(DL, Shift, VT);
26286 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26287 }
26288 }
26289
26290 // Look past (and (setcc_carry (cmp ...)), 1).
26291 if (Cond.getOpcode() == ISD::AND &&
26292 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26293 isOneConstant(Cond.getOperand(1)))
26294 Cond = Cond.getOperand(0);
26295
26296 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26297 // setting operand in place of the X86ISD::SETCC.
26298 unsigned CondOpcode = Cond.getOpcode();
26299 if (CondOpcode == X86ISD::SETCC ||
26300 CondOpcode == X86ISD::SETCC_CARRY) {
26301 CC = Cond.getOperand(0);
26302
26303 SDValue Cmp = Cond.getOperand(1);
26304 bool IllegalFPCMov = false;
26305 if (VT.isFloatingPoint() && !VT.isVector() &&
26306 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26307 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26308
26309 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26310 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26311 Cond = Cmp;
26312 AddTest = false;
26313 }
26314 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26315 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26316 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26317 SDValue Value;
26318 X86::CondCode X86Cond;
26319 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26320
26321 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26322 AddTest = false;
26323 }
26324
26325 if (AddTest) {
26326 // Look past the truncate if the high bits are known zero.
26327 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26328 Cond = Cond.getOperand(0);
26329
26330 // We know the result of AND is compared against zero. Try to match
26331 // it to BT.
26332 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26333 X86::CondCode X86CondCode;
26334 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26335 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26336 Cond = BT;
26337 AddTest = false;
26338 }
26339 }
26340 }
26341
26342 if (AddTest) {
26343 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26344 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26345 }
26346
26347 // a < b ? -1 : 0 -> RES = ~setcc_carry
26348 // a < b ? 0 : -1 -> RES = setcc_carry
26349 // a >= b ? -1 : 0 -> RES = setcc_carry
26350 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26351 if (Cond.getOpcode() == X86ISD::SUB) {
26352 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26353
26354 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26355 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26356 (isNullConstant(Op1) || isNullConstant(Op2))) {
26357 SDValue Res =
26358 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26359 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26360 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26361 return DAG.getNOT(DL, Res, Res.getValueType());
26362 return Res;
26363 }
26364 }
26365
26366   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
26367 // widen the cmov and push the truncate through. This avoids introducing a new
26368 // branch during isel and doesn't add any extensions.
26369 if (Op.getValueType() == MVT::i8 &&
26370 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26371 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26372 if (T1.getValueType() == T2.getValueType() &&
26373 // Exclude CopyFromReg to avoid partial register stalls.
26374 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26375 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26376 CC, Cond);
26377 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26378 }
26379 }
26380
26381 // Or finally, promote i8 cmovs if we have CMOV,
26382 // or i16 cmovs if it won't prevent folding a load.
26383 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
26384   // legal, but EmitLoweredSelect() cannot deal with these extensions
26385 // being inserted between two CMOV's. (in i16 case too TBN)
26386 // https://bugs.llvm.org/show_bug.cgi?id=40974
26387 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26388 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26389 !X86::mayFoldLoad(Op2, Subtarget))) {
26390 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26391 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26392 SDValue Ops[] = { Op2, Op1, CC, Cond };
26393 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26394 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26395 }
26396
26397 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26398 // condition is true.
26399 SDValue Ops[] = { Op2, Op1, CC, Cond };
26400 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26401}
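
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The "select (X != 0), -1, Y" family handled in LowerSELECT avoids a branch
// by turning the borrow of "X - 1" or "0 - X" into an all-ones/all-zeros mask
// (SBB) and OR-ing Y into it. The same identities in scalar C++, with
// hypothetical helper names and well-defined unsigned wraparound:
#include <cstdint>

static uint32_t selectAllOnesIfNonZero(uint32_t X, uint32_t Y) {
  uint32_t Borrow = (0u < X) ? 1u : 0u;  // CF after "0 - X": set iff X != 0
  uint32_t Mask = 0u - Borrow;           // SBB: 0xFFFFFFFF or 0
  return Mask | Y;                       // == (X != 0) ? 0xFFFFFFFF : Y
}

static uint32_t selectAllOnesIfZero(uint32_t X, uint32_t Y) {
  uint32_t Borrow = (X < 1u) ? 1u : 0u;  // CF after "X - 1": set iff X == 0
  uint32_t Mask = 0u - Borrow;
  return Mask | Y;                       // == (X == 0) ? 0xFFFFFFFF : Y
}
// ----------------------------------------------------------------------------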
26402
26403static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26404 const X86Subtarget &Subtarget,
26405 SelectionDAG &DAG) {
26406 MVT VT = Op->getSimpleValueType(0);
26407 SDValue In = Op->getOperand(0);
26408 MVT InVT = In.getSimpleValueType();
26409   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26410 MVT VTElt = VT.getVectorElementType();
26411 SDLoc dl(Op);
26412
26413 unsigned NumElts = VT.getVectorNumElements();
26414
26415 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26416 MVT ExtVT = VT;
26417 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26418 // If v16i32 is to be avoided, we'll need to split and concatenate.
26419 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26420 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26421
26422 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26423 }
26424
26425 // Widen to 512-bits if VLX is not supported.
26426 MVT WideVT = ExtVT;
26427 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26428 NumElts *= 512 / ExtVT.getSizeInBits();
26429 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26430 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26431 In, DAG.getIntPtrConstant(0, dl));
26432 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26433 }
26434
26435 SDValue V;
26436 MVT WideEltVT = WideVT.getVectorElementType();
26437 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26438 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26439 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26440 } else {
26441 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26442 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26443 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26444 }
26445
26446 // Truncate if we had to extend i16/i8 above.
26447 if (VT != ExtVT) {
26448 WideVT = MVT::getVectorVT(VTElt, NumElts);
26449 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26450 }
26451
26452 // Extract back to 128/256-bit if we widened.
26453 if (WideVT != VT)
26454 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26455 DAG.getIntPtrConstant(0, dl));
26456
26457 return V;
26458}
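
// --- Illustrative sketch, not part of the LLVM source ----------------------
// Sign-extending a mask element (i1) just replicates its single bit across
// the destination lane: true becomes all ones, false becomes zero. That is
// why the fallback above can use a per-lane select between -1 and 0. One
// scalar lane of that operation (hypothetical helper name):
#include <cstdint>

static int32_t signExtendMaskBit(bool Bit) {
  return -static_cast<int32_t>(Bit);   // 0 -> 0, 1 -> -1 (all ones)
}
// ----------------------------------------------------------------------------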
26459
26460static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26461 SelectionDAG &DAG) {
26462 SDValue In = Op->getOperand(0);
26463 MVT InVT = In.getSimpleValueType();
26464
26465 if (InVT.getVectorElementType() == MVT::i1)
26466 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26467
26468   assert(Subtarget.hasAVX() && "Expected AVX support");
26469 return LowerAVXExtend(Op, DAG, Subtarget);
26470}
26471
26472// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26473// For sign extend this needs to handle all vector sizes and SSE4.1 and
26474// non-SSE4.1 targets. For zero extend this should only handle inputs of
26475// MVT::v64i8 when BWI is not supported, but AVX512 is.
26476static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26477 const X86Subtarget &Subtarget,
26478 SelectionDAG &DAG) {
26479 SDValue In = Op->getOperand(0);
26480 MVT VT = Op->getSimpleValueType(0);
26481 MVT InVT = In.getSimpleValueType();
26482
26483 MVT SVT = VT.getVectorElementType();
26484 MVT InSVT = InVT.getVectorElementType();
26485   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26486
26487 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26488 return SDValue();
26489 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26490 return SDValue();
26491 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26492 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26493 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26494 return SDValue();
26495
26496 SDLoc dl(Op);
26497 unsigned Opc = Op.getOpcode();
26498 unsigned NumElts = VT.getVectorNumElements();
26499
26500 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26501 // For 512-bit vectors, we need 128-bits or 256-bits.
26502 if (InVT.getSizeInBits() > 128) {
26503 // Input needs to be at least the same number of elements as output, and
26504 // at least 128-bits.
26505 int InSize = InSVT.getSizeInBits() * NumElts;
26506 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26507 InVT = In.getSimpleValueType();
26508 }
26509
26510 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26511   // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26512 // need to be handled here for 256/512-bit results.
26513 if (Subtarget.hasInt256()) {
26514     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26515
26516 if (InVT.getVectorNumElements() != NumElts)
26517 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26518
26519 // FIXME: Apparently we create inreg operations that could be regular
26520 // extends.
26521 unsigned ExtOpc =
26522 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26523 : ISD::ZERO_EXTEND;
26524 return DAG.getNode(ExtOpc, dl, VT, In);
26525 }
26526
26527 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26528 if (Subtarget.hasAVX()) {
26529     assert(VT.is256BitVector() && "256-bit vector expected");
26530 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26531 int HalfNumElts = HalfVT.getVectorNumElements();
26532
26533 unsigned NumSrcElts = InVT.getVectorNumElements();
26534 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26535 for (int i = 0; i != HalfNumElts; ++i)
26536 HiMask[i] = HalfNumElts + i;
26537
26538 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26539 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26540 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26541 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26542 }
26543
26544 // We should only get here for sign extend.
26545   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26546   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26547
26548 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26549 SDValue Curr = In;
26550 SDValue SignExt = Curr;
26551
26552 // As SRAI is only available on i16/i32 types, we expand only up to i32
26553 // and handle i64 separately.
26554 if (InVT != MVT::v4i32) {
26555 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26556
26557 unsigned DestWidth = DestVT.getScalarSizeInBits();
26558 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26559
26560 unsigned InNumElts = InVT.getVectorNumElements();
26561 unsigned DestElts = DestVT.getVectorNumElements();
26562
26563 // Build a shuffle mask that takes each input element and places it in the
26564 // MSBs of the new element size.
26565 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26566 for (unsigned i = 0; i != DestElts; ++i)
26567 Mask[i * Scale + (Scale - 1)] = i;
26568
26569 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26570 Curr = DAG.getBitcast(DestVT, Curr);
26571
26572 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26573 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26574 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26575 }
26576
26577 if (VT == MVT::v2i64) {
26578     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26579 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26580 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26581 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26582 SignExt = DAG.getBitcast(VT, SignExt);
26583 }
26584
26585 return SignExt;
26586}
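
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The pre-SSE4.1 path above sign extends by shuffling each narrow element
// into the most significant bits of the wider lane and then shifting right
// arithmetically by the size difference. One scalar lane of the i8 -> i32
// case (relying on '>>' of int32_t being an arithmetic shift, which C++20
// guarantees and the targets this code supports provide; the helper name is
// hypothetical):
#include <cstdint>

static int32_t signExtendI8ViaShift(uint8_t Byte) {
  uint32_t Widened = (uint32_t)Byte << 24;   // element placed in the MSBs
  return (int32_t)Widened >> 24;             // VSRAI: copies the sign bit down
}
// signExtendI8ViaShift(0x80) == -128 and signExtendI8ViaShift(0x7F) == 127,
// matching a direct int8_t -> int32_t conversion.
// ----------------------------------------------------------------------------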
26587
26588static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26589 SelectionDAG &DAG) {
26590 MVT VT = Op->getSimpleValueType(0);
26591 SDValue In = Op->getOperand(0);
26592 MVT InVT = In.getSimpleValueType();
26593 SDLoc dl(Op);
26594
26595 if (InVT.getVectorElementType() == MVT::i1)
26596 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26597
26598   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26599   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26600          "Expected same number of elements");
26601   assert((VT.getVectorElementType() == MVT::i16 ||
26602           VT.getVectorElementType() == MVT::i32 ||
26603           VT.getVectorElementType() == MVT::i64) &&
26604          "Unexpected element type");
26605   assert((InVT.getVectorElementType() == MVT::i8 ||
26606           InVT.getVectorElementType() == MVT::i16 ||
26607           InVT.getVectorElementType() == MVT::i32) &&
26608          "Unexpected element type");
26609
26610 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26611     assert(InVT == MVT::v32i8 && "Unexpected VT!");
26612 return splitVectorIntUnary(Op, DAG);
26613 }
26614
26615 if (Subtarget.hasInt256())
26616 return Op;
26617
26618   // Optimize vectors in AVX mode:
26619   // sign extend v8i16 to v8i32 and
26620   // v4i32 to v4i64.
26621   //
26622   // Divide the input vector into two parts;
26623   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26624   // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26625   // then concat the vectors back to the original VT.
26626 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26627 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26628
26629 unsigned NumElems = InVT.getVectorNumElements();
26630 SmallVector<int,8> ShufMask(NumElems, -1);
26631 for (unsigned i = 0; i != NumElems/2; ++i)
26632 ShufMask[i] = i + NumElems/2;
26633
26634 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26635 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26636
26637 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26638}
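
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The split in LowerSIGN_EXTEND builds a shuffle mask that moves the high
// half of the input into the low positions and leaves the rest undef (-1),
// e.g. {2, 3, -1, -1} for v4i32 as the comment above notes. The same mask
// construction in plain C++ (hypothetical helper name):
#include <vector>

static std::vector<int> highHalfMask(unsigned NumElems) {
  std::vector<int> Mask(NumElems, -1);      // -1 plays the role of "undef"
  for (unsigned I = 0; I != NumElems / 2; ++I)
    Mask[I] = (int)(I + NumElems / 2);
  return Mask;                              // NumElems == 4 -> {2, 3, -1, -1}
}
// ----------------------------------------------------------------------------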
26639
26640/// Change a vector store into a pair of half-size vector stores.
26641static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26642 SDValue StoredVal = Store->getValue();
26643   assert((StoredVal.getValueType().is256BitVector() ||
26644           StoredVal.getValueType().is512BitVector()) &&
26645          "Expecting 256/512-bit op");
26646
26647 // Splitting volatile memory ops is not allowed unless the operation was not
26648 // legal to begin with. Assume the input store is legal (this transform is
26649 // only used for targets with AVX). Note: It is possible that we have an
26650 // illegal type like v2i128, and so we could allow splitting a volatile store
26651 // in that case if that is important.
26652 if (!Store->isSimple())
26653 return SDValue();
26654
26655 SDLoc DL(Store);
26656 SDValue Value0, Value1;
26657 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26658 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26659 SDValue Ptr0 = Store->getBasePtr();
26660 SDValue Ptr1 =
26661 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26662 SDValue Ch0 =
26663 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26664 Store->getOriginalAlign(),
26665 Store->getMemOperand()->getFlags());
26666 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26667 Store->getPointerInfo().getWithOffset(HalfOffset),
26668 Store->getOriginalAlign(),
26669 Store->getMemOperand()->getFlags());
26670 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26671}
26672
26673/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26674/// type.
26675static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26676 SelectionDAG &DAG) {
26677 SDValue StoredVal = Store->getValue();
26678   assert(StoreVT.is128BitVector() &&
26679          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26680 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26681
26682 // Splitting volatile memory ops is not allowed unless the operation was not
26683 // legal to begin with. We are assuming the input op is legal (this transform
26684 // is only used for targets with AVX).
26685 if (!Store->isSimple())
26686 return SDValue();
26687
26688 MVT StoreSVT = StoreVT.getScalarType();
26689 unsigned NumElems = StoreVT.getVectorNumElements();
26690 unsigned ScalarSize = StoreSVT.getStoreSize();
26691
26692 SDLoc DL(Store);
26693 SmallVector<SDValue, 4> Stores;
26694 for (unsigned i = 0; i != NumElems; ++i) {
26695 unsigned Offset = i * ScalarSize;
26696 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26697 TypeSize::Fixed(Offset), DL);
26698 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26699 DAG.getIntPtrConstant(i, DL));
26700 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26701 Store->getPointerInfo().getWithOffset(Offset),
26702 Store->getOriginalAlign(),
26703 Store->getMemOperand()->getFlags());
26704 Stores.push_back(Ch);
26705 }
26706 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26707}
26708
26709static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26710 SelectionDAG &DAG) {
26711 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26712 SDLoc dl(St);
26713 SDValue StoredVal = St->getValue();
26714
26715 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26716 if (StoredVal.getValueType().isVector() &&
26717 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26718 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26719     assert(NumElts <= 8 && "Unexpected VT");
26720     assert(!St->isTruncatingStore() && "Expected non-truncating store");
26721     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26722            "Expected AVX512F without AVX512DQI");
26723
26724 // We must pad with zeros to ensure we store zeroes to any unused bits.
26725 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26726 DAG.getUNDEF(MVT::v16i1), StoredVal,
26727 DAG.getIntPtrConstant(0, dl));
26728 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26729 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26730 // Make sure we store zeros in the extra bits.
26731 if (NumElts < 8)
26732 StoredVal = DAG.getZeroExtendInReg(
26733 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26734
26735 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26736 St->getPointerInfo(), St->getOriginalAlign(),
26737 St->getMemOperand()->getFlags());
26738 }
26739
26740 if (St->isTruncatingStore())
26741 return SDValue();
26742
26743 // If this is a 256-bit store of concatenated ops, we are better off splitting
26744 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26745 // and each half can execute independently. Some cores would split the op into
26746 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26747 MVT StoreVT = StoredVal.getSimpleValueType();
26748 if (StoreVT.is256BitVector() ||
26749 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26750 !Subtarget.hasBWI())) {
26751 SmallVector<SDValue, 4> CatOps;
26752 if (StoredVal.hasOneUse() &&
26753 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26754 return splitVectorStore(St, DAG);
26755 return SDValue();
26756 }
26757
26758 if (StoreVT.is32BitVector())
26759 return SDValue();
26760
26761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26762   assert(StoreVT.is64BitVector() && "Unexpected VT");
26763   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26764              TargetLowering::TypeWidenVector &&
26765          "Unexpected type action!");
26766
26767 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26768 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26769 DAG.getUNDEF(StoreVT));
26770
26771 if (Subtarget.hasSSE2()) {
26772 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26773 // and store it.
26774 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26775 MVT CastVT = MVT::getVectorVT(StVT, 2);
26776 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26777 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26778 DAG.getIntPtrConstant(0, dl));
26779
26780 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26781 St->getPointerInfo(), St->getOriginalAlign(),
26782 St->getMemOperand()->getFlags());
26783 }
26784   assert(Subtarget.hasSSE1() && "Expected SSE");
26785 SDVTList Tys = DAG.getVTList(MVT::Other);
26786 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26787 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26788 St->getMemOperand());
26789}
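
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The v2i1/v4i1/v8i1 path in LowerStore widens the mask, truncates it to a
// byte and explicitly zeroes the bits beyond NumElts, so the stored byte
// never carries stale data in its unused positions. The same packing in
// scalar C++ (NumElts <= 8 assumed, as the code asserts; the helper name is
// hypothetical):
#include <cstdint>

static uint8_t packMaskBits(const bool *Bits, unsigned NumElts) {
  uint8_t Packed = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    Packed |= (uint8_t)(Bits[I] ? 1u : 0u) << I;   // element I -> bit I
  return Packed;   // bits NumElts..7 stay zero, mirroring getZeroExtendInReg
}
// ----------------------------------------------------------------------------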
26790
26791// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26792// may emit an illegal shuffle but the expansion is still better than scalar
26793// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26794// we'll emit a shuffle and a arithmetic shift.
26795// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26796// TODO: It is possible to support ZExt by zeroing the undef values during
26797// the shuffle phase or after the shuffle.
26798static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26799 SelectionDAG &DAG) {
26800 MVT RegVT = Op.getSimpleValueType();
26801   assert(RegVT.isVector() && "We only custom lower vector loads.");
26802   assert(RegVT.isInteger() &&
26803          "We only custom lower integer vector loads.");
26804
26805 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26806 SDLoc dl(Ld);
26807
26808 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26809 if (RegVT.getVectorElementType() == MVT::i1) {
26810     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26811     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26812     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26813            "Expected AVX512F without AVX512DQI");
26814
26815 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26816 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26817 Ld->getMemOperand()->getFlags());
26818
26819 // Replace chain users with the new chain.
26820 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26821
26822 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26823 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26824 DAG.getBitcast(MVT::v16i1, Val),
26825 DAG.getIntPtrConstant(0, dl));
26826 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26827 }
26828
26829 return SDValue();
26830}
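For intuition, here is a scalar model of the vXi1 path above, assuming the usual AVX-512 mask layout in which bit i of the loaded byte corresponds to element i (a sketch for illustration, not code from this file):

    #include <array>
    #include <cstdint>

    // Mirrors the i8 load + any_extend + bitcast-to-v16i1 + low-subvector extract.
    std::array<bool, 8> loadV8i1(const uint8_t *Ptr) {
      uint8_t Bits = *Ptr;             // the MVT::i8 scalar load
      std::array<bool, 8> Lanes{};
      for (int I = 0; I < 8; ++I)
        Lanes[I] = (Bits >> I) & 1;    // lane I comes from bit I of the byte
      return Lanes;
    }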
26831
26832/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26833/// each of which has no other use apart from the AND / OR.
26834static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26835 Opc = Op.getOpcode();
26836 if (Opc != ISD::OR && Opc != ISD::AND)
26837 return false;
26838 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26839 Op.getOperand(0).hasOneUse() &&
26840 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26841 Op.getOperand(1).hasOneUse());
26842}
26843
26844SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26845 SDValue Chain = Op.getOperand(0);
26846 SDValue Cond = Op.getOperand(1);
26847 SDValue Dest = Op.getOperand(2);
26848 SDLoc dl(Op);
26849
26850 // Bail out when we don't have native compare instructions.
26851 if (Cond.getOpcode() == ISD::SETCC &&
26852 Cond.getOperand(0).getValueType() != MVT::f128 &&
26853 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26854 SDValue LHS = Cond.getOperand(0);
26855 SDValue RHS = Cond.getOperand(1);
26856 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26857
26858 // Special case for
26859 // setcc([su]{add,sub,mul}o == 0)
26860 // setcc([su]{add,sub,mul}o != 1)
26861 if (ISD::isOverflowIntrOpRes(LHS) &&
26862 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26863 (isNullConstant(RHS) || isOneConstant(RHS))) {
26864 SDValue Value, Overflow;
26865 X86::CondCode X86Cond;
26866 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26867
26868 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26869 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26870
26871 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26872 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26873 Overflow);
26874 }
26875
26876 if (LHS.getSimpleValueType().isInteger()) {
26877 SDValue CCVal;
26878 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26879 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26880 EFLAGS);
26881 }
26882
26883 if (CC == ISD::SETOEQ) {
26884 // For FCMP_OEQ, we can emit
26885 // two branches instead of an explicit AND instruction with a
26886 // separate test. However, we only do this if this block doesn't
26887 // have a fall-through edge, because this requires an explicit
26888 // jmp when the condition is false.
26889 if (Op.getNode()->hasOneUse()) {
26890 SDNode *User = *Op.getNode()->use_begin();
26891 // Look for an unconditional branch following this conditional branch.
26892 // We need this because we need to reverse the successors in order
26893 // to implement FCMP_OEQ.
26894 if (User->getOpcode() == ISD::BR) {
26895 SDValue FalseBB = User->getOperand(1);
26896 SDNode *NewBR =
26897 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26898 assert(NewBR == User);
26899 (void)NewBR;
26900 Dest = FalseBB;
26901
26902 SDValue Cmp =
26903 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26904 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26905 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26906 CCVal, Cmp);
26907 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26908 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26909 Cmp);
26910 }
26911 }
26912 } else if (CC == ISD::SETUNE) {
26913 // For FCMP_UNE, we can emit
26914 // two branches instead of an explicit OR instruction with a
26915 // separate test.
26916 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26917 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26918 Chain =
26919 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26920 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26921 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26922 Cmp);
26923 } else {
26924 X86::CondCode X86Cond =
26925 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26926 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26927 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26928 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26929 Cmp);
26930 }
26931 }
26932
26933 if (ISD::isOverflowIntrOpRes(Cond)) {
26934 SDValue Value, Overflow;
26935 X86::CondCode X86Cond;
26936 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26937
26938 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26939 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26940 Overflow);
26941 }
26942
26943 // Look past the truncate if the high bits are known zero.
26944 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26945 Cond = Cond.getOperand(0);
26946
26947 EVT CondVT = Cond.getValueType();
26948
26949 // Add an AND with 1 if we don't already have one.
26950 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26951 Cond =
26952 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26953
26954 SDValue LHS = Cond;
26955 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26956
26957 SDValue CCVal;
26958 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26959 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26960 EFLAGS);
26961}
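As a concrete case of the two-branch lowerings above: for SETUNE a single X86ISD::FCMP feeds two conditional branches to the same destination, one on COND_NE and one on COND_P, so the branch is taken whenever the operands are unequal or unordered. The SETOEQ path emits the same COND_NE/COND_P pair, but only after redirecting those branches to the false block and pointing the following unconditional branch at the original destination.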
26962
26963// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26964// Calls to _alloca are needed to probe the stack when allocating more than 4k
26965// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26966 // that the guard pages used by the OS virtual memory manager are allocated in
26967 // the correct sequence.
26968SDValue
26969X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26970 SelectionDAG &DAG) const {
26971 MachineFunction &MF = DAG.getMachineFunction();
26972 bool SplitStack = MF.shouldSplitStack();
26973 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26974 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26975 SplitStack || EmitStackProbeCall;
26976 SDLoc dl(Op);
26977
26978 // Get the inputs.
26979 SDNode *Node = Op.getNode();
26980 SDValue Chain = Op.getOperand(0);
26981 SDValue Size = Op.getOperand(1);
26982 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26983 EVT VT = Node->getValueType(0);
26984
26985 // Chain the dynamic stack allocation so that it doesn't modify the stack
26986 // pointer when other instructions are using the stack.
26987 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26988
26989 bool Is64Bit = Subtarget.is64Bit();
26990 MVT SPTy = getPointerTy(DAG.getDataLayout());
26991
26992 SDValue Result;
26993 if (!Lower) {
26994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26995 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26996 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26997, __extension__
__PRETTY_FUNCTION__))
26997 " not tell us which reg is the stack pointer!")(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26997, __extension__
__PRETTY_FUNCTION__))
;
26998
26999 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27000 const Align StackAlign = TFI.getStackAlign();
27001 if (hasInlineStackProbe(MF)) {
27002 MachineRegisterInfo &MRI = MF.getRegInfo();
27003
27004 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27005 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27006 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27007 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
27008 DAG.getRegister(Vreg, SPTy));
27009 } else {
27010 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
27011 Chain = SP.getValue(1);
27012 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
27013 }
27014 if (Alignment && *Alignment > StackAlign)
27015 Result =
27016 DAG.getNode(ISD::AND, dl, VT, Result,
27017 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27018 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
27019 } else if (SplitStack) {
27020 MachineRegisterInfo &MRI = MF.getRegInfo();
27021
27022 if (Is64Bit) {
27023 // The 64-bit implementation of segmented stacks needs to clobber both r10
27024 // and r11. This makes it impossible to use it along with nested parameters.
27025 const Function &F = MF.getFunction();
27026 for (const auto &A : F.args()) {
27027 if (A.hasNestAttr())
27028 report_fatal_error("Cannot use segmented stacks with functions that "
27029 "have nested arguments.");
27030 }
27031 }
27032
27033 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27034 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27035 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27036 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27037 DAG.getRegister(Vreg, SPTy));
27038 } else {
27039 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27040 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27041 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27042
27043 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27044 Register SPReg = RegInfo->getStackRegister();
27045 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27046 Chain = SP.getValue(1);
27047
27048 if (Alignment) {
27049 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27050 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27051 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27052 }
27053
27054 Result = SP;
27055 }
27056
27057 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27058
27059 SDValue Ops[2] = {Result, Chain};
27060 return DAG.getMergeValues(Ops, dl);
27061}
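A small worked example of the ~(Alignment - 1) mask applied to the adjusted stack pointer above (the numbers are illustrative only, not taken from any real frame):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t SP = 0x7fffffffd868;       // hypothetical SP after the SUB by Size
      uint64_t Alignment = 32;            // requested alignment > StackAlign
      uint64_t Aligned = SP & ~(Alignment - 1ULL);
      assert(Aligned == 0x7fffffffd860);  // rounded down to a 32-byte boundary
      assert(Aligned % Alignment == 0);
      return 0;
    }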
27062
27063SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27064 MachineFunction &MF = DAG.getMachineFunction();
27065 auto PtrVT = getPointerTy(MF.getDataLayout());
27066 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27067
27068 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27069 SDLoc DL(Op);
27070
27071 if (!Subtarget.is64Bit() ||
27072 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27073 // vastart just stores the address of the VarArgsFrameIndex slot into the
27074 // memory location argument.
27075 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27076 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27077 MachinePointerInfo(SV));
27078 }
27079
27080 // __va_list_tag:
27081 // gp_offset (0 - 6 * 8)
27082 // fp_offset (48 - 48 + 8 * 16)
27083 // overflow_arg_area (points to parameters passed in memory).
27084 // reg_save_area
27085 SmallVector<SDValue, 8> MemOps;
27086 SDValue FIN = Op.getOperand(1);
27087 // Store gp_offset
27088 SDValue Store = DAG.getStore(
27089 Op.getOperand(0), DL,
27090 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27091 MachinePointerInfo(SV));
27092 MemOps.push_back(Store);
27093
27094 // Store fp_offset
27095 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27096 Store = DAG.getStore(
27097 Op.getOperand(0), DL,
27098 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27099 MachinePointerInfo(SV, 4));
27100 MemOps.push_back(Store);
27101
27102 // Store ptr to overflow_arg_area
27103 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27104 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27105 Store =
27106 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27107 MemOps.push_back(Store);
27108
27109 // Store ptr to reg_save_area.
27110 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27111 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27112 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27113 Store = DAG.getStore(
27114 Op.getOperand(0), DL, RSFIN, FIN,
27115 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27116 MemOps.push_back(Store);
27117 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27118}
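A sketch of the record those four stores populate, using the field names from the SysV AMD64 ABI (the struct name here is just for illustration); the byte offsets match the MachinePointerInfo offsets above for the LP64 case:

    // One element of the x86-64 va_list as written by LowerVASTART (LP64).
    struct VaListTag {
      unsigned int GpOffset;    // byte 0:  next GPR save slot, 0..48
      unsigned int FpOffset;    // byte 4:  next XMM save slot, 48..176
      void *OverflowArgArea;    // byte 8:  arguments passed on the stack
      void *RegSaveArea;        // byte 16: start of the register save area
    };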
27119
27120SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27121 assert(Subtarget.is64Bit() &&
27122 "LowerVAARG only handles 64-bit va_arg!");
27123 assert(Op.getNumOperands() == 4);
27124
27125 MachineFunction &MF = DAG.getMachineFunction();
27126 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27127 // The Win64 ABI uses char* instead of a structure.
27128 return DAG.expandVAArg(Op.getNode());
27129
27130 SDValue Chain = Op.getOperand(0);
27131 SDValue SrcPtr = Op.getOperand(1);
27132 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27133 unsigned Align = Op.getConstantOperandVal(3);
27134 SDLoc dl(Op);
27135
27136 EVT ArgVT = Op.getNode()->getValueType(0);
27137 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27138 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27139 uint8_t ArgMode;
27140
27141 // Decide which area this value should be read from.
27142 // TODO: Implement the AMD64 ABI in its entirety. This simple
27143 // selection mechanism works only for the basic types.
27144 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27145 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27146 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27147 } else {
27148 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27149 "Unhandled argument type in LowerVAARG");
27150 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27151 }
27152
27153 if (ArgMode == 2) {
27154 // Make sure using fp_offset makes sense.
27155 assert(!Subtarget.useSoftFloat() &&
27156 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27157 Subtarget.hasSSE1());
27158 }
27159
27160 // Insert a VAARG node into the DAG.
27161 // VAARG returns two values: the variable argument address and the chain.
27162 SDValue InstOps[] = {Chain, SrcPtr,
27163 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27164 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27165 DAG.getTargetConstant(Align, dl, MVT::i32)};
27166 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27167 SDValue VAARG = DAG.getMemIntrinsicNode(
27168 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27169 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27170 /*Alignment=*/std::nullopt,
27171 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27172 Chain = VAARG.getValue(1);
27173
27174 // Load the next argument and return it
27175 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27176}
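For example, a double (floating point, 8 bytes) takes ArgMode 2 and is fetched through fp_offset, while an int or a 16-byte integer takes ArgMode 1 and is fetched through gp_offset; an f80 argument is rejected by the assert above because that case is not implemented yet.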
27177
27178static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27179 SelectionDAG &DAG) {
27180 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27181 // where a va_list is still an i8*.
27182 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")(static_cast <bool> (Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"This code only handles 64-bit va_copy!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27182, __extension__
__PRETTY_FUNCTION__))
;
27183 if (Subtarget.isCallingConvWin64(
27184 DAG.getMachineFunction().getFunction().getCallingConv()))
27185 // Probably a Win64 va_copy.
27186 return DAG.expandVACopy(Op.getNode());
27187
27188 SDValue Chain = Op.getOperand(0);
27189 SDValue DstPtr = Op.getOperand(1);
27190 SDValue SrcPtr = Op.getOperand(2);
27191 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27192 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27193 SDLoc DL(Op);
27194
27195 return DAG.getMemcpy(
27196 Chain, DL, DstPtr, SrcPtr,
27197 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27198 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27199 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27200}
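The memcpy length above matches the struct size: 4 + 4 + 8 + 8 = 24 bytes under LP64 and 4 + 4 + 4 + 4 = 16 bytes under the x32 ABI, copied with the corresponding 8- or 4-byte alignment.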
27201
27202// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27203static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27204 switch (Opc) {
27205 case ISD::SHL:
27206 case X86ISD::VSHL:
27207 case X86ISD::VSHLI:
27208 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27209 case ISD::SRL:
27210 case X86ISD::VSRL:
27211 case X86ISD::VSRLI:
27212 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27213 case ISD::SRA:
27214 case X86ISD::VSRA:
27215 case X86ISD::VSRAI:
27216 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27217 }
27218 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27218)
;
27219}
27220
27221/// Handle vector element shifts where the shift amount is a constant.
27222/// Takes immediate version of shift as input.
27223static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27224 SDValue SrcOp, uint64_t ShiftAmt,
27225 SelectionDAG &DAG) {
27226 MVT ElementType = VT.getVectorElementType();
27227
27228 // Bitcast the source vector to the output type, this is mainly necessary for
27229 // vXi8/vXi64 shifts.
27230 if (VT != SrcOp.getSimpleValueType())
27231 SrcOp = DAG.getBitcast(VT, SrcOp);
27232
27233 // Fold this packed shift into its first operand if ShiftAmt is 0.
27234 if (ShiftAmt == 0)
27235 return SrcOp;
27236
27237 // Check for ShiftAmt >= element width
27238 if (ShiftAmt >= ElementType.getSizeInBits()) {
27239 if (Opc == X86ISD::VSRAI)
27240 ShiftAmt = ElementType.getSizeInBits() - 1;
27241 else
27242 return DAG.getConstant(0, dl, VT);
27243 }
27244
27245 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27246 && "Unknown target vector shift-by-constant node");
27247
27248 // Fold this packed vector shift into a build vector if SrcOp is a
27249 // vector of Constants or UNDEFs.
27250 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27251 unsigned ShiftOpc;
27252 switch (Opc) {
27253 default: llvm_unreachable("Unknown opcode!")::llvm::llvm_unreachable_internal("Unknown opcode!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27253)
;
27254 case X86ISD::VSHLI:
27255 ShiftOpc = ISD::SHL;
27256 break;
27257 case X86ISD::VSRLI:
27258 ShiftOpc = ISD::SRL;
27259 break;
27260 case X86ISD::VSRAI:
27261 ShiftOpc = ISD::SRA;
27262 break;
27263 }
27264
27265 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27266 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27267 return C;
27268 }
27269
27270 return DAG.getNode(Opc, dl, VT, SrcOp,
27271 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27272}
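As a concrete example of the out-of-range handling above: a logical right shift of a v4i32 value by 35 folds to an all-zeros vector, whereas an arithmetic right shift by 35 is clamped to 31, so every lane still ends up as a broadcast of its sign bit.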
27273
27274/// Handle vector element shifts by a splat shift amount
27275static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27276 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27277 const X86Subtarget &Subtarget,
27278 SelectionDAG &DAG) {
27279 MVT AmtVT = ShAmt.getSimpleValueType();
27280 assert(AmtVT.isVector() && "Vector shift type mismatch");
27281 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27282 "Illegal vector splat index");
27283
27284 // Move the splat element to the bottom element.
27285 if (ShAmtIdx != 0) {
27286 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27287 Mask[0] = ShAmtIdx;
27288 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27289 }
27290
27291 // Peek through any zext node if we can get back to a 128-bit source.
27292 if (AmtVT.getScalarSizeInBits() == 64 &&
27293 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27294 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27295 ShAmt.getOperand(0).getValueType().isSimple() &&
27296 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27297 ShAmt = ShAmt.getOperand(0);
27298 AmtVT = ShAmt.getSimpleValueType();
27299 }
27300
27301 // See if we can mask off the upper elements using the existing source node.
27302 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27303 // do this for vXi64 types.
27304 bool IsMasked = false;
27305 if (AmtVT.getScalarSizeInBits() < 64) {
27306 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27307 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27308 // If the shift amount has come from a scalar, then zero-extend the scalar
27309 // before moving to the vector.
27310 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27311 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27312 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27313 AmtVT = MVT::v4i32;
27314 IsMasked = true;
27315 } else if (ShAmt.getOpcode() == ISD::AND) {
27316 // See if the shift amount is already masked (e.g. for rotation modulo),
27317 // then we can zero-extend it by setting all the other mask elements to
27318 // zero.
27319 SmallVector<SDValue> MaskElts(
27320 AmtVT.getVectorNumElements(),
27321 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27322 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27323 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27324 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27325 {ShAmt.getOperand(1), Mask}))) {
27326 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27327 IsMasked = true;
27328 }
27329 }
27330 }
27331
27332 // Extract if the shift amount vector is larger than 128-bits.
27333 if (AmtVT.getSizeInBits() > 128) {
27334 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27335 AmtVT = ShAmt.getSimpleValueType();
27336 }
27337
27338 // Zero-extend bottom element to v2i64 vector type, either by extension or
27339 // shuffle masking.
27340 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27341 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27342 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27343 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27344 } else if (Subtarget.hasSSE41()) {
27345 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27346 MVT::v2i64, ShAmt);
27347 } else {
27348 SDValue ByteShift = DAG.getTargetConstant(
27349 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27350 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27351 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27352 ByteShift);
27353 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27354 ByteShift);
27355 }
27356 }
27357
27358 // Change opcode to non-immediate version.
27359 Opc = getTargetVShiftUniformOpcode(Opc, true);
27360
27361 // The return type has to be a 128-bit type with the same element
27362 // type as the input type.
27363 MVT EltVT = VT.getVectorElementType();
27364 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27365
27366 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27367 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27368}
27369
27370/// Return Mask with the necessary casting or extending
27371/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27372static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27373 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27374 const SDLoc &dl) {
27375
27376 if (isAllOnesConstant(Mask))
27377 return DAG.getConstant(1, dl, MaskVT);
27378 if (X86::isZeroNode(Mask))
27379 return DAG.getConstant(0, dl, MaskVT);
27380
27381 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27382
27383 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27384 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27385 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27386 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
27387 SDValue Lo, Hi;
27388 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27389 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27390 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27391 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27392 } else {
27393 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27394 Mask.getSimpleValueType().getSizeInBits());
27395 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
27396 // are extracted by EXTRACT_SUBVECTOR.
27397 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27398 DAG.getBitcast(BitcastVT, Mask),
27399 DAG.getIntPtrConstant(0, dl));
27400 }
27401}
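For instance, an i8 mask used with a v2i1 operation takes the else branch: the mask is bitcast to v8i1 and EXTRACT_SUBVECTOR keeps only its low two elements, so bits 0 and 1 of the original mask are the only ones that participate.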
27402
27403/// Return (and \p Op, \p Mask) for compare instructions or
27404/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27405/// necessary casting or extending for \p Mask when lowering masking intrinsics
27406static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27407 SDValue PreservedSrc,
27408 const X86Subtarget &Subtarget,
27409 SelectionDAG &DAG) {
27410 MVT VT = Op.getSimpleValueType();
27411 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27412 unsigned OpcodeSelect = ISD::VSELECT;
27413 SDLoc dl(Op);
27414
27415 if (isAllOnesConstant(Mask))
27416 return Op;
27417
27418 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27419
27420 if (PreservedSrc.isUndef())
27421 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27422 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27423}
27424
27425/// Creates an SDNode for a predicated scalar operation.
27426/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27427/// The mask is coming as MVT::i8 and it should be transformed
27428/// to MVT::v1i1 while lowering masking intrinsics.
27429/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27430/// "X86select" instead of "vselect". We just can't create the "vselect" node
27431/// for a scalar instruction.
27432static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27433 SDValue PreservedSrc,
27434 const X86Subtarget &Subtarget,
27435 SelectionDAG &DAG) {
27436
27437 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27438 if (MaskConst->getZExtValue() & 0x1)
27439 return Op;
27440
27441 MVT VT = Op.getSimpleValueType();
27442 SDLoc dl(Op);
27443
27444 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27445 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27446 DAG.getBitcast(MVT::v8i1, Mask),
27447 DAG.getIntPtrConstant(0, dl));
27448 if (Op.getOpcode() == X86ISD::FSETCCM ||
27449 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27450 Op.getOpcode() == X86ISD::VFPCLASSS)
27451 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27452
27453 if (PreservedSrc.isUndef())
27454 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27455 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27456}
27457
27458static int getSEHRegistrationNodeSize(const Function *Fn) {
27459 if (!Fn->hasPersonalityFn())
27460 report_fatal_error(
27461 "querying registration node size for function without personality");
27462 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27463 // WinEHStatePass for the full struct definition.
27464 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27465 case EHPersonality::MSVC_X86SEH: return 24;
27466 case EHPersonality::MSVC_CXX: return 16;
27467 default: break;
27468 }
27469 report_fatal_error(
27470 "can only recover FP for 32-bit MSVC EH personality functions");
27471}
27472
27473/// When the MSVC runtime transfers control to us, either to an outlined
27474/// function or when returning to a parent frame after catching an exception, we
27475/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27476/// Here's the math:
27477/// RegNodeBase = EntryEBP - RegNodeSize
27478/// ParentFP = RegNodeBase - ParentFrameOffset
27479/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27480/// subtracting the offset (negative on x86) takes us back to the parent FP.
27481static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27482 SDValue EntryEBP) {
27483 MachineFunction &MF = DAG.getMachineFunction();
27484 SDLoc dl;
27485
27486 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27487 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27488
27489 // It's possible that the parent function no longer has a personality function
27490 // if the exceptional code was optimized away, in which case we just return
27491 // the incoming EBP.
27492 if (!Fn->hasPersonalityFn())
27493 return EntryEBP;
27494
27495 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27496 // registration, or the .set_setframe offset.
27497 MCSymbol *OffsetSym =
27498 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27499 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27500 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27501 SDValue ParentFrameOffset =
27502 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27503
27504 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27505 // prologue to RBP in the parent function.
27506 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27507 if (Subtarget.is64Bit())
27508 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27509
27510 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27511 // RegNodeBase = EntryEBP - RegNodeSize
27512 // ParentFP = RegNodeBase - ParentFrameOffset
27513 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27514 DAG.getConstant(RegNodeSize, dl, PtrVT));
27515 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27516}
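Plugging illustrative 32-bit numbers into the formula above (none of these come from a real frame): with EntryEBP = 0x0019ff40 and the C++ EH RegNodeSize of 16, RegNodeBase is 0x0019ff30; a ParentFrameOffset of -0x18 then gives ParentFP = 0x0019ff30 - (-0x18) = 0x0019ff48, slightly above the incoming EBP, as expected for the parent's frame.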
27517
27518SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27519 SelectionDAG &DAG) const {
27520 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27521 auto isRoundModeCurDirection = [](SDValue Rnd) {
27522 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27523 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27524
27525 return false;
27526 };
27527 auto isRoundModeSAE = [](SDValue Rnd) {
27528 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27529 unsigned RC = C->getZExtValue();
27530 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27531 // Clear the NO_EXC bit and check remaining bits.
27532 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27533 // As a convenience we allow no other bits or explicitly
27534 // current direction.
27535 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27536 }
27537 }
27538
27539 return false;
27540 };
27541 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27542 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27543 RC = C->getZExtValue();
27544 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27545 // Clear the NO_EXC bit and check remaining bits.
27546 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27547 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27548 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27549 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27550 RC == X86::STATIC_ROUNDING::TO_ZERO;
27551 }
27552 }
27553
27554 return false;
27555 };
27556
27557 SDLoc dl(Op);
27558 unsigned IntNo = Op.getConstantOperandVal(0);
27559 MVT VT = Op.getSimpleValueType();
27560 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27561
27562 // Propagate flags from original node to transformed node(s).
27563 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27564
27565 if (IntrData) {
27566 switch(IntrData->Type) {
27567 case INTR_TYPE_1OP: {
27568 // We specify 2 possible opcodes for intrinsics with rounding modes.
27569 // First, we check if the intrinsic may have non-default rounding mode,
27570 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27571 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27572 if (IntrWithRoundingModeOpcode != 0) {
27573 SDValue Rnd = Op.getOperand(2);
27574 unsigned RC = 0;
27575 if (isRoundModeSAEToX(Rnd, RC))
27576 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27577 Op.getOperand(1),
27578 DAG.getTargetConstant(RC, dl, MVT::i32));
27579 if (!isRoundModeCurDirection(Rnd))
27580 return SDValue();
27581 }
27582 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27583 Op.getOperand(1));
27584 }
27585 case INTR_TYPE_1OP_SAE: {
27586 SDValue Sae = Op.getOperand(2);
27587
27588 unsigned Opc;
27589 if (isRoundModeCurDirection(Sae))
27590 Opc = IntrData->Opc0;
27591 else if (isRoundModeSAE(Sae))
27592 Opc = IntrData->Opc1;
27593 else
27594 return SDValue();
27595
27596 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27597 }
27598 case INTR_TYPE_2OP: {
27599 SDValue Src2 = Op.getOperand(2);
27600
27601 // We specify 2 possible opcodes for intrinsics with rounding modes.
27602 // First, we check if the intrinsic may have non-default rounding mode,
27603 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27604 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27605 if (IntrWithRoundingModeOpcode != 0) {
27606 SDValue Rnd = Op.getOperand(3);
27607 unsigned RC = 0;
27608 if (isRoundModeSAEToX(Rnd, RC))
27609 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27610 Op.getOperand(1), Src2,
27611 DAG.getTargetConstant(RC, dl, MVT::i32));
27612 if (!isRoundModeCurDirection(Rnd))
27613 return SDValue();
27614 }
27615
27616 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27617 Op.getOperand(1), Src2);
27618 }
27619 case INTR_TYPE_2OP_SAE: {
27620 SDValue Sae = Op.getOperand(3);
27621
27622 unsigned Opc;
27623 if (isRoundModeCurDirection(Sae))
27624 Opc = IntrData->Opc0;
27625 else if (isRoundModeSAE(Sae))
27626 Opc = IntrData->Opc1;
27627 else
27628 return SDValue();
27629
27630 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27631 Op.getOperand(2));
27632 }
27633 case INTR_TYPE_3OP:
27634 case INTR_TYPE_3OP_IMM8: {
27635 SDValue Src1 = Op.getOperand(1);
27636 SDValue Src2 = Op.getOperand(2);
27637 SDValue Src3 = Op.getOperand(3);
27638
27639 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27640 Src3.getValueType() != MVT::i8) {
27641 Src3 = DAG.getTargetConstant(
27642 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27643 }
27644
27645 // We specify 2 possible opcodes for intrinsics with rounding modes.
27646 // First, we check if the intrinsic may have non-default rounding mode,
27647 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27648 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27649 if (IntrWithRoundingModeOpcode != 0) {
27650 SDValue Rnd = Op.getOperand(4);
27651 unsigned RC = 0;
27652 if (isRoundModeSAEToX(Rnd, RC))
27653 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27654 Src1, Src2, Src3,
27655 DAG.getTargetConstant(RC, dl, MVT::i32));
27656 if (!isRoundModeCurDirection(Rnd))
27657 return SDValue();
27658 }
27659
27660 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27661 {Src1, Src2, Src3});
27662 }
27663 case INTR_TYPE_4OP_IMM8: {
27664 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27665 SDValue Src4 = Op.getOperand(4);
27666 if (Src4.getValueType() != MVT::i8) {
27667 Src4 = DAG.getTargetConstant(
27668 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27669 }
27670
27671 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27672 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27673 Src4);
27674 }
27675 case INTR_TYPE_1OP_MASK: {
27676 SDValue Src = Op.getOperand(1);
27677 SDValue PassThru = Op.getOperand(2);
27678 SDValue Mask = Op.getOperand(3);
27679 // We add rounding mode to the Node when
27680 // - RC Opcode is specified and
27681 // - RC is not "current direction".
27682 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27683 if (IntrWithRoundingModeOpcode != 0) {
27684 SDValue Rnd = Op.getOperand(4);
27685 unsigned RC = 0;
27686 if (isRoundModeSAEToX(Rnd, RC))
27687 return getVectorMaskingNode(
27688 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27689 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27690 Mask, PassThru, Subtarget, DAG);
27691 if (!isRoundModeCurDirection(Rnd))
27692 return SDValue();
27693 }
27694 return getVectorMaskingNode(
27695 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27696 Subtarget, DAG);
27697 }
27698 case INTR_TYPE_1OP_MASK_SAE: {
27699 SDValue Src = Op.getOperand(1);
27700 SDValue PassThru = Op.getOperand(2);
27701 SDValue Mask = Op.getOperand(3);
27702 SDValue Rnd = Op.getOperand(4);
27703
27704 unsigned Opc;
27705 if (isRoundModeCurDirection(Rnd))
27706 Opc = IntrData->Opc0;
27707 else if (isRoundModeSAE(Rnd))
27708 Opc = IntrData->Opc1;
27709 else
27710 return SDValue();
27711
27712 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27713 Subtarget, DAG);
27714 }
27715 case INTR_TYPE_SCALAR_MASK: {
27716 SDValue Src1 = Op.getOperand(1);
27717 SDValue Src2 = Op.getOperand(2);
27718 SDValue passThru = Op.getOperand(3);
27719 SDValue Mask = Op.getOperand(4);
27720 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27721 // There are 2 kinds of intrinsics in this group:
27722 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands.
27723 // (2) With rounding mode and sae - 7 operands.
27724 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27725 if (Op.getNumOperands() == (5U + HasRounding)) {
27726 if (HasRounding) {
27727 SDValue Rnd = Op.getOperand(5);
27728 unsigned RC = 0;
27729 if (isRoundModeSAEToX(Rnd, RC))
27730 return getScalarMaskingNode(
27731 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27732 DAG.getTargetConstant(RC, dl, MVT::i32)),
27733 Mask, passThru, Subtarget, DAG);
27734 if (!isRoundModeCurDirection(Rnd))
27735 return SDValue();
27736 }
27737 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27738 Src2),
27739 Mask, passThru, Subtarget, DAG);
27740 }
27741
27742 assert(Op.getNumOperands() == (6U + HasRounding) &&
27743 "Unexpected intrinsic form");
27744 SDValue RoundingMode = Op.getOperand(5);
27745 unsigned Opc = IntrData->Opc0;
27746 if (HasRounding) {
27747 SDValue Sae = Op.getOperand(6);
27748 if (isRoundModeSAE(Sae))
27749 Opc = IntrWithRoundingModeOpcode;
27750 else if (!isRoundModeCurDirection(Sae))
27751 return SDValue();
27752 }
27753 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27754 Src2, RoundingMode),
27755 Mask, passThru, Subtarget, DAG);
27756 }
27757 case INTR_TYPE_SCALAR_MASK_RND: {
27758 SDValue Src1 = Op.getOperand(1);
27759 SDValue Src2 = Op.getOperand(2);
27760 SDValue passThru = Op.getOperand(3);
27761 SDValue Mask = Op.getOperand(4);
27762 SDValue Rnd = Op.getOperand(5);
27763
27764 SDValue NewOp;
27765 unsigned RC = 0;
27766 if (isRoundModeCurDirection(Rnd))
27767 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27768 else if (isRoundModeSAEToX(Rnd, RC))
27769 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27770 DAG.getTargetConstant(RC, dl, MVT::i32));
27771 else
27772 return SDValue();
27773
27774 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27775 }
27776 case INTR_TYPE_SCALAR_MASK_SAE: {
27777 SDValue Src1 = Op.getOperand(1);
27778 SDValue Src2 = Op.getOperand(2);
27779 SDValue passThru = Op.getOperand(3);
27780 SDValue Mask = Op.getOperand(4);
27781 SDValue Sae = Op.getOperand(5);
27782 unsigned Opc;
27783 if (isRoundModeCurDirection(Sae))
27784 Opc = IntrData->Opc0;
27785 else if (isRoundModeSAE(Sae))
27786 Opc = IntrData->Opc1;
27787 else
27788 return SDValue();
27789
27790 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27791 Mask, passThru, Subtarget, DAG);
27792 }
27793 case INTR_TYPE_2OP_MASK: {
27794 SDValue Src1 = Op.getOperand(1);
27795 SDValue Src2 = Op.getOperand(2);
27796 SDValue PassThru = Op.getOperand(3);
27797 SDValue Mask = Op.getOperand(4);
27798 SDValue NewOp;
27799 if (IntrData->Opc1 != 0) {
27800 SDValue Rnd = Op.getOperand(5);
27801 unsigned RC = 0;
27802 if (isRoundModeSAEToX(Rnd, RC))
27803 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27804 DAG.getTargetConstant(RC, dl, MVT::i32));
27805 else if (!isRoundModeCurDirection(Rnd))
27806 return SDValue();
27807 }
27808 if (!NewOp)
27809 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27810 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27811 }
27812 case INTR_TYPE_2OP_MASK_SAE: {
27813 SDValue Src1 = Op.getOperand(1);
27814 SDValue Src2 = Op.getOperand(2);
27815 SDValue PassThru = Op.getOperand(3);
27816 SDValue Mask = Op.getOperand(4);
27817
27818 unsigned Opc = IntrData->Opc0;
27819 if (IntrData->Opc1 != 0) {
27820 SDValue Sae = Op.getOperand(5);
27821 if (isRoundModeSAE(Sae))
27822 Opc = IntrData->Opc1;
27823 else if (!isRoundModeCurDirection(Sae))
27824 return SDValue();
27825 }
27826
27827 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27828 Mask, PassThru, Subtarget, DAG);
27829 }
27830 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27831 SDValue Src1 = Op.getOperand(1);
27832 SDValue Src2 = Op.getOperand(2);
27833 SDValue Src3 = Op.getOperand(3);
27834 SDValue PassThru = Op.getOperand(4);
27835 SDValue Mask = Op.getOperand(5);
27836 SDValue Sae = Op.getOperand(6);
27837 unsigned Opc;
27838 if (isRoundModeCurDirection(Sae))
27839 Opc = IntrData->Opc0;
27840 else if (isRoundModeSAE(Sae))
27841 Opc = IntrData->Opc1;
27842 else
27843 return SDValue();
27844
27845 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27846 Mask, PassThru, Subtarget, DAG);
27847 }
27848 case INTR_TYPE_3OP_MASK_SAE: {
27849 SDValue Src1 = Op.getOperand(1);
27850 SDValue Src2 = Op.getOperand(2);
27851 SDValue Src3 = Op.getOperand(3);
27852 SDValue PassThru = Op.getOperand(4);
27853 SDValue Mask = Op.getOperand(5);
27854
27855 unsigned Opc = IntrData->Opc0;
27856 if (IntrData->Opc1 != 0) {
27857 SDValue Sae = Op.getOperand(6);
27858 if (isRoundModeSAE(Sae))
27859 Opc = IntrData->Opc1;
27860 else if (!isRoundModeCurDirection(Sae))
27861 return SDValue();
27862 }
27863 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27864 Mask, PassThru, Subtarget, DAG);
27865 }
27866 case BLENDV: {
27867 SDValue Src1 = Op.getOperand(1);
27868 SDValue Src2 = Op.getOperand(2);
27869 SDValue Src3 = Op.getOperand(3);
27870
27871 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27872 Src3 = DAG.getBitcast(MaskVT, Src3);
27873
27874 // Reverse the operands to match VSELECT order.
27875 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27876 }
27877 case VPERM_2OP : {
27878 SDValue Src1 = Op.getOperand(1);
27879 SDValue Src2 = Op.getOperand(2);
27880
27881 // Swap Src1 and Src2 in the node creation
27882 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
27883 }
27884 case CFMA_OP_MASKZ:
27885 case CFMA_OP_MASK: {
27886 SDValue Src1 = Op.getOperand(1);
27887 SDValue Src2 = Op.getOperand(2);
27888 SDValue Src3 = Op.getOperand(3);
27889 SDValue Mask = Op.getOperand(4);
27890 MVT VT = Op.getSimpleValueType();
27891
27892 SDValue PassThru = Src3;
27893 if (IntrData->Type == CFMA_OP_MASKZ)
27894 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27895
27896 // We add rounding mode to the Node when
27897 // - RC Opcode is specified and
27898 // - RC is not "current direction".
27899 SDValue NewOp;
27900 if (IntrData->Opc1 != 0) {
27901 SDValue Rnd = Op.getOperand(5);
27902 unsigned RC = 0;
27903 if (isRoundModeSAEToX(Rnd, RC))
27904 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27905 DAG.getTargetConstant(RC, dl, MVT::i32));
27906 else if (!isRoundModeCurDirection(Rnd))
27907 return SDValue();
27908 }
27909 if (!NewOp)
27910 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27911 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27912 }
27913 case IFMA_OP:
27914 // NOTE: We need to swizzle the operands to pass the multiply operands
27915 // first.
27916 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27917 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27918 case FPCLASSS: {
27919 SDValue Src1 = Op.getOperand(1);
27920 SDValue Imm = Op.getOperand(2);
27921 SDValue Mask = Op.getOperand(3);
27922 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27923 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27924 Subtarget, DAG);
27925 // Need to fill with zeros to ensure the bitcast will produce zeroes
27926 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27927 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27928 DAG.getConstant(0, dl, MVT::v8i1),
27929 FPclassMask, DAG.getIntPtrConstant(0, dl));
27930 return DAG.getBitcast(MVT::i8, Ins);
27931 }
27932
27933 case CMP_MASK_CC: {
27934 MVT MaskVT = Op.getSimpleValueType();
27935 SDValue CC = Op.getOperand(3);
27936 SDValue Mask = Op.getOperand(4);
27937 // We specify 2 possible opcodes for intrinsics with rounding modes.
27938 // First, we check if the intrinsic may have non-default rounding mode,
27939 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27940 if (IntrData->Opc1 != 0) {
27941 SDValue Sae = Op.getOperand(5);
27942 if (isRoundModeSAE(Sae))
27943 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27944 Op.getOperand(2), CC, Mask, Sae);
27945 if (!isRoundModeCurDirection(Sae))
27946 return SDValue();
27947 }
27948 // Default rounding mode.
27949 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27950 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27951 }
27952 case CMP_MASK_SCALAR_CC: {
27953 SDValue Src1 = Op.getOperand(1);
27954 SDValue Src2 = Op.getOperand(2);
27955 SDValue CC = Op.getOperand(3);
27956 SDValue Mask = Op.getOperand(4);
27957
27958 SDValue Cmp;
27959 if (IntrData->Opc1 != 0) {
27960 SDValue Sae = Op.getOperand(5);
27961 if (isRoundModeSAE(Sae))
27962 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27963 else if (!isRoundModeCurDirection(Sae))
27964 return SDValue();
27965 }
27966 // Default rounding mode.
27967 if (!Cmp.getNode())
27968 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27969
27970 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27971 Subtarget, DAG);
27972 // Need to fill with zeros to ensure the bitcast will produce zeroes
27973 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27974 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27975 DAG.getConstant(0, dl, MVT::v8i1),
27976 CmpMask, DAG.getIntPtrConstant(0, dl));
27977 return DAG.getBitcast(MVT::i8, Ins);
27978 }
27979 case COMI: { // Comparison intrinsics
27980 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27981 SDValue LHS = Op.getOperand(1);
27982 SDValue RHS = Op.getOperand(2);
27983 // Some conditions require the operands to be swapped.
27984 if (CC == ISD::SETLT || CC == ISD::SETLE)
27985 std::swap(LHS, RHS);
27986
27987 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27988 SDValue SetCC;
27989 switch (CC) {
27990 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
27991 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27992 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27993 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27994 break;
27995 }
27996 case ISD::SETNE: { // (ZF = 1 or PF = 1)
27997 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27998 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27999 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
28000 break;
28001 }
28002 case ISD::SETGT: // (CF = 0 and ZF = 0)
28003 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
28004 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
28005 break;
28006 }
28007 case ISD::SETGE: // CF = 0
28008 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
28009 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
28010 break;
28011 default:
28012 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28012)
;
28013 }
28014 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28015 }
28016 case COMI_RM: { // Comparison intrinsics with Sae
28017 SDValue LHS = Op.getOperand(1);
28018 SDValue RHS = Op.getOperand(2);
28019 unsigned CondVal = Op.getConstantOperandVal(3);
28020 SDValue Sae = Op.getOperand(4);
28021
28022 SDValue FCmp;
28023 if (isRoundModeCurDirection(Sae))
28024 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
28025 DAG.getTargetConstant(CondVal, dl, MVT::i8));
28026 else if (isRoundModeSAE(Sae))
28027 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
28028 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
28029 else
28030 return SDValue();
28031 // Need to fill with zeros to ensure the bitcast will produce zeroes
28032 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28033 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28034 DAG.getConstant(0, dl, MVT::v16i1),
28035 FCmp, DAG.getIntPtrConstant(0, dl));
28036 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28037 DAG.getBitcast(MVT::i16, Ins));
28038 }
28039 case VSHIFT: {
28040 SDValue SrcOp = Op.getOperand(1);
28041 SDValue ShAmt = Op.getOperand(2);
28042     assert(ShAmt.getValueType() == MVT::i32 &&
28043            "Unexpected VSHIFT amount type");
28044
28045 // Catch shift-by-constant.
28046 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28047 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28048 Op.getSimpleValueType(), SrcOp,
28049 CShAmt->getZExtValue(), DAG);
28050
28051 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28052 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28053 SrcOp, ShAmt, 0, Subtarget, DAG);
28054 }
28055 case COMPRESS_EXPAND_IN_REG: {
28056 SDValue Mask = Op.getOperand(3);
28057 SDValue DataToCompress = Op.getOperand(1);
28058 SDValue PassThru = Op.getOperand(2);
28059 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28060 return Op.getOperand(1);
28061
28062 // Avoid false dependency.
28063 if (PassThru.isUndef())
28064 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28065
28066 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28067 Mask);
28068 }
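// Illustrative sketch (not part of this file): an in-register masked compress,
// the kind of intrinsic the COMPRESS_EXPAND_IN_REG case handles; assuming
// <immintrin.h> and AVX512F:
//
//   #include <immintrin.h>
//   __m512d compress(__m512d src, __mmask8 k, __m512d a) {
//     return _mm512_mask_compress_pd(src, k, a);  // all-ones k returns a as is
//   }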
28069 case FIXUPIMM:
28070 case FIXUPIMM_MASKZ: {
28071 SDValue Src1 = Op.getOperand(1);
28072 SDValue Src2 = Op.getOperand(2);
28073 SDValue Src3 = Op.getOperand(3);
28074 SDValue Imm = Op.getOperand(4);
28075 SDValue Mask = Op.getOperand(5);
28076 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28077 ? Src1
28078 : getZeroVector(VT, Subtarget, DAG, dl);
28079
28080 unsigned Opc = IntrData->Opc0;
28081 if (IntrData->Opc1 != 0) {
28082 SDValue Sae = Op.getOperand(6);
28083 if (isRoundModeSAE(Sae))
28084 Opc = IntrData->Opc1;
28085 else if (!isRoundModeCurDirection(Sae))
28086 return SDValue();
28087 }
28088
28089 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28090
28091 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28092 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28093
28094 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28095 }
28096 case ROUNDP: {
28097     assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28098 // Clear the upper bits of the rounding immediate so that the legacy
28099 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28100 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28101 SDValue RoundingMode =
28102 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28103 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28104 Op.getOperand(1), RoundingMode);
28105 }
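// Illustrative sketch (not part of this file): a legacy SSE4.1 rounding
// intrinsic lowered through the ROUNDP case above; only the low four bits of
// the immediate are kept, so VRNDSCALE's scaling field stays zero. Assuming
// <immintrin.h>:
//
//   #include <immintrin.h>
//   __m128 round_nearest(__m128 a) {
//     return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   }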
28106 case ROUNDS: {
28107     assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28108 // Clear the upper bits of the rounding immediate so that the legacy
28109 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28110 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28111 SDValue RoundingMode =
28112 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28113 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28114 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28115 }
28116 case BEXTRI: {
28117     assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28118
28119 uint64_t Imm = Op.getConstantOperandVal(2);
28120 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28121 Op.getValueType());
28122 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28123 Op.getOperand(1), Control);
28124 }
28125 // ADC/ADCX/SBB
28126 case ADX: {
28127 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28128 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28129
28130 SDValue Res;
28131 // If the carry in is zero, then we should just use ADD/SUB instead of
28132 // ADC/SBB.
28133 if (isNullConstant(Op.getOperand(1))) {
28134 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28135 Op.getOperand(3));
28136 } else {
28137 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28138 DAG.getConstant(-1, dl, MVT::i8));
28139 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28140 Op.getOperand(3), GenCF.getValue(1));
28141 }
28142 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28143 SDValue Results[] = { SetCC, Res };
28144 return DAG.getMergeValues(Results, dl);
28145 }
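// Illustrative sketch (not part of this file): an add-with-carry intrinsic of
// the kind that maps onto the ADX case above; when the carry-in is a constant
// zero the lowering emits a plain ADD. Assuming <immintrin.h>:
//
//   #include <immintrin.h>
//   unsigned char add32(unsigned int a, unsigned int b, unsigned int *out) {
//     return _addcarry_u32(0, a, b, out);  // carry-in 0 -> ADD, CF returned
//   }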
28146 case CVTPD2PS_MASK:
28147 case CVTPD2DQ_MASK:
28148 case CVTQQ2PS_MASK:
28149 case TRUNCATE_TO_REG: {
28150 SDValue Src = Op.getOperand(1);
28151 SDValue PassThru = Op.getOperand(2);
28152 SDValue Mask = Op.getOperand(3);
28153
28154 if (isAllOnesConstant(Mask))
28155 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28156
28157 MVT SrcVT = Src.getSimpleValueType();
28158 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28159 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28160 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28161 {Src, PassThru, Mask});
28162 }
28163 case CVTPS2PH_MASK: {
28164 SDValue Src = Op.getOperand(1);
28165 SDValue Rnd = Op.getOperand(2);
28166 SDValue PassThru = Op.getOperand(3);
28167 SDValue Mask = Op.getOperand(4);
28168
28169 unsigned RC = 0;
28170 unsigned Opc = IntrData->Opc0;
28171 bool SAE = Src.getValueType().is512BitVector() &&
28172 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28173 if (SAE) {
28174 Opc = X86ISD::CVTPS2PH_SAE;
28175 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28176 }
28177
28178 if (isAllOnesConstant(Mask))
28179 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28180
28181 if (SAE)
28182 Opc = X86ISD::MCVTPS2PH_SAE;
28183 else
28184 Opc = IntrData->Opc1;
28185 MVT SrcVT = Src.getSimpleValueType();
28186 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28187 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28188 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28189 }
28190 case CVTNEPS2BF16_MASK: {
28191 SDValue Src = Op.getOperand(1);
28192 SDValue PassThru = Op.getOperand(2);
28193 SDValue Mask = Op.getOperand(3);
28194
28195 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28196 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28197
28198 // Break false dependency.
28199 if (PassThru.isUndef())
28200 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28201
28202 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28203 Mask);
28204 }
28205 default:
28206 break;
28207 }
28208 }
28209
28210 switch (IntNo) {
28211 default: return SDValue(); // Don't custom lower most intrinsics.
28212
28213  // ptest and testp intrinsics. The intrinsics these come from are designed to
28214  // return an integer value, not just an instruction, so lower it to the ptest
28215 // or testp pattern and a setcc for the result.
28216 case Intrinsic::x86_avx512_ktestc_b:
28217 case Intrinsic::x86_avx512_ktestc_w:
28218 case Intrinsic::x86_avx512_ktestc_d:
28219 case Intrinsic::x86_avx512_ktestc_q:
28220 case Intrinsic::x86_avx512_ktestz_b:
28221 case Intrinsic::x86_avx512_ktestz_w:
28222 case Intrinsic::x86_avx512_ktestz_d:
28223 case Intrinsic::x86_avx512_ktestz_q:
28224 case Intrinsic::x86_sse41_ptestz:
28225 case Intrinsic::x86_sse41_ptestc:
28226 case Intrinsic::x86_sse41_ptestnzc:
28227 case Intrinsic::x86_avx_ptestz_256:
28228 case Intrinsic::x86_avx_ptestc_256:
28229 case Intrinsic::x86_avx_ptestnzc_256:
28230 case Intrinsic::x86_avx_vtestz_ps:
28231 case Intrinsic::x86_avx_vtestc_ps:
28232 case Intrinsic::x86_avx_vtestnzc_ps:
28233 case Intrinsic::x86_avx_vtestz_pd:
28234 case Intrinsic::x86_avx_vtestc_pd:
28235 case Intrinsic::x86_avx_vtestnzc_pd:
28236 case Intrinsic::x86_avx_vtestz_ps_256:
28237 case Intrinsic::x86_avx_vtestc_ps_256:
28238 case Intrinsic::x86_avx_vtestnzc_ps_256:
28239 case Intrinsic::x86_avx_vtestz_pd_256:
28240 case Intrinsic::x86_avx_vtestc_pd_256:
28241 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28242 unsigned TestOpc = X86ISD::PTEST;
28243 X86::CondCode X86CC;
28244 switch (IntNo) {
28245     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
28246 case Intrinsic::x86_avx512_ktestc_b:
28247 case Intrinsic::x86_avx512_ktestc_w:
28248 case Intrinsic::x86_avx512_ktestc_d:
28249 case Intrinsic::x86_avx512_ktestc_q:
28250 // CF = 1
28251 TestOpc = X86ISD::KTEST;
28252 X86CC = X86::COND_B;
28253 break;
28254 case Intrinsic::x86_avx512_ktestz_b:
28255 case Intrinsic::x86_avx512_ktestz_w:
28256 case Intrinsic::x86_avx512_ktestz_d:
28257 case Intrinsic::x86_avx512_ktestz_q:
28258 TestOpc = X86ISD::KTEST;
28259 X86CC = X86::COND_E;
28260 break;
28261 case Intrinsic::x86_avx_vtestz_ps:
28262 case Intrinsic::x86_avx_vtestz_pd:
28263 case Intrinsic::x86_avx_vtestz_ps_256:
28264 case Intrinsic::x86_avx_vtestz_pd_256:
28265 TestOpc = X86ISD::TESTP;
28266 [[fallthrough]];
28267 case Intrinsic::x86_sse41_ptestz:
28268 case Intrinsic::x86_avx_ptestz_256:
28269 // ZF = 1
28270 X86CC = X86::COND_E;
28271 break;
28272 case Intrinsic::x86_avx_vtestc_ps:
28273 case Intrinsic::x86_avx_vtestc_pd:
28274 case Intrinsic::x86_avx_vtestc_ps_256:
28275 case Intrinsic::x86_avx_vtestc_pd_256:
28276 TestOpc = X86ISD::TESTP;
28277 [[fallthrough]];
28278 case Intrinsic::x86_sse41_ptestc:
28279 case Intrinsic::x86_avx_ptestc_256:
28280 // CF = 1
28281 X86CC = X86::COND_B;
28282 break;
28283 case Intrinsic::x86_avx_vtestnzc_ps:
28284 case Intrinsic::x86_avx_vtestnzc_pd:
28285 case Intrinsic::x86_avx_vtestnzc_ps_256:
28286 case Intrinsic::x86_avx_vtestnzc_pd_256:
28287 TestOpc = X86ISD::TESTP;
28288 [[fallthrough]];
28289 case Intrinsic::x86_sse41_ptestnzc:
28290 case Intrinsic::x86_avx_ptestnzc_256:
28291 // ZF and CF = 0
28292 X86CC = X86::COND_A;
28293 break;
28294 }
28295
28296 SDValue LHS = Op.getOperand(1);
28297 SDValue RHS = Op.getOperand(2);
28298 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28299 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28300 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28301 }
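// Illustrative sketch (not part of this file): a PTEST-based intrinsic handled
// by the block above; the flag result is materialized with a SETCC and then
// zero-extended to i32. Assuming <immintrin.h> and SSE4.1:
//
//   #include <immintrin.h>
//   int all_zero(__m128i v) {
//     return _mm_testz_si128(v, v);  // 1 iff every bit of v is zero
//   }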
28302
28303 case Intrinsic::x86_sse42_pcmpistria128:
28304 case Intrinsic::x86_sse42_pcmpestria128:
28305 case Intrinsic::x86_sse42_pcmpistric128:
28306 case Intrinsic::x86_sse42_pcmpestric128:
28307 case Intrinsic::x86_sse42_pcmpistrio128:
28308 case Intrinsic::x86_sse42_pcmpestrio128:
28309 case Intrinsic::x86_sse42_pcmpistris128:
28310 case Intrinsic::x86_sse42_pcmpestris128:
28311 case Intrinsic::x86_sse42_pcmpistriz128:
28312 case Intrinsic::x86_sse42_pcmpestriz128: {
28313 unsigned Opcode;
28314 X86::CondCode X86CC;
28315 switch (IntNo) {
28316     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28317 case Intrinsic::x86_sse42_pcmpistria128:
28318 Opcode = X86ISD::PCMPISTR;
28319 X86CC = X86::COND_A;
28320 break;
28321 case Intrinsic::x86_sse42_pcmpestria128:
28322 Opcode = X86ISD::PCMPESTR;
28323 X86CC = X86::COND_A;
28324 break;
28325 case Intrinsic::x86_sse42_pcmpistric128:
28326 Opcode = X86ISD::PCMPISTR;
28327 X86CC = X86::COND_B;
28328 break;
28329 case Intrinsic::x86_sse42_pcmpestric128:
28330 Opcode = X86ISD::PCMPESTR;
28331 X86CC = X86::COND_B;
28332 break;
28333 case Intrinsic::x86_sse42_pcmpistrio128:
28334 Opcode = X86ISD::PCMPISTR;
28335 X86CC = X86::COND_O;
28336 break;
28337 case Intrinsic::x86_sse42_pcmpestrio128:
28338 Opcode = X86ISD::PCMPESTR;
28339 X86CC = X86::COND_O;
28340 break;
28341 case Intrinsic::x86_sse42_pcmpistris128:
28342 Opcode = X86ISD::PCMPISTR;
28343 X86CC = X86::COND_S;
28344 break;
28345 case Intrinsic::x86_sse42_pcmpestris128:
28346 Opcode = X86ISD::PCMPESTR;
28347 X86CC = X86::COND_S;
28348 break;
28349 case Intrinsic::x86_sse42_pcmpistriz128:
28350 Opcode = X86ISD::PCMPISTR;
28351 X86CC = X86::COND_E;
28352 break;
28353 case Intrinsic::x86_sse42_pcmpestriz128:
28354 Opcode = X86ISD::PCMPESTR;
28355 X86CC = X86::COND_E;
28356 break;
28357 }
28358 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28359 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28360 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28361 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28362 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28363 }
28364
28365 case Intrinsic::x86_sse42_pcmpistri128:
28366 case Intrinsic::x86_sse42_pcmpestri128: {
28367 unsigned Opcode;
28368 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28369 Opcode = X86ISD::PCMPISTR;
28370 else
28371 Opcode = X86ISD::PCMPESTR;
28372
28373 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28374 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28375 return DAG.getNode(Opcode, dl, VTs, NewOps);
28376 }
28377
28378 case Intrinsic::x86_sse42_pcmpistrm128:
28379 case Intrinsic::x86_sse42_pcmpestrm128: {
28380 unsigned Opcode;
28381 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28382 Opcode = X86ISD::PCMPISTR;
28383 else
28384 Opcode = X86ISD::PCMPESTR;
28385
28386 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28387 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28388 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28389 }
28390
28391 case Intrinsic::eh_sjlj_lsda: {
28392 MachineFunction &MF = DAG.getMachineFunction();
28393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28394 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28395 auto &Context = MF.getMMI().getContext();
28396 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28397 Twine(MF.getFunctionNumber()));
28398 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28399 DAG.getMCSymbol(S, PtrVT));
28400 }
28401
28402 case Intrinsic::x86_seh_lsda: {
28403 // Compute the symbol for the LSDA. We know it'll get emitted later.
28404 MachineFunction &MF = DAG.getMachineFunction();
28405 SDValue Op1 = Op.getOperand(1);
28406 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28407 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28408 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28409
28410 // Generate a simple absolute symbol reference. This intrinsic is only
28411 // supported on 32-bit Windows, which isn't PIC.
28412 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28413 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28414 }
28415
28416 case Intrinsic::eh_recoverfp: {
28417 SDValue FnOp = Op.getOperand(1);
28418 SDValue IncomingFPOp = Op.getOperand(2);
28419 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28420 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28421 if (!Fn)
28422 report_fatal_error(
28423 "llvm.eh.recoverfp must take a function as the first argument");
28424 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28425 }
28426
28427 case Intrinsic::localaddress: {
28428 // Returns one of the stack, base, or frame pointer registers, depending on
28429 // which is used to reference local variables.
28430 MachineFunction &MF = DAG.getMachineFunction();
28431 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28432 unsigned Reg;
28433 if (RegInfo->hasBasePointer(MF))
28434 Reg = RegInfo->getBaseRegister();
28435 else { // Handles the SP or FP case.
28436 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28437 if (CantUseFP)
28438 Reg = RegInfo->getPtrSizedStackRegister(MF);
28439 else
28440 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28441 }
28442 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28443 }
28444 case Intrinsic::x86_avx512_vp2intersect_q_512:
28445 case Intrinsic::x86_avx512_vp2intersect_q_256:
28446 case Intrinsic::x86_avx512_vp2intersect_q_128:
28447 case Intrinsic::x86_avx512_vp2intersect_d_512:
28448 case Intrinsic::x86_avx512_vp2intersect_d_256:
28449 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28450 MVT MaskVT = Op.getSimpleValueType();
28451
28452 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28453 SDLoc DL(Op);
28454
28455 SDValue Operation =
28456 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28457 Op->getOperand(1), Op->getOperand(2));
28458
28459 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28460 MaskVT, Operation);
28461 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28462 MaskVT, Operation);
28463 return DAG.getMergeValues({Result0, Result1}, DL);
28464 }
28465 case Intrinsic::x86_mmx_pslli_w:
28466 case Intrinsic::x86_mmx_pslli_d:
28467 case Intrinsic::x86_mmx_pslli_q:
28468 case Intrinsic::x86_mmx_psrli_w:
28469 case Intrinsic::x86_mmx_psrli_d:
28470 case Intrinsic::x86_mmx_psrli_q:
28471 case Intrinsic::x86_mmx_psrai_w:
28472 case Intrinsic::x86_mmx_psrai_d: {
28473 SDLoc DL(Op);
28474 SDValue ShAmt = Op.getOperand(2);
28475 // If the argument is a constant, convert it to a target constant.
28476 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28477 // Clamp out of bounds shift amounts since they will otherwise be masked
28478 // to 8-bits which may make it no longer out of bounds.
28479 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28480 if (ShiftAmount == 0)
28481 return Op.getOperand(1);
28482
28483 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28484 Op.getOperand(0), Op.getOperand(1),
28485 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28486 }
28487
28488 unsigned NewIntrinsic;
28489 switch (IntNo) {
28490     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28491 case Intrinsic::x86_mmx_pslli_w:
28492 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28493 break;
28494 case Intrinsic::x86_mmx_pslli_d:
28495 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28496 break;
28497 case Intrinsic::x86_mmx_pslli_q:
28498 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28499 break;
28500 case Intrinsic::x86_mmx_psrli_w:
28501 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28502 break;
28503 case Intrinsic::x86_mmx_psrli_d:
28504 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28505 break;
28506 case Intrinsic::x86_mmx_psrli_q:
28507 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28508 break;
28509 case Intrinsic::x86_mmx_psrai_w:
28510 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28511 break;
28512 case Intrinsic::x86_mmx_psrai_d:
28513 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28514 break;
28515 }
28516
28517     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
28518     // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
28519     // MMX register.
28520 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28521 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28522 DAG.getTargetConstant(NewIntrinsic, DL,
28523 getPointerTy(DAG.getDataLayout())),
28524 Op.getOperand(1), ShAmt);
28525 }
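// Illustrative sketch (not part of this file): an MMX shift-by-immediate
// intrinsic of the kind rewritten above (pslli -> psll, with a non-constant
// amount moved into an MMX register). Assuming <immintrin.h>:
//
//   #include <immintrin.h>
//   __m64 shift_words(__m64 v) {
//     return _mm_slli_pi16(v, 3);  // constant amount: kept as an immediate
//   }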
28526 case Intrinsic::thread_pointer: {
28527 if (Subtarget.isTargetELF()) {
28528 SDLoc dl(Op);
28529 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28530 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28531 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28532 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28533 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28534 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28535 }
28536 report_fatal_error(
28537 "Target OS doesn't support __builtin_thread_pointer() yet.");
28538 }
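// Illustrative sketch (not part of this file): on ELF targets the builtin
// below becomes a load from %fs:0 (64-bit) or %gs:0 (32-bit), as implemented
// above; assuming a Clang/GCC-style compiler:
//
//   void *tp(void) {
//     return __builtin_thread_pointer();
//   }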
28539 }
28540}
28541
28542static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28543 SDValue Src, SDValue Mask, SDValue Base,
28544 SDValue Index, SDValue ScaleOp, SDValue Chain,
28545 const X86Subtarget &Subtarget) {
28546 SDLoc dl(Op);
28547 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28548 // Scale must be constant.
28549 if (!C)
28550 return SDValue();
28551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28552 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28553 TLI.getPointerTy(DAG.getDataLayout()));
28554 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28555 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28556 // If source is undef or we know it won't be used, use a zero vector
28557 // to break register dependency.
28558 // TODO: use undef instead and let BreakFalseDeps deal with it?
28559 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28560 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28561
28562 // Cast mask to an integer type.
28563 Mask = DAG.getBitcast(MaskVT, Mask);
28564
28565 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28566
28567 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28568 SDValue Res =
28569 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28570 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28571 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28572}
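// Illustrative sketch (not part of this file): an AVX2 gather intrinsic that
// ends up in getAVX2GatherNode(); the scale operand must be a compile-time
// constant (1, 2, 4 or 8 at the ISA level), matching the "Scale must be
// constant" check. Assuming <immintrin.h> and AVX2:
//
//   #include <immintrin.h>
//   __m256i gather(const int *base, __m256i idx) {
//     return _mm256_i32gather_epi32(base, idx, 4);  // scale = sizeof(int)
//   }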
28573
28574static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28575 SDValue Src, SDValue Mask, SDValue Base,
28576 SDValue Index, SDValue ScaleOp, SDValue Chain,
28577 const X86Subtarget &Subtarget) {
28578 MVT VT = Op.getSimpleValueType();
28579 SDLoc dl(Op);
28580 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28581 // Scale must be constant.
28582 if (!C)
28583 return SDValue();
28584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28585 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28586 TLI.getPointerTy(DAG.getDataLayout()));
28587 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28588 VT.getVectorNumElements());
28589 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28590
28591 // We support two versions of the gather intrinsics. One with scalar mask and
28592 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28593 if (Mask.getValueType() != MaskVT)
28594 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28595
28596 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28597 // If source is undef or we know it won't be used, use a zero vector
28598 // to break register dependency.
28599 // TODO: use undef instead and let BreakFalseDeps deal with it?
28600 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28601 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28602
28603 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28604
28605 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28606 SDValue Res =
28607 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28608 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28609 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28610}
28611
28612static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28613 SDValue Src, SDValue Mask, SDValue Base,
28614 SDValue Index, SDValue ScaleOp, SDValue Chain,
28615 const X86Subtarget &Subtarget) {
28616 SDLoc dl(Op);
28617 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28618 // Scale must be constant.
28619 if (!C)
28620 return SDValue();
28621 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28622 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28623 TLI.getPointerTy(DAG.getDataLayout()));
28624 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28625 Src.getSimpleValueType().getVectorNumElements());
28626 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28627
28628 // We support two versions of the scatter intrinsics. One with scalar mask and
28629 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28630 if (Mask.getValueType() != MaskVT)
28631 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28632
28633 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28634
28635 SDVTList VTs = DAG.getVTList(MVT::Other);
28636 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28637 SDValue Res =
28638 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28639 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28640 return Res;
28641}
28642
28643static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28644 SDValue Mask, SDValue Base, SDValue Index,
28645 SDValue ScaleOp, SDValue Chain,
28646 const X86Subtarget &Subtarget) {
28647 SDLoc dl(Op);
28648 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28649 // Scale must be constant.
28650 if (!C)
28651 return SDValue();
28652 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28653 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28654 TLI.getPointerTy(DAG.getDataLayout()));
28655 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28656 SDValue Segment = DAG.getRegister(0, MVT::i32);
28657 MVT MaskVT =
28658 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28659 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28660 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28661 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28662 return SDValue(Res, 0);
28663}
28664
28665/// Handles the lowering of builtin intrinsics with chain that return their
28666/// value into registers EDX:EAX.
28667/// If operand SrcReg is a valid register identifier, then operand 2 of N is
28668/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28669/// TargetOpcode.
28670/// Returns a Glue value which can be used to add an extra copy-from-reg if the
28671/// expanded intrinsic implicitly defines extra registers (i.e. not just
28672/// EDX:EAX).
28673static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28674 SelectionDAG &DAG,
28675 unsigned TargetOpcode,
28676 unsigned SrcReg,
28677 const X86Subtarget &Subtarget,
28678 SmallVectorImpl<SDValue> &Results) {
28679 SDValue Chain = N->getOperand(0);
28680 SDValue Glue;
28681
28682 if (SrcReg) {
28683     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28684 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28685 Glue = Chain.getValue(1);
28686 }
28687
28688 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28689 SDValue N1Ops[] = {Chain, Glue};
28690 SDNode *N1 = DAG.getMachineNode(
28691 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28692 Chain = SDValue(N1, 0);
28693
28694   // The expanded instruction returns its 64-bit result in registers EDX:EAX.
28695 SDValue LO, HI;
28696 if (Subtarget.is64Bit()) {
28697 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28698 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28699 LO.getValue(2));
28700 } else {
28701 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28702 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28703 LO.getValue(2));
28704 }
28705 Chain = HI.getValue(1);
28706 Glue = HI.getValue(2);
28707
28708 if (Subtarget.is64Bit()) {
28709 // Merge the two 32-bit values into a 64-bit one.
28710 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28711 DAG.getConstant(32, DL, MVT::i8));
28712 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28713 Results.push_back(Chain);
28714 return Glue;
28715 }
28716
28717 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28718 SDValue Ops[] = { LO, HI };
28719 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28720 Results.push_back(Pair);
28721 Results.push_back(Chain);
28722 return Glue;
28723}
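// Illustrative sketch (not part of this file): the EDX:EAX pair produced by
// the helper above is merged into one 64-bit value the same way a C caller
// would combine the two halves; a minimal model of that merge:
//
//   unsigned long long merge(unsigned int lo, unsigned int hi) {
//     return ((unsigned long long)hi << 32) | lo;  // mirrors the SHL/OR nodes
//   }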
28724
28725/// Handles the lowering of builtin intrinsics that read the time stamp counter
28726/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28727/// READCYCLECOUNTER nodes.
28728static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28729 SelectionDAG &DAG,
28730 const X86Subtarget &Subtarget,
28731 SmallVectorImpl<SDValue> &Results) {
28732 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28733 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28734 // and the EAX register is loaded with the low-order 32 bits.
28735 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28736 /* NoRegister */0, Subtarget,
28737 Results);
28738 if (Opcode != X86::RDTSCP)
28739 return;
28740
28741 SDValue Chain = Results[1];
28742 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28743 // the ECX register. Add 'ecx' explicitly to the chain.
28744 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28745 Results[1] = ecx;
28746 Results.push_back(ecx.getValue(1));
28747}
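// Illustrative sketch (not part of this file): the RDTSCP flavour additionally
// returns IA32_TSC_AUX through ECX, which is why the extra copy-from-reg is
// appended above; assuming <x86intrin.h>:
//
//   #include <x86intrin.h>
//   unsigned long long read_tsc(unsigned int *aux) {
//     return __rdtscp(aux);  // counter in the return value, TSC_AUX in *aux
//   }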
28748
28749static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28750 SelectionDAG &DAG) {
28751 SmallVector<SDValue, 3> Results;
28752 SDLoc DL(Op);
28753 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28754 Results);
28755 return DAG.getMergeValues(Results, DL);
28756}
28757
28758static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28759 MachineFunction &MF = DAG.getMachineFunction();
28760 SDValue Chain = Op.getOperand(0);
28761 SDValue RegNode = Op.getOperand(2);
28762 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28763 if (!EHInfo)
28764 report_fatal_error("EH registrations only live in functions using WinEH");
28765
28766 // Cast the operand to an alloca, and remember the frame index.
28767 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28768 if (!FINode)
28769 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28770 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28771
28772 // Return the chain operand without making any DAG nodes.
28773 return Chain;
28774}
28775
28776static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28777 MachineFunction &MF = DAG.getMachineFunction();
28778 SDValue Chain = Op.getOperand(0);
28779 SDValue EHGuard = Op.getOperand(2);
28780 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28781 if (!EHInfo)
28782 report_fatal_error("EHGuard only live in functions using WinEH");
28783
28784 // Cast the operand to an alloca, and remember the frame index.
28785 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28786 if (!FINode)
28787 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28788 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28789
28790 // Return the chain operand without making any DAG nodes.
28791 return Chain;
28792}
28793
28794/// Emit Truncating Store with signed or unsigned saturation.
28795static SDValue
28796EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28797 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28798 SelectionDAG &DAG) {
28799 SDVTList VTs = DAG.getVTList(MVT::Other);
28800 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28801 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28802 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28803 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28804}
28805
28806/// Emit Masked Truncating Store with signed or unsigned saturation.
28807static SDValue
28808EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28809 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28810 MachineMemOperand *MMO, SelectionDAG &DAG) {
28811 SDVTList VTs = DAG.getVTList(MVT::Other);
28812 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28813 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28814 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28815}
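// Illustrative sketch (not part of this file): one source of these saturating
// truncating stores is the AVX-512 down-converting store intrinsics; a
// possible user-level example, assuming <immintrin.h> and AVX512F:
//
//   #include <immintrin.h>
//   void store_sat(void *p, __mmask16 k, __m512i v) {
//     _mm512_mask_cvtsepi32_storeu_epi8(p, k, v);  // signed-saturating i32->i8
//   }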
28816
28817static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28818 SelectionDAG &DAG) {
28819 unsigned IntNo = Op.getConstantOperandVal(1);
28820 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28821 if (!IntrData) {
28822 switch (IntNo) {
28823
28824 case Intrinsic::swift_async_context_addr: {
28825 SDLoc dl(Op);
28826 auto &MF = DAG.getMachineFunction();
28827 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28828 if (Subtarget.is64Bit()) {
28829 MF.getFrameInfo().setFrameAddressIsTaken(true);
28830 X86FI->setHasSwiftAsyncContext(true);
28831 SDValue Chain = Op->getOperand(0);
28832 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28833 SDValue Result =
28834 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28835 DAG.getTargetConstant(8, dl, MVT::i32)),
28836 0);
28837 // Return { result, chain }.
28838 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28839 CopyRBP.getValue(1));
28840 } else {
28841 // 32-bit so no special extended frame, create or reuse an existing
28842 // stack slot.
28843 if (!X86FI->getSwiftAsyncContextFrameIdx())
28844 X86FI->setSwiftAsyncContextFrameIdx(
28845 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28846 SDValue Result =
28847 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28848 // Return { result, chain }.
28849 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28850 Op->getOperand(0));
28851 }
28852 }
28853
28854 case llvm::Intrinsic::x86_seh_ehregnode:
28855 return MarkEHRegistrationNode(Op, DAG);
28856 case llvm::Intrinsic::x86_seh_ehguard:
28857 return MarkEHGuard(Op, DAG);
28858 case llvm::Intrinsic::x86_rdpkru: {
28859 SDLoc dl(Op);
28860 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28861 // Create a RDPKRU node and pass 0 to the ECX parameter.
28862 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28863 DAG.getConstant(0, dl, MVT::i32));
28864 }
28865 case llvm::Intrinsic::x86_wrpkru: {
28866 SDLoc dl(Op);
28867 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28868 // to the EDX and ECX parameters.
28869 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28870 Op.getOperand(0), Op.getOperand(2),
28871 DAG.getConstant(0, dl, MVT::i32),
28872 DAG.getConstant(0, dl, MVT::i32));
28873 }
28874 case llvm::Intrinsic::asan_check_memaccess: {
28875 // Mark this as adjustsStack because it will be lowered to a call.
28876 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28877 // Don't do anything here, we will expand these intrinsics out later.
28878 return Op;
28879 }
28880 case llvm::Intrinsic::x86_flags_read_u32:
28881 case llvm::Intrinsic::x86_flags_read_u64:
28882 case llvm::Intrinsic::x86_flags_write_u32:
28883 case llvm::Intrinsic::x86_flags_write_u64: {
28884 // We need a frame pointer because this will get lowered to a PUSH/POP
28885 // sequence.
28886 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28887 MFI.setHasCopyImplyingStackAdjustment(true);
28888 // Don't do anything here, we will expand these intrinsics out later
28889 // during FinalizeISel in EmitInstrWithCustomInserter.
28890 return Op;
28891 }
28892 case Intrinsic::x86_lwpins32:
28893 case Intrinsic::x86_lwpins64:
28894 case Intrinsic::x86_umwait:
28895 case Intrinsic::x86_tpause: {
28896 SDLoc dl(Op);
28897 SDValue Chain = Op->getOperand(0);
28898 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28899 unsigned Opcode;
28900
28901 switch (IntNo) {
28902       default: llvm_unreachable("Impossible intrinsic");
28903 case Intrinsic::x86_umwait:
28904 Opcode = X86ISD::UMWAIT;
28905 break;
28906 case Intrinsic::x86_tpause:
28907 Opcode = X86ISD::TPAUSE;
28908 break;
28909 case Intrinsic::x86_lwpins32:
28910 case Intrinsic::x86_lwpins64:
28911 Opcode = X86ISD::LWPINS;
28912 break;
28913 }
28914
28915 SDValue Operation =
28916 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28917 Op->getOperand(3), Op->getOperand(4));
28918 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28919 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28920 Operation.getValue(1));
28921 }
28922 case Intrinsic::x86_enqcmd:
28923 case Intrinsic::x86_enqcmds: {
28924 SDLoc dl(Op);
28925 SDValue Chain = Op.getOperand(0);
28926 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28927 unsigned Opcode;
28928 switch (IntNo) {
28929       default: llvm_unreachable("Impossible intrinsic!");
28930 case Intrinsic::x86_enqcmd:
28931 Opcode = X86ISD::ENQCMD;
28932 break;
28933 case Intrinsic::x86_enqcmds:
28934 Opcode = X86ISD::ENQCMDS;
28935 break;
28936 }
28937 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28938 Op.getOperand(3));
28939 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28940 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28941 Operation.getValue(1));
28942 }
28943 case Intrinsic::x86_aesenc128kl:
28944 case Intrinsic::x86_aesdec128kl:
28945 case Intrinsic::x86_aesenc256kl:
28946 case Intrinsic::x86_aesdec256kl: {
28947 SDLoc DL(Op);
28948 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28949 SDValue Chain = Op.getOperand(0);
28950 unsigned Opcode;
28951
28952 switch (IntNo) {
28953       default: llvm_unreachable("Impossible intrinsic");
28954 case Intrinsic::x86_aesenc128kl:
28955 Opcode = X86ISD::AESENC128KL;
28956 break;
28957 case Intrinsic::x86_aesdec128kl:
28958 Opcode = X86ISD::AESDEC128KL;
28959 break;
28960 case Intrinsic::x86_aesenc256kl:
28961 Opcode = X86ISD::AESENC256KL;
28962 break;
28963 case Intrinsic::x86_aesdec256kl:
28964 Opcode = X86ISD::AESDEC256KL;
28965 break;
28966 }
28967
28968 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28969 MachineMemOperand *MMO = MemIntr->getMemOperand();
28970 EVT MemVT = MemIntr->getMemoryVT();
28971 SDValue Operation = DAG.getMemIntrinsicNode(
28972 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28973 MMO);
28974 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28975
28976 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28977 {ZF, Operation.getValue(0), Operation.getValue(2)});
28978 }
28979 case Intrinsic::x86_aesencwide128kl:
28980 case Intrinsic::x86_aesdecwide128kl:
28981 case Intrinsic::x86_aesencwide256kl:
28982 case Intrinsic::x86_aesdecwide256kl: {
28983 SDLoc DL(Op);
28984 SDVTList VTs = DAG.getVTList(
28985 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28986 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28987 SDValue Chain = Op.getOperand(0);
28988 unsigned Opcode;
28989
28990 switch (IntNo) {
28991       default: llvm_unreachable("Impossible intrinsic");
28992 case Intrinsic::x86_aesencwide128kl:
28993 Opcode = X86ISD::AESENCWIDE128KL;
28994 break;
28995 case Intrinsic::x86_aesdecwide128kl:
28996 Opcode = X86ISD::AESDECWIDE128KL;
28997 break;
28998 case Intrinsic::x86_aesencwide256kl:
28999 Opcode = X86ISD::AESENCWIDE256KL;
29000 break;
29001 case Intrinsic::x86_aesdecwide256kl:
29002 Opcode = X86ISD::AESDECWIDE256KL;
29003 break;
29004 }
29005
29006 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29007 MachineMemOperand *MMO = MemIntr->getMemOperand();
29008 EVT MemVT = MemIntr->getMemoryVT();
29009 SDValue Operation = DAG.getMemIntrinsicNode(
29010 Opcode, DL, VTs,
29011 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
29012 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
29013 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
29014 MemVT, MMO);
29015 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
29016
29017 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29018 {ZF, Operation.getValue(1), Operation.getValue(2),
29019 Operation.getValue(3), Operation.getValue(4),
29020 Operation.getValue(5), Operation.getValue(6),
29021 Operation.getValue(7), Operation.getValue(8),
29022 Operation.getValue(9)});
29023 }
29024 case Intrinsic::x86_testui: {
29025 SDLoc dl(Op);
29026 SDValue Chain = Op.getOperand(0);
29027 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29028 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
29029 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29030 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29031 Operation.getValue(1));
29032 }
29033 case Intrinsic::x86_atomic_bts_rm:
29034 case Intrinsic::x86_atomic_btc_rm:
29035 case Intrinsic::x86_atomic_btr_rm: {
29036 SDLoc DL(Op);
29037 MVT VT = Op.getSimpleValueType();
29038 SDValue Chain = Op.getOperand(0);
29039 SDValue Op1 = Op.getOperand(2);
29040 SDValue Op2 = Op.getOperand(3);
29041 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29042 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29043 : X86ISD::LBTR_RM;
29044 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29045 SDValue Res =
29046 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29047 {Chain, Op1, Op2}, VT, MMO);
29048 Chain = Res.getValue(1);
29049 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29050 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29051 }
29052 case Intrinsic::x86_atomic_bts:
29053 case Intrinsic::x86_atomic_btc:
29054 case Intrinsic::x86_atomic_btr: {
29055 SDLoc DL(Op);
29056 MVT VT = Op.getSimpleValueType();
29057 SDValue Chain = Op.getOperand(0);
29058 SDValue Op1 = Op.getOperand(2);
29059 SDValue Op2 = Op.getOperand(3);
29060 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29061 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29062 : X86ISD::LBTR;
29063 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29064 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29065 SDValue Res =
29066 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29067 {Chain, Op1, Op2, Size}, VT, MMO);
29068 Chain = Res.getValue(1);
29069 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29070 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29071 if (Imm)
29072 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29073 DAG.getShiftAmountConstant(Imm, VT, DL));
29074 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29075 }
29076 case Intrinsic::x86_cmpccxadd32:
29077 case Intrinsic::x86_cmpccxadd64: {
29078 SDLoc DL(Op);
29079 SDValue Chain = Op.getOperand(0);
29080 SDValue Addr = Op.getOperand(2);
29081 SDValue Src1 = Op.getOperand(3);
29082 SDValue Src2 = Op.getOperand(4);
29083 SDValue CC = Op.getOperand(5);
29084 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29085 SDValue Operation = DAG.getMemIntrinsicNode(
29086 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29087 MVT::i32, MMO);
29088 return Operation;
29089 }
29090 case Intrinsic::x86_aadd32:
29091 case Intrinsic::x86_aadd64:
29092 case Intrinsic::x86_aand32:
29093 case Intrinsic::x86_aand64:
29094 case Intrinsic::x86_aor32:
29095 case Intrinsic::x86_aor64:
29096 case Intrinsic::x86_axor32:
29097 case Intrinsic::x86_axor64: {
29098 SDLoc DL(Op);
29099 SDValue Chain = Op.getOperand(0);
29100 SDValue Op1 = Op.getOperand(2);
29101 SDValue Op2 = Op.getOperand(3);
29102 MVT VT = Op2.getSimpleValueType();
29103 unsigned Opc = 0;
29104 switch (IntNo) {
29105 default:
29106         llvm_unreachable("Unknown Intrinsic");
29107 case Intrinsic::x86_aadd32:
29108 case Intrinsic::x86_aadd64:
29109 Opc = X86ISD::AADD;
29110 break;
29111 case Intrinsic::x86_aand32:
29112 case Intrinsic::x86_aand64:
29113 Opc = X86ISD::AAND;
29114 break;
29115 case Intrinsic::x86_aor32:
29116 case Intrinsic::x86_aor64:
29117 Opc = X86ISD::AOR;
29118 break;
29119 case Intrinsic::x86_axor32:
29120 case Intrinsic::x86_axor64:
29121 Opc = X86ISD::AXOR;
29122 break;
29123 }
29124 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29125 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29126 {Chain, Op1, Op2}, VT, MMO);
29127 }
29128 case Intrinsic::x86_atomic_add_cc:
29129 case Intrinsic::x86_atomic_sub_cc:
29130 case Intrinsic::x86_atomic_or_cc:
29131 case Intrinsic::x86_atomic_and_cc:
29132 case Intrinsic::x86_atomic_xor_cc: {
29133 SDLoc DL(Op);
29134 SDValue Chain = Op.getOperand(0);
29135 SDValue Op1 = Op.getOperand(2);
29136 SDValue Op2 = Op.getOperand(3);
29137 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29138 MVT VT = Op2.getSimpleValueType();
29139 unsigned Opc = 0;
29140 switch (IntNo) {
29141 default:
29142         llvm_unreachable("Unknown Intrinsic");
29143 case Intrinsic::x86_atomic_add_cc:
29144 Opc = X86ISD::LADD;
29145 break;
29146 case Intrinsic::x86_atomic_sub_cc:
29147 Opc = X86ISD::LSUB;
29148 break;
29149 case Intrinsic::x86_atomic_or_cc:
29150 Opc = X86ISD::LOR;
29151 break;
29152 case Intrinsic::x86_atomic_and_cc:
29153 Opc = X86ISD::LAND;
29154 break;
29155 case Intrinsic::x86_atomic_xor_cc:
29156 Opc = X86ISD::LXOR;
29157 break;
29158 }
29159 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29160 SDValue LockArith =
29161 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29162 {Chain, Op1, Op2}, VT, MMO);
29163 Chain = LockArith.getValue(1);
29164 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29165 }
29166 }
29167 return SDValue();
29168 }
29169
29170 SDLoc dl(Op);
29171 switch(IntrData->Type) {
29172   default: llvm_unreachable("Unknown Intrinsic Type");
29173 case RDSEED:
29174 case RDRAND: {
29175 // Emit the node with the right value type.
29176 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29177 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29178
29179 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29180     // Otherwise return the hardware result, which is always 0 in that case, cast to i32.
29181 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29182 DAG.getConstant(1, dl, Op->getValueType(1)),
29183 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29184 SDValue(Result.getNode(), 1)};
29185 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29186
29187 // Return { result, isValid, chain }.
29188 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29189 SDValue(Result.getNode(), 2));
29190 }
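// Illustrative sketch (not part of this file): the RDRAND/RDSEED step
// intrinsics expose the CF-based validity bit computed above; assuming
// <immintrin.h> and RDRND support:
//
//   #include <immintrin.h>
//   int get_random(unsigned int *out) {
//     return _rdrand32_step(out);  // 1 if the hardware produced a value
//   }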
29191 case GATHER_AVX2: {
29192 SDValue Chain = Op.getOperand(0);
29193 SDValue Src = Op.getOperand(2);
29194 SDValue Base = Op.getOperand(3);
29195 SDValue Index = Op.getOperand(4);
29196 SDValue Mask = Op.getOperand(5);
29197 SDValue Scale = Op.getOperand(6);
29198 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29199 Scale, Chain, Subtarget);
29200 }
29201 case GATHER: {
29202 //gather(v1, mask, index, base, scale);
29203 SDValue Chain = Op.getOperand(0);
29204 SDValue Src = Op.getOperand(2);
29205 SDValue Base = Op.getOperand(3);
29206 SDValue Index = Op.getOperand(4);
29207 SDValue Mask = Op.getOperand(5);
29208 SDValue Scale = Op.getOperand(6);
29209 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29210 Chain, Subtarget);
29211 }
29212 case SCATTER: {
29213 //scatter(base, mask, index, v1, scale);
29214 SDValue Chain = Op.getOperand(0);
29215 SDValue Base = Op.getOperand(2);
29216 SDValue Mask = Op.getOperand(3);
29217 SDValue Index = Op.getOperand(4);
29218 SDValue Src = Op.getOperand(5);
29219 SDValue Scale = Op.getOperand(6);
29220 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29221 Scale, Chain, Subtarget);
29222 }
29223 case PREFETCH: {
29224 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29225     assert((HintVal == 2 || HintVal == 3) &&
29226            "Wrong prefetch hint in intrinsic: should be 2 or 3");
29227 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29228 SDValue Chain = Op.getOperand(0);
29229 SDValue Mask = Op.getOperand(2);
29230 SDValue Index = Op.getOperand(3);
29231 SDValue Base = Op.getOperand(4);
29232 SDValue Scale = Op.getOperand(5);
29233 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29234 Subtarget);
29235 }
29236 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29237 case RDTSC: {
29238 SmallVector<SDValue, 2> Results;
29239 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29240 Results);
29241 return DAG.getMergeValues(Results, dl);
29242 }
29243 // Read Performance Monitoring Counters.
29244 case RDPMC:
29245 // Read Processor Register.
29246 case RDPRU:
29247   // Get Extended Control Register.
29248 case XGETBV: {
29249 SmallVector<SDValue, 2> Results;
29250
29251 // RDPMC uses ECX to select the index of the performance counter to read.
29252 // RDPRU uses ECX to select the processor register to read.
29253 // XGETBV uses ECX to select the index of the XCR register to return.
29254 // The result is stored into registers EDX:EAX.
29255 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29256 Subtarget, Results);
29257 return DAG.getMergeValues(Results, dl);
29258 }
29259 // XTEST intrinsics.
29260 case XTEST: {
29261 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29262 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29263
29264 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29265 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29266 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29267 Ret, SDValue(InTrans.getNode(), 1));
29268 }
29269 case TRUNCATE_TO_MEM_VI8:
29270 case TRUNCATE_TO_MEM_VI16:
29271 case TRUNCATE_TO_MEM_VI32: {
29272 SDValue Mask = Op.getOperand(4);
29273 SDValue DataToTruncate = Op.getOperand(3);
29274 SDValue Addr = Op.getOperand(2);
29275 SDValue Chain = Op.getOperand(0);
29276
29277 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29278     assert(MemIntr && "Expected MemIntrinsicSDNode!");
29279
29280 EVT MemVT = MemIntr->getMemoryVT();
29281
29282 uint16_t TruncationOp = IntrData->Opc0;
29283 switch (TruncationOp) {
29284 case X86ISD::VTRUNC: {
29285 if (isAllOnesConstant(Mask)) // return just a truncate store
29286 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29287 MemIntr->getMemOperand());
29288
29289 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29290 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29291 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29292
29293 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29294 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29295 true /* truncating */);
29296 }
29297 case X86ISD::VTRUNCUS:
29298 case X86ISD::VTRUNCS: {
29299 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29300 if (isAllOnesConstant(Mask))
29301 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29302 MemIntr->getMemOperand(), DAG);
29303
29304 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29305 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29306
29307 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29308 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29309 }
29310 default:
29311       llvm_unreachable("Unsupported truncstore intrinsic");
29312 }
29313 }
29314 }
29315}
29316
29317SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29318 SelectionDAG &DAG) const {
29319 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29320 MFI.setReturnAddressIsTaken(true);
29321
29322 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29323 return SDValue();
29324
29325 unsigned Depth = Op.getConstantOperandVal(0);
29326 SDLoc dl(Op);
29327 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29328
29329 if (Depth > 0) {
29330 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29331 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29332 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29333 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29334 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29335 MachinePointerInfo());
29336 }
29337
29338 // Just load the return address.
29339 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29340 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29341 MachinePointerInfo());
29342}
29343
29344SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29345 SelectionDAG &DAG) const {
29346 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29347 return getReturnAddressFrameIndex(DAG);
29348}
29349
29350SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29351 MachineFunction &MF = DAG.getMachineFunction();
29352 MachineFrameInfo &MFI = MF.getFrameInfo();
29353 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29354 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29355 EVT VT = Op.getValueType();
29356
29357 MFI.setFrameAddressIsTaken(true);
29358
29359 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29360 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29361 // is not possible to crawl up the stack without looking at the unwind codes
29362 // simultaneously.
29363 int FrameAddrIndex = FuncInfo->getFAIndex();
29364 if (!FrameAddrIndex) {
29365 // Set up a frame object for the return address.
29366 unsigned SlotSize = RegInfo->getSlotSize();
29367 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29368 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29369 FuncInfo->setFAIndex(FrameAddrIndex);
29370 }
29371 return DAG.getFrameIndex(FrameAddrIndex, VT);
29372 }
29373
29374 unsigned FrameReg =
29375 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29376 SDLoc dl(Op); // FIXME probably not meaningful
29377 unsigned Depth = Op.getConstantOperandVal(0);
29378 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29379 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29380 "Invalid Frame Register!");
29381 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29382 while (Depth--)
29383 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29384 MachinePointerInfo());
29385 return FrameAddr;
29386}
29387
29388// FIXME? Maybe this could be a TableGen attribute on some registers and
29389// this table could be generated automatically from RegInfo.
29390Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29391 const MachineFunction &MF) const {
29392 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29393
29394 Register Reg = StringSwitch<unsigned>(RegName)
29395 .Case("esp", X86::ESP)
29396 .Case("rsp", X86::RSP)
29397 .Case("ebp", X86::EBP)
29398 .Case("rbp", X86::RBP)
29399 .Default(0);
29400
29401 if (Reg == X86::EBP || Reg == X86::RBP) {
29402 if (!TFI.hasFP(MF))
29403 report_fatal_error("register " + StringRef(RegName) +
29404 " is allocatable: function has no frame pointer");
29405#ifndef NDEBUG
29406 else {
29407 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29408 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29409 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29410 "Invalid Frame Register!");
29411 }
29412#endif
29413 }
29414
29415 if (Reg)
29416 return Reg;
29417
29418 report_fatal_error("Invalid register name global variable");
29419}
29420
29421SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29422 SelectionDAG &DAG) const {
29423 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29424 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29425}
29426
29427Register X86TargetLowering::getExceptionPointerRegister(
29428 const Constant *PersonalityFn) const {
29429 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29430 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29431
29432 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29433}
29434
29435Register X86TargetLowering::getExceptionSelectorRegister(
29436 const Constant *PersonalityFn) const {
29437 // Funclet personalities don't use selectors (the runtime does the selection).
29438 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29439 return X86::NoRegister;
29440 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29441}
29442
29443bool X86TargetLowering::needsFixedCatchObjects() const {
29444 return Subtarget.isTargetWin64();
29445}
29446
29447SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29448 SDValue Chain = Op.getOperand(0);
29449 SDValue Offset = Op.getOperand(1);
29450 SDValue Handler = Op.getOperand(2);
29451 SDLoc dl (Op);
29452
29453 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29454 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29455 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29456 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29457 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29458 "Invalid Frame Register!");
29459 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29460 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29461
29462 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29463 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29464 dl));
29465 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29466 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29467 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29468
29469 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29470 DAG.getRegister(StoreAddrReg, PtrVT));
29471}
29472
29473SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29474 SelectionDAG &DAG) const {
29475 SDLoc DL(Op);
29476 // If the subtarget is not 64-bit, we may need the global base reg
29477 // after isel expands the pseudo, i.e., after the CGBR pass has run.
29478 // Therefore, ask for the GlobalBaseReg now, so that the pass
29479 // inserts the code for us in case we need it.
29480 // Otherwise, we will end up in a situation where we will
29481 // reference a virtual register that is not defined!
29482 if (!Subtarget.is64Bit()) {
29483 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29484 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29485 }
29486 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29487 DAG.getVTList(MVT::i32, MVT::Other),
29488 Op.getOperand(0), Op.getOperand(1));
29489}
29490
29491SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29492 SelectionDAG &DAG) const {
29493 SDLoc DL(Op);
29494 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29495 Op.getOperand(0), Op.getOperand(1));
29496}
29497
29498SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29499 SelectionDAG &DAG) const {
29500 SDLoc DL(Op);
29501 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29502 Op.getOperand(0));
29503}
29504
29505static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29506 return Op.getOperand(0);
29507}
29508
29509SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29510 SelectionDAG &DAG) const {
29511 SDValue Root = Op.getOperand(0);
29512 SDValue Trmp = Op.getOperand(1); // trampoline
29513 SDValue FPtr = Op.getOperand(2); // nested function
29514 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29515 SDLoc dl (Op);
29516
29517 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29518 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29519
29520 if (Subtarget.is64Bit()) {
29521 SDValue OutChains[6];
29522
29523 // Large code-model.
29524 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29525 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29526
29527 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29528 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29529
29530 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29531
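// Illustrative sketch (not part of the original source): with R11 encoding to
// 3, R10 to 2 and REX_WB == 0x49, the little-endian i16/i64/i8 stores below
// assemble this 23-byte trampoline (T = nested function address, N = 'nest'):
//   offset  0: 49 BB <T, 8 bytes>   movabsq $T, %r11
//   offset 10: 49 BA <N, 8 bytes>   movabsq $N, %r10
//   offset 20: 49 FF E3             jmpq   *%r11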
29532 // Load the pointer to the nested function into R11.
29533 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29534 SDValue Addr = Trmp;
29535 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29536 Addr, MachinePointerInfo(TrmpAddr));
29537
29538 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29539 DAG.getConstant(2, dl, MVT::i64));
29540 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29541 MachinePointerInfo(TrmpAddr, 2), Align(2));
29542
29543 // Load the 'nest' parameter value into R10.
29544 // R10 is specified in X86CallingConv.td
29545 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29546 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29547 DAG.getConstant(10, dl, MVT::i64));
29548 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29549 Addr, MachinePointerInfo(TrmpAddr, 10));
29550
29551 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29552 DAG.getConstant(12, dl, MVT::i64));
29553 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29554 MachinePointerInfo(TrmpAddr, 12), Align(2));
29555
29556 // Jump to the nested function.
29557 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29558 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29559 DAG.getConstant(20, dl, MVT::i64));
29560 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29561 Addr, MachinePointerInfo(TrmpAddr, 20));
29562
29563 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29564 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29565 DAG.getConstant(22, dl, MVT::i64));
29566 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29567 Addr, MachinePointerInfo(TrmpAddr, 22));
29568
29569 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29570 } else {
29571 const Function *Func =
29572 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29573 CallingConv::ID CC = Func->getCallingConv();
29574 unsigned NestReg;
29575
29576 switch (CC) {
29577 default:
29578 llvm_unreachable("Unsupported calling convention");
29579 case CallingConv::C:
29580 case CallingConv::X86_StdCall: {
29581 // Pass 'nest' parameter in ECX.
29582 // Must be kept in sync with X86CallingConv.td
29583 NestReg = X86::ECX;
29584
29585 // Check that ECX wasn't needed by an 'inreg' parameter.
29586 FunctionType *FTy = Func->getFunctionType();
29587 const AttributeList &Attrs = Func->getAttributes();
29588
29589 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29590 unsigned InRegCount = 0;
29591 unsigned Idx = 0;
29592
29593 for (FunctionType::param_iterator I = FTy->param_begin(),
29594 E = FTy->param_end(); I != E; ++I, ++Idx)
29595 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29596 const DataLayout &DL = DAG.getDataLayout();
29597 // FIXME: should only count parameters that are lowered to integers.
29598 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29599 }
29600
29601 if (InRegCount > 2) {
29602 report_fatal_error("Nest register in use - reduce number of inreg"
29603 " parameters!");
29604 }
29605 }
29606 break;
29607 }
29608 case CallingConv::X86_FastCall:
29609 case CallingConv::X86_ThisCall:
29610 case CallingConv::Fast:
29611 case CallingConv::Tail:
29612 case CallingConv::SwiftTail:
29613 // Pass 'nest' parameter in EAX.
29614 // Must be kept in sync with X86CallingConv.td
29615 NestReg = X86::EAX;
29616 break;
29617 }
29618
29619 SDValue OutChains[4];
29620 SDValue Addr, Disp;
29621
29622 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29623 DAG.getConstant(10, dl, MVT::i32));
29624 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29625
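// Illustrative sketch (not part of the original source): the stores below
// assemble this 10-byte 32-bit trampoline, where r is the low 3 bits of the
// nest register encoding and Disp = FPtr - (Trmp + 10), i.e. a rel32
// displacement measured from the end of the jmp:
//   offset 0: B8+r <Nest, 4 bytes>   movl $Nest, %nestreg
//   offset 5: E9   <Disp, 4 bytes>   jmp  FPtr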
29626 // This is storing the opcode for MOV32ri.
29627 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29628 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29629 OutChains[0] =
29630 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29631 Trmp, MachinePointerInfo(TrmpAddr));
29632
29633 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29634 DAG.getConstant(1, dl, MVT::i32));
29635 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29636 MachinePointerInfo(TrmpAddr, 1), Align(1));
29637
29638 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29639 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29640 DAG.getConstant(5, dl, MVT::i32));
29641 OutChains[2] =
29642 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29643 MachinePointerInfo(TrmpAddr, 5), Align(1));
29644
29645 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29646 DAG.getConstant(6, dl, MVT::i32));
29647 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29648 MachinePointerInfo(TrmpAddr, 6), Align(1));
29649
29650 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29651 }
29652}
29653
29654SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29655 SelectionDAG &DAG) const {
29656 /*
29657 The rounding mode is in bits 11:10 of FPSR, and has the following
29658 settings:
29659 00 Round to nearest
29660 01 Round to -inf
29661 10 Round to +inf
29662 11 Round to 0
29663
29664 GET_ROUNDING, on the other hand, expects the following:
29665 -1 Undefined
29666 0 Round to 0
29667 1 Round to nearest
29668 2 Round to +inf
29669 3 Round to -inf
29670
29671 To perform the conversion, we use a packed lookup table of the four 2-bit
29672 values that we can index by FPSR[11:10]
29673 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29674
29675 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29676 */
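  // Illustrative sketch (not part of the original source): the packed-LUT
  // conversion above written out in plain C++; the lambda name is hypothetical
  // and exists only for this example.
  auto FPSRToGetRounding = [](unsigned FPSR) -> unsigned {
    return (0x2d >> ((FPSR & 0xc00) >> 9)) & 3;
  };
  assert(FPSRToGetRounding(0u << 10) == 1 && "00 -> round to nearest");
  assert(FPSRToGetRounding(1u << 10) == 3 && "01 -> round to -inf");
  assert(FPSRToGetRounding(2u << 10) == 2 && "10 -> round to +inf");
  assert(FPSRToGetRounding(3u << 10) == 0 && "11 -> round to 0");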
29677
29678 MachineFunction &MF = DAG.getMachineFunction();
29679 MVT VT = Op.getSimpleValueType();
29680 SDLoc DL(Op);
29681
29682 // Save FP Control Word to stack slot
29683 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29684 SDValue StackSlot =
29685 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29686
29687 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29688
29689 SDValue Chain = Op.getOperand(0);
29690 SDValue Ops[] = {Chain, StackSlot};
29691 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29692 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29693 Align(2), MachineMemOperand::MOStore);
29694
29695 // Load FP Control Word from stack slot
29696 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29697 Chain = CWD.getValue(1);
29698
29699 // Mask and turn the control bits into a shift for the lookup table.
29700 SDValue Shift =
29701 DAG.getNode(ISD::SRL, DL, MVT::i16,
29702 DAG.getNode(ISD::AND, DL, MVT::i16,
29703 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29704 DAG.getConstant(9, DL, MVT::i8));
29705 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29706
29707 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29708 SDValue RetVal =
29709 DAG.getNode(ISD::AND, DL, MVT::i32,
29710 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29711 DAG.getConstant(3, DL, MVT::i32));
29712
29713 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29714
29715 return DAG.getMergeValues({RetVal, Chain}, DL);
29716}
29717
29718SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29719 SelectionDAG &DAG) const {
29720 MachineFunction &MF = DAG.getMachineFunction();
29721 SDLoc DL(Op);
29722 SDValue Chain = Op.getNode()->getOperand(0);
29723
29724 // FP control word may be set only from data in memory. So we need to allocate
29725 // stack space to save/load FP control word.
29726 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29727 SDValue StackSlot =
29728 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29729 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29730 MachineMemOperand *MMO =
29731 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29732
29733 // Store FP control word into memory.
29734 SDValue Ops[] = {Chain, StackSlot};
29735 Chain = DAG.getMemIntrinsicNode(
29736 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29737
29738 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29739 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29740 Chain = CWD.getValue(1);
29741 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29742 DAG.getConstant(0xf3ff, DL, MVT::i16));
29743
29744 // Calculate new rounding mode.
29745 SDValue NewRM = Op.getNode()->getOperand(1);
29746 SDValue RMBits;
29747 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29748 uint64_t RM = CVal->getZExtValue();
29749 int FieldVal;
29750 switch (static_cast<RoundingMode>(RM)) {
29751 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29752 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29753 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29754 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29755 default:
29756 llvm_unreachable("rounding mode is not supported by X86 hardware");
29757 }
29758 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29759 } else {
29760 // Need to convert argument into bits of control word:
29761 // 0 Round to 0 -> 11
29762 // 1 Round to nearest -> 00
29763 // 2 Round to +inf -> 10
29764 // 3 Round to -inf -> 01
29765 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
29766 // To make the conversion, put all these values into a value 0xc9 and shift
29767 // it left depending on the rounding mode:
29768 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29769 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29770 // ...
29771 // (0xc9 << (2 * NewRM + 4)) & 0xc00
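// Illustrative worked values (not part of the original source):
//   NewRM = 0 (to zero):    (0xc9 << 4)  & 0xc00 = 0xc00  -> RM field 11
//   NewRM = 1 (to nearest): (0xc9 << 6)  & 0xc00 = 0x000  -> RM field 00
//   NewRM = 2 (to +inf):    (0xc9 << 8)  & 0xc00 = 0x800  -> RM field 10
//   NewRM = 3 (to -inf):    (0xc9 << 10) & 0xc00 = 0x400  -> RM field 01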
29772 SDValue ShiftValue =
29773 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29774 DAG.getNode(ISD::ADD, DL, MVT::i32,
29775 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29776 DAG.getConstant(1, DL, MVT::i8)),
29777 DAG.getConstant(4, DL, MVT::i32)));
29778 SDValue Shifted =
29779 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29780 ShiftValue);
29781 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29782 DAG.getConstant(0xc00, DL, MVT::i16));
29783 }
29784
29785 // Update rounding mode bits and store the new FP Control Word into stack.
29786 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29787 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29788
29789 // Load FP control word from the slot.
29790 SDValue OpsLD[] = {Chain, StackSlot};
29791 MachineMemOperand *MMOL =
29792 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29793 Chain = DAG.getMemIntrinsicNode(
29794 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29795
29796 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29797 // same way but in bits 14:13.
29798 if (Subtarget.hasSSE1()) {
29799 // Store MXCSR into memory.
29800 Chain = DAG.getNode(
29801 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29802 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29803 StackSlot);
29804
29805 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29806 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29807 Chain = CWD.getValue(1);
29808 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29809 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29810
29811 // Shift X87 RM bits from 11:10 to 14:13.
29812 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29813 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29814 DAG.getConstant(3, DL, MVT::i8));
29815
29816 // Update rounding mode bits and store the new FP Control Word into stack.
29817 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29818 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29819
29820 // Load MXCSR from the slot.
29821 Chain = DAG.getNode(
29822 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29823 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29824 StackSlot);
29825 }
29826
29827 return Chain;
29828}
29829
29830 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
29831//
29832// i8/i16 vectors are implemented using the dword LZCNT vector instruction
29833// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29834// split the vector, perform the operation on its Lo and Hi parts and
29835// concatenate the results.
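// Illustrative worked example (not part of the original source): for an i8
// element x = 0x0A, zext32(x) = 0x0000000A has lzcnt 28, and subtracting the
// delta 32 - 8 = 24 yields ctlz8(0x0A) = 4.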
29836static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29837 const X86Subtarget &Subtarget) {
29838 assert(Op.getOpcode() == ISD::CTLZ);
29839 SDLoc dl(Op);
29840 MVT VT = Op.getSimpleValueType();
29841 MVT EltVT = VT.getVectorElementType();
29842 unsigned NumElems = VT.getVectorNumElements();
29843
29844 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29845 "Unsupported element type");
29846
29847 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29848 if (NumElems > 16 ||
29849 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29850 return splitVectorIntUnary(Op, DAG);
29851
29852 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29853 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29854 "Unsupported value type for operation");
29855
29856 // Use the natively supported vector instruction vplzcntd.
29857 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29858 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29859 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29860 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29861
29862 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29863}
29864
29865// Lower CTLZ using a PSHUFB lookup table implementation.
29866static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29867 const X86Subtarget &Subtarget,
29868 SelectionDAG &DAG) {
29869 MVT VT = Op.getSimpleValueType();
29870 int NumElts = VT.getVectorNumElements();
29871 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29872 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29873
29874 // Per-nibble leading zero PSHUFB lookup table.
29875 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29876 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29877 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29878 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29879
29880 SmallVector<SDValue, 64> LUTVec;
29881 for (int i = 0; i < NumBytes; ++i)
29882 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29883 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29884
29885 // Begin by bitcasting the input to a byte vector, then split those bytes
29886 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29887 // If the hi input nibble is zero then we add both results together, otherwise
29888 // we just take the hi result (by masking the lo result to zero before the
29889 // add).
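  // Illustrative sketch (not part of the original source): a scalar model of
  // the per-byte computation described above; the lambda name is hypothetical.
  // PSHUFB applies the same LUT lookup to every byte lane in parallel.
  auto Ctlz8ViaNibbleLUT = [&](unsigned char X) -> int {
    int HiCnt = LUT[X >> 4];  // leading zeros within the high nibble
    int LoCnt = LUT[X & 0xf]; // leading zeros within the low nibble
    return (X >> 4) == 0 ? HiCnt + LoCnt : HiCnt;
  };
  // Ctlz8ViaNibbleLUT(0x1A) == 3, Ctlz8ViaNibbleLUT(0x0A) == 4,
  // Ctlz8ViaNibbleLUT(0x00) == 8.
  (void)Ctlz8ViaNibbleLUT;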
29890 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29891 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29892
29893 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29894 SDValue Lo = Op0;
29895 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29896 SDValue HiZ;
29897 if (CurrVT.is512BitVector()) {
29898 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29899 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29900 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29901 } else {
29902 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29903 }
29904
29905 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29906 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29907 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29908 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29909
29910 // Merge the result back from vXi8 to VT, working on the lo/hi halves
29911 // of the current vector width in the same way we did for the nibbles.
29912 // If the upper half of the input element is zero then add the halves'
29913 // leading zero counts together, otherwise just use the upper half's.
29914 // Double the width of the result until we are at target width.
29915 while (CurrVT != VT) {
29916 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29917 int CurrNumElts = CurrVT.getVectorNumElements();
29918 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29919 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29920 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29921
29922 // Check if the upper half of the input element is zero.
29923 if (CurrVT.is512BitVector()) {
29924 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29925 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29926 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29927 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29928 } else {
29929 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29930 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29931 }
29932 HiZ = DAG.getBitcast(NextVT, HiZ);
29933
29934 // Move the upper/lower halves to the lower bits as we'll be extending to
29935 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29936 // together.
29937 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29938 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29939 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29940 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29941 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29942 CurrVT = NextVT;
29943 }
29944
29945 return Res;
29946}
29947
29948static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29949 const X86Subtarget &Subtarget,
29950 SelectionDAG &DAG) {
29951 MVT VT = Op.getSimpleValueType();
29952
29953 if (Subtarget.hasCDI() &&
29954 // vXi8 vectors need to be promoted to 512 bits for vXi32.
29955 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29956 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29957
29958 // Decompose 256-bit ops into smaller 128-bit ops.
29959 if (VT.is256BitVector() && !Subtarget.hasInt256())
29960 return splitVectorIntUnary(Op, DAG);
29961
29962 // Decompose 512-bit ops into smaller 256-bit ops.
29963 if (VT.is512BitVector() && !Subtarget.hasBWI())
29964 return splitVectorIntUnary(Op, DAG);
29965
29966 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29967 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29968}
29969
29970static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29971 SelectionDAG &DAG) {
29972 MVT VT = Op.getSimpleValueType();
29973 MVT OpVT = VT;
29974 unsigned NumBits = VT.getSizeInBits();
29975 SDLoc dl(Op);
29976 unsigned Opc = Op.getOpcode();
29977
29978 if (VT.isVector())
29979 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29980
29981 Op = Op.getOperand(0);
29982 if (VT == MVT::i8) {
29983 // Zero extend to i32 since there is not an i8 bsr.
29984 OpVT = MVT::i32;
29985 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29986 }
29987
29988 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29989 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29990 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29991
29992 if (Opc == ISD::CTLZ) {
29993 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29994 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29995 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29996 Op.getValue(1)};
29997 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29998 }
29999
30000 // Finally xor with NumBits-1.
30001 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
30002 DAG.getConstant(NumBits - 1, dl, OpVT));
30003
30004 if (VT == MVT::i8)
30005 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
30006 return Op;
30007}
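// Illustrative sketch (not part of the original source): the scalar lowering
// above modelled in plain C++ for a 32-bit operand. The helper name is
// hypothetical, and __builtin_clz stands in for the BSR-derived bit index.
static unsigned scalarCtlz32Model(unsigned X) {
  if (X == 0)
    return (32 + 32 - 1) ^ (32 - 1);       // CMOV path: 63 ^ 31 == 32.
  unsigned BsrIdx = 31 - __builtin_clz(X); // index of the most significant set bit
  return BsrIdx ^ 31;                      // e.g. X = 0x0000FF00 -> 15 ^ 31 == 16
}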
30008
30009static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
30010 SelectionDAG &DAG) {
30011 MVT VT = Op.getSimpleValueType();
30012 unsigned NumBits = VT.getScalarSizeInBits();
30013 SDValue N0 = Op.getOperand(0);
30014 SDLoc dl(Op);
30015
30016 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
30017 "Only scalar CTTZ requires custom lowering");
30018
30019 // Issue a bsf (scan bits forward) which also sets EFLAGS.
30020 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30021 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
30022
30023 // If src is known never zero we can skip the CMOV.
30024 if (DAG.isKnownNeverZero(N0))
30025 return Op;
30026
30027 // If src is zero (i.e. bsf sets ZF), returns NumBits.
30028 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
30029 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30030 Op.getValue(1)};
30031 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30032}
30033
30034static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30035 const X86Subtarget &Subtarget) {
30036 MVT VT = Op.getSimpleValueType();
30037 if (VT == MVT::i16 || VT == MVT::i32)
30038 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30039
30040 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30041 return splitVectorIntBinary(Op, DAG);
30042
30043 assert(Op.getSimpleValueType().is256BitVector() &&
30044 Op.getSimpleValueType().isInteger() &&
30045 "Only handle AVX 256-bit vector integer operation");
30046 return splitVectorIntBinary(Op, DAG);
30047}
30048
30049static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30050 const X86Subtarget &Subtarget) {
30051 MVT VT = Op.getSimpleValueType();
30052 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30053 unsigned Opcode = Op.getOpcode();
30054 SDLoc DL(Op);
30055
30056 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30057 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30058 assert(Op.getSimpleValueType().isInteger() &&
30059 "Only handle AVX vector integer operation");
30060 return splitVectorIntBinary(Op, DAG);
30061 }
30062
30063 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30065 EVT SetCCResultType =
30066 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30067
30068 unsigned BitWidth = VT.getScalarSizeInBits();
30069 if (Opcode == ISD::USUBSAT) {
30070 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30071 // Handle a special-case with a bit-hack instead of cmp+select:
30072 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30073 // If the target can use VPTERNLOG, DAGToDAG will match this as
30074 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30075 // "broadcast" constant load.
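      // Illustrative worked example (not part of the original source), for
      // 8-bit elements where SMIN = 0x80:
      //   X = 0xC3: (0xC3 ^ 0x80) & (0xC3 s>> 7) = 0x43 & 0xFF = 0x43 = 0xC3 - 0x80
      //   X = 0x21: (0x21 ^ 0x80) & (0x21 s>> 7) = 0xA1 & 0x00 = 0x00 (saturated)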
30076 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30077 if (C && C->getAPIntValue().isSignMask()) {
30078 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30079 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30080 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30081 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30082 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30083 }
30084 }
30085 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30086 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30087 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30088 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30089 // TODO: Move this to DAGCombiner?
30090 if (SetCCResultType == VT &&
30091 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30092 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30093 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30094 }
30095 }
30096
30097 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30098 (!VT.isVector() || VT == MVT::v2i64)) {
30099 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30100 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30101 SDValue Zero = DAG.getConstant(0, DL, VT);
30102 SDValue Result =
30103 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30104 DAG.getVTList(VT, SetCCResultType), X, Y);
30105 SDValue SumDiff = Result.getValue(0);
30106 SDValue Overflow = Result.getValue(1);
30107 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30108 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30109 SDValue SumNeg =
30110 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30111 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30112 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30113 }
30114
30115 // Use default expansion.
30116 return SDValue();
30117}
30118
30119static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30120 SelectionDAG &DAG) {
30121 MVT VT = Op.getSimpleValueType();
30122 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30123 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30124 // 8-bit integer abs to NEG and CMOV.
30125 SDLoc DL(Op);
30126 SDValue N0 = Op.getOperand(0);
30127 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30128 DAG.getConstant(0, DL, VT), N0);
30129 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30130 SDValue(Neg.getNode(), 1)};
30131 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30132 }
30133
30134 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30135 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30136 SDLoc DL(Op);
30137 SDValue Src = Op.getOperand(0);
30138 SDValue Sub =
30139 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30140 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30141 }
30142
30143 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30144 assert(VT.isInteger() &&
30145 "Only handle AVX 256-bit vector integer operation");
30146 return splitVectorIntUnary(Op, DAG);
30147 }
30148
30149 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30150 return splitVectorIntUnary(Op, DAG);
30151
30152 // Default to expand.
30153 return SDValue();
30154}
30155
30156static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30157 SelectionDAG &DAG) {
30158 MVT VT = Op.getSimpleValueType();
30159
30160 // For AVX1 cases, split to use legal ops.
30161 if (VT.is256BitVector() && !Subtarget.hasInt256())
30162 return splitVectorIntBinary(Op, DAG);
30163
30164 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30165 return splitVectorIntBinary(Op, DAG);
30166
30167 // Default to expand.
30168 return SDValue();
30169}
30170
30171static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30172 SelectionDAG &DAG) {
30173 MVT VT = Op.getSimpleValueType();
30174
30175 // For AVX1 cases, split to use legal ops.
30176 if (VT.is256BitVector() && !Subtarget.hasInt256())
30177 return splitVectorIntBinary(Op, DAG);
30178
30179 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30180 return splitVectorIntBinary(Op, DAG);
30181
30182 // umax(x,1) --> sub(x,cmpeq(x,0))
30183 // TODO: Move this to expandIntMINMAX?
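  // Illustrative worked example (not part of the original source):
  //   x == 0: cmpeq(x,0) is all-ones (-1), so x - (-1) = 1.
  //   x != 0: cmpeq(x,0) is 0, so the result is x itself (already >= 1).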
30184 if (VT.isVector() && Op.getOpcode() == ISD::UMAX &&
30185 llvm::isOneOrOneSplat(Op.getOperand(1), true)) {
30186 SDLoc DL(Op);
30187 SDValue X = DAG.getFreeze(Op.getOperand(0));
30188 SDValue Zero = getZeroVector(VT, Subtarget, DAG, DL);
30189 return DAG.getNode(ISD::SUB, DL, VT, X,
30190 DAG.getSetCC(DL, VT, X, Zero, ISD::SETEQ));
30191 }
30192
30193 // Default to expand.
30194 return SDValue();
30195}
30196
30197static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30198 SelectionDAG &DAG) {
30199 MVT VT = Op.getSimpleValueType();
30200
30201 // For AVX1 cases, split to use legal ops.
30202 if (VT.is256BitVector() && !Subtarget.hasInt256())
30203 return splitVectorIntBinary(Op, DAG);
30204
30205 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30206 return splitVectorIntBinary(Op, DAG);
30207
30208 // TODO: Add TargetLowering expandABD() support.
30209 SDLoc dl(Op);
30210 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30211 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30212 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30214
30215 // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
30216 // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
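  // Illustrative worked example (not part of the original source), for i8
  // lanes: abds(-3, 5) = smax(-3,5) - smin(-3,5) = 5 - (-3) = 8, and
  // abdu(0xFD, 0x05) = umax - umin = 0xFD - 0x05 = 0xF8.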
30217 unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
30218 unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
30219 if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
30220 SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
30221 SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
30222 return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
30223 }
30224
30225 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30226 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30227 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30228 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
30229 SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
30230 return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
30231 DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
30232}
30233
30234static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30235 SelectionDAG &DAG) {
30236 SDLoc dl(Op);
30237 MVT VT = Op.getSimpleValueType();
30238
30239 // Decompose 256-bit ops into 128-bit ops.
30240 if (VT.is256BitVector() && !Subtarget.hasInt256())
30241 return splitVectorIntBinary(Op, DAG);
30242
30243 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30244 return splitVectorIntBinary(Op, DAG);
30245
30246 SDValue A = Op.getOperand(0);
30247 SDValue B = Op.getOperand(1);
30248
30249 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30250 // vector pairs, multiply and truncate.
30251 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30252 unsigned NumElts = VT.getVectorNumElements();
30253
30254 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30255 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30256 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30257 return DAG.getNode(
30258 ISD::TRUNCATE, dl, VT,
30259 DAG.getNode(ISD::MUL, dl, ExVT,
30260 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30261 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30262 }
30263
30264 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30265
30266 // Extract the lo/hi parts and any-extend them to i16.
30267 // We're going to mask off the low byte of each result element of the
30268 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30269 // element.
30270 SDValue Undef = DAG.getUNDEF(VT);
30271 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30272 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30273
30274 SDValue BLo, BHi;
30275 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30276 // If the RHS is a constant, manually unpackl/unpackh.
30277 SmallVector<SDValue, 16> LoOps, HiOps;
30278 for (unsigned i = 0; i != NumElts; i += 16) {
30279 for (unsigned j = 0; j != 8; ++j) {
30280 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30281 MVT::i16));
30282 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30283 MVT::i16));
30284 }
30285 }
30286
30287 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30288 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30289 } else {
30290 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30291 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30292 }
30293
30294 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
30295 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30296 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30297 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30298 }
30299
30300 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
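  // Illustrative sketch (not part of the original source) of the dataflow:
  //   PMULUDQ <a|b|c|d>, <e|f|g|h>        -> 64-bit <a*e | c*g>
  //   PMULUDQ <b|_|d|_>, <f|_|h|_> (odds) -> 64-bit <b*f | d*h>
  // Bitcasting both back to v4i32 and shuffling with {0,4,2,6} picks the low
  // 32 bits of each product, giving <a*e | b*f | c*g | d*h> modulo 2^32.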
30301 if (VT == MVT::v4i32) {
30302 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30303 "Should not custom lower when pmulld is available!");
30304
30305 // Extract the odd parts.
30306 static const int UnpackMask[] = { 1, -1, 3, -1 };
30307 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30308 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30309
30310 // Multiply the even parts.
30311 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30312 DAG.getBitcast(MVT::v2i64, A),
30313 DAG.getBitcast(MVT::v2i64, B));
30314 // Now multiply odd parts.
30315 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30316 DAG.getBitcast(MVT::v2i64, Aodds),
30317 DAG.getBitcast(MVT::v2i64, Bodds));
30318
30319 Evens = DAG.getBitcast(VT, Evens);
30320 Odds = DAG.getBitcast(VT, Odds);
30321
30322 // Merge the two vectors back together with a shuffle. This expands into 2
30323 // shuffles.
30324 static const int ShufMask[] = { 0, 4, 2, 6 };
30325 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30326 }
30327
30328 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30329 "Only know how to lower V2I64/V4I64/V8I64 multiply");
30330 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30331
30332 // Ahi = psrlqi(a, 32);
30333 // Bhi = psrlqi(b, 32);
30334 //
30335 // AloBlo = pmuludq(a, b);
30336 // AloBhi = pmuludq(a, Bhi);
30337 // AhiBlo = pmuludq(Ahi, b);
30338 //
30339 // Hi = psllqi(AloBhi + AhiBlo, 32);
30340 // return AloBlo + Hi;
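  // Illustrative worked equation (not part of the original source): writing
  // a = Alo + 2^32*Ahi and b = Blo + 2^32*Bhi gives
  //   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi,
  // and the 2^64 term vanishes modulo 2^64, which is exactly
  // AloBlo + ((AloBhi + AhiBlo) << 32) as computed below.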
30341 KnownBits AKnown = DAG.computeKnownBits(A);
30342 KnownBits BKnown = DAG.computeKnownBits(B);
30343
30344 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30345 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30346 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30347
30348 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30349 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30350 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30351
30352 SDValue Zero = DAG.getConstant(0, dl, VT);
30353
30354 // Only multiply lo/hi halves that aren't known to be zero.
30355 SDValue AloBlo = Zero;
30356 if (!ALoIsZero && !BLoIsZero)
30357 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30358
30359 SDValue AloBhi = Zero;
30360 if (!ALoIsZero && !BHiIsZero) {
30361 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30362 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30363 }
30364
30365 SDValue AhiBlo = Zero;
30366 if (!AHiIsZero && !BLoIsZero) {
30367 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30368 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30369 }
30370
30371 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30372 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30373
30374 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30375}
30376
30377static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30378 MVT VT, bool IsSigned,
30379 const X86Subtarget &Subtarget,
30380 SelectionDAG &DAG,
30381 SDValue *Low = nullptr) {
30382 unsigned NumElts = VT.getVectorNumElements();
30383
30384 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30385 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30386 // lane results back together.
30387
30388 // We'll take different approaches for signed and unsigned.
30389 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30390 // and use pmullw to calculate the full 16-bit product.
30391 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30392 // shift them left into the upper byte of each word. This allows us to use
30393 // pmulhw to calculate the full 16-bit product. This trick means we don't
30394 // need to sign extend the bytes to use pmullw.
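// Illustrative worked example (not part of the original source): for signed
// bytes a = -3 and b = 5, the words (a << 8) = 0xFD00 = -768 and
// (b << 8) = 0x0500 = 1280 multiply to -983040 = 0xFFF10000, and the high
// 16 bits that pmulhw returns are 0xFFF1 = -15 = a*b.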
30395
30396 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30397 SDValue Zero = DAG.getConstant(0, dl, VT);
30398
30399 SDValue ALo, AHi;
30400 if (IsSigned) {
30401 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30402 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30403 } else {
30404 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30405 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30406 }
30407
30408 SDValue BLo, BHi;
30409 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30410 // If the RHS is a constant, manually unpackl/unpackh and extend.
30411 SmallVector<SDValue, 16> LoOps, HiOps;
30412 for (unsigned i = 0; i != NumElts; i += 16) {
30413 for (unsigned j = 0; j != 8; ++j) {
30414 SDValue LoOp = B.getOperand(i + j);
30415 SDValue HiOp = B.getOperand(i + j + 8);
30416
30417 if (IsSigned) {
30418 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30419 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30420 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30421 DAG.getConstant(8, dl, MVT::i16));
30422 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30423 DAG.getConstant(8, dl, MVT::i16));
30424 } else {
30425 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30426 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30427 }
30428
30429 LoOps.push_back(LoOp);
30430 HiOps.push_back(HiOp);
30431 }
30432 }
30433
30434 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30435 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30436 } else if (IsSigned) {
30437 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30438 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30439 } else {
30440 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30441 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30442 }
30443
30444 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
30445 // pack back to vXi8.
30446 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30447 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30448 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30449
30450 if (Low)
30451 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30452
30453 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30454}
30455
30456static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30457 SelectionDAG &DAG) {
30458 SDLoc dl(Op);
30459 MVT VT = Op.getSimpleValueType();
30460 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30461 unsigned NumElts = VT.getVectorNumElements();
30462 SDValue A = Op.getOperand(0);
30463 SDValue B = Op.getOperand(1);
30464
30465 // Decompose 256-bit ops into 128-bit ops.
30466 if (VT.is256BitVector() && !Subtarget.hasInt256())
30467 return splitVectorIntBinary(Op, DAG);
30468
30469 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30470 return splitVectorIntBinary(Op, DAG);
30471
30472 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30473 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30474 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30475 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30476
30477 // PMULxD operations multiply each even value (starting at 0) of LHS with
30478 // the related value of RHS and produce a widened result.
30479 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30480 // => <2 x i64> <ae|cg>
30481 //
30482 // In other words, to have all the results, we need to perform two PMULxD:
30483 // 1. one with the even values.
30484 // 2. one with the odd values.
30485 // To achieve #2, we need to place the odd values at an even position.
30486 //
30487 // Place the odd value at an even position (basically, shift all values 1
30488 // step to the left):
30489 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30490 9, -1, 11, -1, 13, -1, 15, -1};
30491 // <a|b|c|d> => <b|undef|d|undef>
30492 SDValue Odd0 =
30493 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30494 // <e|f|g|h> => <f|undef|h|undef>
30495 SDValue Odd1 =
30496 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30497
30498 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30499 // ints.
30500 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30501 unsigned Opcode =
30502 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30503 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30504 // => <2 x i64> <ae|cg>
30505 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30506 DAG.getBitcast(MulVT, A),
30507 DAG.getBitcast(MulVT, B)));
30508 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30509 // => <2 x i64> <bf|dh>
30510 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30511 DAG.getBitcast(MulVT, Odd0),
30512 DAG.getBitcast(MulVT, Odd1)));
30513
30514 // Shuffle it back into the right order.
30515 SmallVector<int, 16> ShufMask(NumElts);
30516 for (int i = 0; i != (int)NumElts; ++i)
30517 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
30518
30519 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30520
30521     // If we have a signed multiply but no PMULDQ, fix up the result of an
30522 // unsigned multiply.
30523 if (IsSigned && !Subtarget.hasSSE41()) {
30524 SDValue Zero = DAG.getConstant(0, dl, VT);
30525 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30526 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30527 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30528 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30529
30530 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30531 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30532 }
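    // [Editor's note, not part of the original source] A sketch of why the
    // fixup above recovers MULHS from an unsigned multiply: writing the signed
    // values as a_s = a_u - 2^32*[a < 0] and b_s = b_u - 2^32*[b < 0] gives
    //   a_s*b_s = a_u*b_u - 2^32*(b_u*[a < 0] + a_u*[b < 0]) + 2^64*[a < 0][b < 0],
    // so the high 32 bits, taken modulo 2^32, satisfy
    //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
    // which is exactly T1 + T2 (each SETGT mask is all-ones when the operand
    // is negative). Quick check with a = -1, b = 2: mulhu = 1, fixup = 2,
    // and 1 - 2 = -1 = mulhs(-1, 2).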
30533
30534 return Res;
30535 }
30536
30537 // Only i8 vectors should need custom lowering after this.
30538   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30539           (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30540          "Unsupported vector type");
30541
30542 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30543 // logical shift down the upper half and pack back to i8.
30544
30545 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30546 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30547
30548 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30549 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30550 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30551 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30552 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30553 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30554 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30555 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30556 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30557 }
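  // [Editor's note, not part of the original source] Worked example of the
  // widening path above, unsigned case: a = 200 (0xC8), b = 3. After
  // zero-extension the i16 multiply yields 600 = 0x0258; the logical shift
  // right by 8 leaves 0x02, and the truncate returns mulhu = 2. Signed case:
  // a = -56 (0xC8), b = 3 gives 0xFF58 after the multiply, and bits [15:8]
  // give 0xFF = -1 = mulhs(-56, 3).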
30558
30559 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30560}
30561
30562// Custom lowering for SMULO/UMULO.
30563static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30564 SelectionDAG &DAG) {
30565 MVT VT = Op.getSimpleValueType();
30566
30567 // Scalars defer to LowerXALUO.
30568 if (!VT.isVector())
30569 return LowerXALUO(Op, DAG);
30570
30571 SDLoc dl(Op);
30572 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30573 SDValue A = Op.getOperand(0);
30574 SDValue B = Op.getOperand(1);
30575 EVT OvfVT = Op->getValueType(1);
30576
30577 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30578 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30579 // Extract the LHS Lo/Hi vectors
30580 SDValue LHSLo, LHSHi;
30581 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30582
30583 // Extract the RHS Lo/Hi vectors
30584 SDValue RHSLo, RHSHi;
30585 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30586
30587 EVT LoOvfVT, HiOvfVT;
30588 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30589 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30590 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30591
30592 // Issue the split operations.
30593 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30594 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30595
30596 // Join the separate data results and the overflow results.
30597 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30598 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30599 Hi.getValue(1));
30600
30601 return DAG.getMergeValues({Res, Ovf}, dl);
30602 }
30603
30604 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30605 EVT SetccVT =
30606 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30607
30608 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30609 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30610 unsigned NumElts = VT.getVectorNumElements();
30611 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30612 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30613 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30614 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30615 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30616
30617 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30618
30619 SDValue Ovf;
30620 if (IsSigned) {
30621 SDValue High, LowSign;
30622 if (OvfVT.getVectorElementType() == MVT::i1 &&
30623 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30624         // Rather than truncating, try to do the compare on vXi16 or vXi32.
30625 // Shift the high down filling with sign bits.
30626 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30627 // Fill all 16 bits with the sign bit from the low.
30628 LowSign =
30629 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30630 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30631 15, DAG);
30632 SetccVT = OvfVT;
30633 if (!Subtarget.hasBWI()) {
30634 // We can't do a vXi16 compare so sign extend to v16i32.
30635 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30636 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30637 }
30638 } else {
30639 // Otherwise do the compare at vXi8.
30640 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30641 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30642 LowSign =
30643 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30644 }
30645
30646 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30647 } else {
30648 SDValue High =
30649 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30650 if (OvfVT.getVectorElementType() == MVT::i1 &&
30651 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30652         // Rather than truncating, try to do the compare on vXi16 or vXi32.
30653 SetccVT = OvfVT;
30654 if (!Subtarget.hasBWI()) {
30655 // We can't do a vXi16 compare so sign extend to v16i32.
30656 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30657 }
30658 } else {
30659 // Otherwise do the compare at vXi8.
30660 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30661 }
30662
30663 Ovf =
30664 DAG.getSetCC(dl, SetccVT, High,
30665 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30666 }
30667
30668 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30669
30670 return DAG.getMergeValues({Low, Ovf}, dl);
30671 }
30672
30673 SDValue Low;
30674 SDValue High =
30675 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30676
30677 SDValue Ovf;
30678 if (IsSigned) {
30679 // SMULO overflows if the high bits don't match the sign of the low.
30680 SDValue LowSign =
30681 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30682 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30683 } else {
30684 // UMULO overflows if the high bits are non-zero.
30685 Ovf =
30686 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30687 }
30688
30689 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30690
30691 return DAG.getMergeValues({Low, Ovf}, dl);
30692}
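// [Editor's note, not part of the original source] The overflow tests above
// follow the usual rule: for UMULO the product overflows iff its high half is
// non-zero; for SMULO it overflows iff the high half differs from the
// sign-extension of the low half. E.g. for i8, 100 * 2 = 200 = 0x00C8: the
// low byte 0xC8 is -56, whose sign-extension would be 0xFF, but the high byte
// is 0x00, so the signed multiply overflows while the unsigned one does not.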
30693
30694SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30695   assert(Subtarget.isTargetWin64() && "Unexpected target");
30696 EVT VT = Op.getValueType();
30697   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30698          "Unexpected return type for lowering");
30699
30700 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30701 SmallVector<SDValue> Result;
30702 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30703 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30704 }
30705
30706 RTLIB::Libcall LC;
30707 bool isSigned;
30708 switch (Op->getOpcode()) {
30709   default: llvm_unreachable("Unexpected request for libcall!");
30710 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30711 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30712 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30713 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30714 }
30715
30716 SDLoc dl(Op);
30717 SDValue InChain = DAG.getEntryNode();
30718
30719 TargetLowering::ArgListTy Args;
30720 TargetLowering::ArgListEntry Entry;
30721 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30722 EVT ArgVT = Op->getOperand(i).getValueType();
30723     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30724            "Unexpected argument type for lowering");
30725 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30726 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30727 MachinePointerInfo MPI =
30728 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30729 Entry.Node = StackPtr;
30730 InChain =
30731 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30732 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30733 Entry.Ty = PointerType::get(ArgTy,0);
30734 Entry.IsSExt = false;
30735 Entry.IsZExt = false;
30736 Args.push_back(Entry);
30737 }
30738
30739 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30740 getPointerTy(DAG.getDataLayout()));
30741
30742 TargetLowering::CallLoweringInfo CLI(DAG);
30743 CLI.setDebugLoc(dl)
30744 .setChain(InChain)
30745 .setLibCallee(
30746 getLibcallCallingConv(LC),
30747 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30748 std::move(Args))
30749 .setInRegister()
30750 .setSExtResult(isSigned)
30751 .setZExtResult(!isSigned);
30752
30753 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30754 return DAG.getBitcast(VT, CallInfo.first);
30755}
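// [Editor's note, not part of the original source] Sketch of the convention
// modeled above: each i128 operand is spilled to a 16-byte-aligned stack slot
// and the libcall (e.g. __divti3 for RTLIB::SDIV_I128, assuming the usual
// compiler-rt/libgcc helper names) receives pointers to those slots, while
// the 128-bit result comes back in XMM0, which is why the call is typed as
// v2i64 and then bitcast back to the original i128 type.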
30756
30757SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30758 SelectionDAG &DAG,
30759 SDValue &Chain) const {
30760   assert(Subtarget.isTargetWin64() && "Unexpected target");
30761 EVT VT = Op.getValueType();
30762 bool IsStrict = Op->isStrictFPOpcode();
30763
30764 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30765 EVT ArgVT = Arg.getValueType();
30766
30767   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30768          "Unexpected return type for lowering");
30769
30770 RTLIB::Libcall LC;
30771 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30772 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30773 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30774 else
30775 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30776   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30777
30778 SDLoc dl(Op);
30779 MakeLibCallOptions CallOptions;
30780 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30781
30782 SDValue Result;
30783 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30784 // expected VT (i128).
30785 std::tie(Result, Chain) =
30786 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30787 Result = DAG.getBitcast(VT, Result);
30788 return Result;
30789}
30790
30791SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30792 SelectionDAG &DAG) const {
30793   assert(Subtarget.isTargetWin64() && "Unexpected target");
30794 EVT VT = Op.getValueType();
30795 bool IsStrict = Op->isStrictFPOpcode();
30796
30797 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30798 EVT ArgVT = Arg.getValueType();
30799
30800   assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30801          "Unexpected argument type for lowering");
30802
30803 RTLIB::Libcall LC;
30804 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30805 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30806 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30807 else
30808 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30809   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30810
30811 SDLoc dl(Op);
30812 MakeLibCallOptions CallOptions;
30813 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30814
30815 // Pass the i128 argument as an indirect argument on the stack.
30816 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30817 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30818 MachinePointerInfo MPI =
30819 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30820 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30821
30822 SDValue Result;
30823 std::tie(Result, Chain) =
30824 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30825 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30826}
30827
30828// Return true if the required (according to Opcode) shift-imm form is natively
30829 // supported by the Subtarget.
30830static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30831 unsigned Opcode) {
30832 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30833 return false;
30834
30835 if (VT.getScalarSizeInBits() < 16)
30836 return false;
30837
30838 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30839 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30840 return true;
30841
30842 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30843 (VT.is256BitVector() && Subtarget.hasInt256());
30844
30845 bool AShift = LShift && (Subtarget.hasAVX512() ||
30846 (VT != MVT::v2i64 && VT != MVT::v4i64));
30847 return (Opcode == ISD::SRA) ? AShift : LShift;
30848}
30849
30850// The shift amount is a variable, but it is the same for all vector lanes.
30851// These instructions are defined together with shift-immediate.
30852static
30853bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30854 unsigned Opcode) {
30855 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30856}
30857
30858// Return true if the required (according to Opcode) variable-shift form is
30859 // natively supported by the Subtarget.
30860static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30861 unsigned Opcode) {
30862 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30863 return false;
30864
30865 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30866 return false;
30867
30868 // vXi16 supported only on AVX-512, BWI
30869 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30870 return false;
30871
30872 if (Subtarget.hasAVX512() &&
30873 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30874 return true;
30875
30876 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30877 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30878 return (Opcode == ISD::SRA) ? AShift : LShift;
30879}
30880
30881static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30882 const X86Subtarget &Subtarget) {
30883 MVT VT = Op.getSimpleValueType();
30884 SDLoc dl(Op);
30885 SDValue R = Op.getOperand(0);
30886 SDValue Amt = Op.getOperand(1);
30887 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30888
30889 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30890     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30891 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30892 SDValue Ex = DAG.getBitcast(ExVT, R);
30893
30894 // ashr(R, 63) === cmp_slt(R, 0)
30895 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30896       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30897              "Unsupported PCMPGT op");
30898 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30899 }
30900
30901 if (ShiftAmt >= 32) {
30902 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30903 SDValue Upper =
30904 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30905 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30906 ShiftAmt - 32, DAG);
30907 if (VT == MVT::v2i64)
30908 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30909 if (VT == MVT::v4i64)
30910 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30911 {9, 1, 11, 3, 13, 5, 15, 7});
30912 } else {
30913 // SRA upper i32, SRL whole i64 and select lower i32.
30914 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30915 ShiftAmt, DAG);
30916 SDValue Lower =
30917 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30918 Lower = DAG.getBitcast(ExVT, Lower);
30919 if (VT == MVT::v2i64)
30920 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30921 if (VT == MVT::v4i64)
30922 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30923 {8, 1, 10, 3, 12, 5, 14, 7});
30924 }
30925 return DAG.getBitcast(VT, Ex);
30926 };
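  // [Editor's note, not part of the original source] Sketch of the lambda
  // above on one i64 element <lo, hi>: for ShiftAmt >= 32 the result is
  // <hi >>s (ShiftAmt - 32), hi >>s 31>, i.e. the arithmetically shifted high
  // half plus a sign splat; for ShiftAmt < 32 the low 32 bits come from the
  // logical i64 shift (which pulls bits of hi down) and the high 32 bits from
  // hi >>s ShiftAmt, with the shuffles selecting those halves per element.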
30927
30928 // Optimize shl/srl/sra with constant shift amount.
30929 APInt APIntShiftAmt;
30930 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30931 return SDValue();
30932
30933 // If the shift amount is out of range, return undef.
30934 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30935 return DAG.getUNDEF(VT);
30936
30937 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30938
30939 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30940     // Hardware support for vector shifts is sparse, which makes us scalarize the
30941     // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30942 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30943 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30944 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30945 // must be 0). (add undef, undef) however can be any value. To make this
30946 // safe, we must freeze R to ensure that register allocation uses the same
30947 // register for an undefined value. This ensures that the result will
30948 // still be even and preserves the original semantics.
30949 R = DAG.getFreeze(R);
30950 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30951 }
30952
30953 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30954 }
30955
30956 // i64 SRA needs to be performed as partial shifts.
30957 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30958 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30959 Op.getOpcode() == ISD::SRA)
30960 return ArithmeticShiftRight64(ShiftAmt);
30961
30962 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30963 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30964 unsigned NumElts = VT.getVectorNumElements();
30965 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30966
30967 // Simple i8 add case
30968 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30969 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30970 // must be 0). (add undef, undef) however can be any value. To make this
30971 // safe, we must freeze R to ensure that register allocation uses the same
30972 // register for an undefined value. This ensures that the result will
30973 // still be even and preserves the original semantics.
30974 R = DAG.getFreeze(R);
30975 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30976 }
30977
30978 // ashr(R, 7) === cmp_slt(R, 0)
30979 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30980 SDValue Zeros = DAG.getConstant(0, dl, VT);
30981 if (VT.is512BitVector()) {
30982         assert(VT == MVT::v64i8 && "Unexpected element type!");
30983 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30984 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30985 }
30986 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30987 }
30988
30989 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30990 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30991 return SDValue();
30992
30993 if (Op.getOpcode() == ISD::SHL) {
30994 // Make a large shift.
30995 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30996 ShiftAmt, DAG);
30997 SHL = DAG.getBitcast(VT, SHL);
30998 // Zero out the rightmost bits.
30999 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
31000 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
31001 }
31002 if (Op.getOpcode() == ISD::SRL) {
31003 // Make a large shift.
31004 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
31005 ShiftAmt, DAG);
31006 SRL = DAG.getBitcast(VT, SRL);
31007 // Zero out the leftmost bits.
31008 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
31009 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
31010 }
31011 if (Op.getOpcode() == ISD::SRA) {
31012 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
31013 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31014
31015 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
31016 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
31017 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
31018 return Res;
31019 }
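      // [Editor's note, not part of the original source] Worked example of the
      // xor/sub identity above for i8, ShiftAmt = 4, R = 0x90 (-112): the
      // logical shift gives 0x09, Mask = 128 >> 4 = 0x08, 0x09 ^ 0x08 = 0x01,
      // and 0x01 - 0x08 = 0xF9 = -7 = (-112) >> 4 arithmetically. The xor/sub
      // pair sign-extends from bit (7 - ShiftAmt), turning the logical shift
      // into an arithmetic one.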
31020     llvm_unreachable("Unknown shift opcode.");
31021 }
31022
31023 return SDValue();
31024}
31025
31026static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
31027 const X86Subtarget &Subtarget) {
31028 MVT VT = Op.getSimpleValueType();
31029 SDLoc dl(Op);
31030 SDValue R = Op.getOperand(0);
31031 SDValue Amt = Op.getOperand(1);
31032 unsigned Opcode = Op.getOpcode();
31033 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31034
31035 int BaseShAmtIdx = -1;
31036 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31037 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31038 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31039 Subtarget, DAG);
31040
31041 // vXi8 shifts - shift as v8i16 + mask result.
31042 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31043 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31044 VT == MVT::v64i8) &&
31045 !Subtarget.hasXOP()) {
31046 unsigned NumElts = VT.getVectorNumElements();
31047 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31048 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31049 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31050 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31051
31052 // Create the mask using vXi16 shifts. For shift-rights we need to move
31053 // the upper byte down before splatting the vXi8 mask.
31054 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31055 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31056 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31057 if (Opcode != ISD::SHL)
31058 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31059 8, DAG);
31060 BitMask = DAG.getBitcast(VT, BitMask);
31061 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31062 SmallVector<int, 64>(NumElts, 0));
31063
31064 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31065 DAG.getBitcast(ExtVT, R), BaseShAmt,
31066 BaseShAmtIdx, Subtarget, DAG);
31067 Res = DAG.getBitcast(VT, Res);
31068 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31069
31070 if (Opcode == ISD::SRA) {
31071 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31072 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31073 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31074 SignMask =
31075 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31076 BaseShAmtIdx, Subtarget, DAG);
31077 SignMask = DAG.getBitcast(VT, SignMask);
31078 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31079 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31080 }
31081 return Res;
31082 }
31083 }
31084 }
31085
31086 return SDValue();
31087}
31088
31089// Convert a shift/rotate left amount to a multiplication scale factor.
31090static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31091 const X86Subtarget &Subtarget,
31092 SelectionDAG &DAG) {
31093 MVT VT = Amt.getSimpleValueType();
31094 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31095 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31096 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31097 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31098 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31099 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31100 return SDValue();
31101
31102 MVT SVT = VT.getVectorElementType();
31103 unsigned SVTBits = SVT.getSizeInBits();
31104 unsigned NumElems = VT.getVectorNumElements();
31105
31106 APInt UndefElts;
31107 SmallVector<APInt> EltBits;
31108 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31109 APInt One(SVTBits, 1);
31110 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31111 for (unsigned I = 0; I != NumElems; ++I) {
31112 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31113 continue;
31114 uint64_t ShAmt = EltBits[I].getZExtValue();
31115 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31116 }
31117 return DAG.getBuildVector(VT, dl, Elts);
31118 }
31119
31120 // If the target doesn't support variable shifts, use either FP conversion
31121 // or integer multiplication to avoid shifting each element individually.
31122 if (VT == MVT::v4i32) {
31123 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31124 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31125 DAG.getConstant(0x3f800000U, dl, VT));
31126 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31127 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31128 }
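    // [Editor's note, not part of the original source] Why the v4i32 path
    // above works: (Amt << 23) + 0x3f800000 builds the IEEE-754 single with
    // exponent 127 + Amt and a zero mantissa, i.e. the float 2^Amt, so
    // FP_TO_SINT recovers the integer scale 2^Amt. E.g. Amt = 5 gives
    // 0x02800000 + 0x3f800000 = 0x42000000 = 32.0f -> 32. For Amt = 31 the
    // conversion is out of the signed i32 range, but x86's CVTTPS2DQ returns
    // 0x80000000, which still multiplies as 2^31 modulo 2^32 (an assumption
    // about the target conversion behavior, not spelled out here).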
31129
31130 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31131 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31132 SDValue Z = DAG.getConstant(0, dl, VT);
31133 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31134 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31135 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31136 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31137 if (Subtarget.hasSSE41())
31138 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31139 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31140 }
31141
31142 return SDValue();
31143}
31144
31145static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31146 SelectionDAG &DAG) {
31147 MVT VT = Op.getSimpleValueType();
31148 SDLoc dl(Op);
31149 SDValue R = Op.getOperand(0);
31150 SDValue Amt = Op.getOperand(1);
31151 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31152 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31153
31154 unsigned Opc = Op.getOpcode();
31155 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31156 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31157
31158   assert(VT.isVector() && "Custom lowering only for vector shifts!");
31159   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31160
31161 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31162 return V;
31163
31164 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31165 return V;
31166
31167 if (supportedVectorVarShift(VT, Subtarget, Opc))
31168 return Op;
31169
31170 // i64 vector arithmetic shift can be emulated with the transform:
31171 // M = lshr(SIGN_MASK, Amt)
31172 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31173 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31174 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31175 Opc == ISD::SRA) {
31176 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31177 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31178 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31179 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31180 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31181 return R;
31182 }
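  // [Editor's note, not part of the original source] This is the same xor/sub
  // sign-extension trick used for the vXi8 immediate case: M = SIGN_MASK >> Amt
  // is the logically shifted-down sign bit, so xor-ing it into lshr(R, Amt)
  // and subtracting it back sign-extends the result from bit (63 - Amt).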
31183
31184 // XOP has 128-bit variable logical/arithmetic shifts.
31185 // +ve/-ve Amt = shift left/right.
31186 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31187 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31188 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31189 SDValue Zero = DAG.getConstant(0, dl, VT);
31190 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31191 }
31192 if (Opc == ISD::SHL || Opc == ISD::SRL)
31193 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31194 if (Opc == ISD::SRA)
31195 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31196 }
31197
31198 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31199 // shifts per-lane and then shuffle the partial results back together.
31200 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31201 // Splat the shift amounts so the scalar shifts above will catch it.
31202 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31203 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31204 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31205 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31206 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31207 }
31208
31209 // If possible, lower this shift as a sequence of two shifts by
31210 // constant plus a BLENDing shuffle instead of scalarizing it.
31211 // Example:
31212 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31213 //
31214 // Could be rewritten as:
31215 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31216 //
31217 // The advantage is that the two shifts from the example would be
31218 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31219 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31220 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31221 SDValue Amt1, Amt2;
31222 unsigned NumElts = VT.getVectorNumElements();
31223 SmallVector<int, 8> ShuffleMask;
31224 for (unsigned i = 0; i != NumElts; ++i) {
31225 SDValue A = Amt->getOperand(i);
31226 if (A.isUndef()) {
31227 ShuffleMask.push_back(SM_SentinelUndef);
31228 continue;
31229 }
31230 if (!Amt1 || Amt1 == A) {
31231 ShuffleMask.push_back(i);
31232 Amt1 = A;
31233 continue;
31234 }
31235 if (!Amt2 || Amt2 == A) {
31236 ShuffleMask.push_back(i + NumElts);
31237 Amt2 = A;
31238 continue;
31239 }
31240 break;
31241 }
31242
31243 // Only perform this blend if we can perform it without loading a mask.
31244 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31245 (VT != MVT::v16i16 ||
31246 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31247 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31248 canWidenShuffleElements(ShuffleMask))) {
31249 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31250 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31251 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31252 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31253 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31254 Cst1->getZExtValue(), DAG);
31255 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31256 Cst2->getZExtValue(), DAG);
31257 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31258 }
31259 }
31260 }
31261
31262 // If possible, lower this packed shift into a vector multiply instead of
31263 // expanding it into a sequence of scalar shifts.
31264 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31265 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31266 Subtarget.canExtendTo512BW())))
31267 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31268 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31269
31270 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31271 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31272 if (Opc == ISD::SRL && ConstantAmt &&
31273 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31274 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31275 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31276 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31277 SDValue Zero = DAG.getConstant(0, dl, VT);
31278 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31279 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31280 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31281 }
31282 }
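  // [Editor's note, not part of the original source] The MULHU rewrite above
  // relies on srl(x, c) == mulhu(x, 2^(16 - c)) for 1 <= c <= 15, e.g.
  // 0xABCD >> 4 == (0xABCD * 0x1000) >> 16 == 0x0ABC. A zero shift amount has
  // no representable scale (it would need 2^16), which is why the
  // SETEQ-with-zero select above keeps R for those lanes.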
31283
31284 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31285 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31286 // TODO: Special case handling for shift by 0/1, really we can afford either
31287 // of these cases in pre-SSE41/XOP/AVX512 but not both.
31288 if (Opc == ISD::SRA && ConstantAmt &&
31289 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31290 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31291 !Subtarget.hasAVX512()) ||
31292 DAG.isKnownNeverZero(Amt))) {
31293 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31294 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31295 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31296 SDValue Amt0 =
31297 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31298 SDValue Amt1 =
31299 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31300 SDValue Sra1 =
31301 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31302 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31303 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31304 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31305 }
31306 }
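  // [Editor's note, not part of the original source] Likewise for MULHS:
  // sra(x, c) == mulhs(x, 2^(16 - c)) holds for 2 <= c <= 15, but c == 1
  // would need the scale 2^15 = 0x8000, which is -32768 as a signed i16 and
  // flips the sign of the product; hence the extra select that uses a real
  // VSRAI-by-1 for lanes with Amt == 1 (and R itself for Amt == 0).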
31307
31308 // v4i32 Non Uniform Shifts.
31309 // If the shift amount is constant we can shift each lane using the SSE2
31310 // immediate shifts, else we need to zero-extend each lane to the lower i64
31311 // and shift using the SSE2 variable shifts.
31312 // The separate results can then be blended together.
31313 if (VT == MVT::v4i32) {
31314 SDValue Amt0, Amt1, Amt2, Amt3;
31315 if (ConstantAmt) {
31316 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31317 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31318 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31319 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31320 } else {
31321 // The SSE2 shifts use the lower i64 as the same shift amount for
31322 // all lanes and the upper i64 is ignored. On AVX we're better off
31323 // just zero-extending, but for SSE just duplicating the top 16-bits is
31324 // cheaper and has the same effect for out of range values.
31325 if (Subtarget.hasAVX()) {
31326 SDValue Z = DAG.getConstant(0, dl, VT);
31327 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31328 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31329 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31330 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31331 } else {
31332 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31333 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31334 {4, 5, 6, 7, -1, -1, -1, -1});
31335 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31336 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31337 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31338 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31339 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31340 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31341 }
31342 }
31343
31344 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31345 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31346 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31347 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31348 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31349
31350 // Merge the shifted lane results optimally with/without PBLENDW.
31351 // TODO - ideally shuffle combining would handle this.
31352 if (Subtarget.hasSSE41()) {
31353 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31354 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31355 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31356 }
31357 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31358 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31359 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31360 }
31361
31362 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31363 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31364 // make the existing SSE solution better.
31365   // NOTE: We honor the preferred vector width before promoting to 512-bits.
31366 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31367 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31368 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31369 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31370 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31371     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31372            "Unexpected vector type");
31373 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31374 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31375 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31376 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31377 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31378 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31379 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31380 }
31381
31382 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31383 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31384 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31385 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31386 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31387 !Subtarget.hasXOP()) {
31388 int NumElts = VT.getVectorNumElements();
31389 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31390
31391 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31392 // isn't legal).
31393 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31394 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31395 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31396 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31397     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31398            "Constant build vector expected");
31399
31400 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31401 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31402 : DAG.getZExtOrTrunc(R, dl, ExVT);
31403 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31404 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31405 return DAG.getZExtOrTrunc(R, dl, VT);
31406 }
31407
31408 SmallVector<SDValue, 16> LoAmt, HiAmt;
31409 for (int i = 0; i != NumElts; i += 16) {
31410 for (int j = 0; j != 8; ++j) {
31411 LoAmt.push_back(Amt.getOperand(i + j));
31412 HiAmt.push_back(Amt.getOperand(i + j + 8));
31413 }
31414 }
31415
31416 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31417 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31418 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31419
31420 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31421 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31422 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31423 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31424 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31425 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31426 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31427 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31428 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31429 }
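  // [Editor's note, not part of the original source] Worked example of the
  // MUL-based vXi8 path above for SRA, ShiftAmt = 2, x = 0xF0 (-16): the
  // unpack + shift-by-8 leaves x sign-extended in an i16 lane (0xFFF0), the
  // multiply by 2^(8 - 2) = 64 gives 0xFC00, and the final logical shift by 8
  // leaves 0xFC = -4 = (-16) >> 2, with a zero upper byte ready for PACKUS.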
31430
31431 if (VT == MVT::v16i8 ||
31432 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31433 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31434 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31435
31436 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31437 if (VT.is512BitVector()) {
31438 // On AVX512BW targets we make use of the fact that VSELECT lowers
31439 // to a masked blend which selects bytes based just on the sign bit
31440 // extracted to a mask.
31441 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31442 V0 = DAG.getBitcast(VT, V0);
31443 V1 = DAG.getBitcast(VT, V1);
31444 Sel = DAG.getBitcast(VT, Sel);
31445 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31446 ISD::SETGT);
31447 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31448 } else if (Subtarget.hasSSE41()) {
31449 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31450 // on the sign bit.
31451 V0 = DAG.getBitcast(VT, V0);
31452 V1 = DAG.getBitcast(VT, V1);
31453 Sel = DAG.getBitcast(VT, Sel);
31454 return DAG.getBitcast(SelVT,
31455 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31456 }
31457 // On pre-SSE41 targets we test for the sign bit by comparing to
31458 // zero - a negative value will set all bits of the lanes to true
31459 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31460 SDValue Z = DAG.getConstant(0, dl, SelVT);
31461 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31462 return DAG.getSelect(dl, SelVT, C, V0, V1);
31463 };
31464
31465 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31466 // We can safely do this using i16 shifts as we're only interested in
31467 // the 3 lower bits of each byte.
31468 Amt = DAG.getBitcast(ExtVT, Amt);
31469 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31470 Amt = DAG.getBitcast(VT, Amt);
31471
31472 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31473 // r = VSELECT(r, shift(r, 4), a);
31474 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31475 R = SignBitSelect(VT, Amt, M, R);
31476
31477 // a += a
31478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31479
31480 // r = VSELECT(r, shift(r, 2), a);
31481 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31482 R = SignBitSelect(VT, Amt, M, R);
31483
31484 // a += a
31485 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31486
31487 // return VSELECT(r, shift(r, 1), a);
31488 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31489 R = SignBitSelect(VT, Amt, M, R);
31490 return R;
31491 }
31492
31493 if (Opc == ISD::SRA) {
31494 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31495 // so we can correctly sign extend. We don't care what happens to the
31496 // lower byte.
31497 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31498 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31499 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31500 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31501 ALo = DAG.getBitcast(ExtVT, ALo);
31502 AHi = DAG.getBitcast(ExtVT, AHi);
31503 RLo = DAG.getBitcast(ExtVT, RLo);
31504 RHi = DAG.getBitcast(ExtVT, RHi);
31505
31506 // r = VSELECT(r, shift(r, 4), a);
31507 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31508 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31509 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31510 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31511
31512 // a += a
31513 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31514 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31515
31516 // r = VSELECT(r, shift(r, 2), a);
31517 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31518 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31519 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31520 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31521
31522 // a += a
31523 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31524 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31525
31526 // r = VSELECT(r, shift(r, 1), a);
31527 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31528 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31529 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31530 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31531
31532 // Logical shift the result back to the lower byte, leaving a zero upper
31533 // byte meaning that we can safely pack with PACKUSWB.
31534 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31535 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31536 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31537 }
31538 }
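  // [Editor's note, not part of the original source] The vXi8 lowering above
  // decomposes a variable shift into three conditional steps: after Amt << 5,
  // bit 2 of each 3-bit amount sits in the byte's sign bit, so the first
  // blend applies a shift by 4 only where that bit is set; doubling Amt then
  // moves bit 1 (and afterwards bit 0) into the sign bit for the shift-by-2
  // and shift-by-1 steps. E.g. an amount of 5 (0b101) applies 4, skips 2,
  // and applies 1.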
31539
31540 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31541 MVT ExtVT = MVT::v8i32;
31542 SDValue Z = DAG.getConstant(0, dl, VT);
31543 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31544 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31545 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31546 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31547 ALo = DAG.getBitcast(ExtVT, ALo);
31548 AHi = DAG.getBitcast(ExtVT, AHi);
31549 RLo = DAG.getBitcast(ExtVT, RLo);
31550 RHi = DAG.getBitcast(ExtVT, RHi);
31551 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31552 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31553 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31554 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31555 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31556 }
31557
31558 if (VT == MVT::v8i16) {
31559 // If we have a constant shift amount, the non-SSE41 path is best as
31560     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31561 bool UseSSE41 = Subtarget.hasSSE41() &&
31562 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31563
31564 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31565 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31566 // the sign bit.
31567 if (UseSSE41) {
31568 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31569 V0 = DAG.getBitcast(ExtVT, V0);
31570 V1 = DAG.getBitcast(ExtVT, V1);
31571 Sel = DAG.getBitcast(ExtVT, Sel);
31572 return DAG.getBitcast(
31573 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31574 }
31575 // On pre-SSE41 targets we splat the sign bit - a negative value will
31576 // set all bits of the lanes to true and VSELECT uses that in
31577 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31578 SDValue C =
31579 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31580 return DAG.getSelect(dl, VT, C, V0, V1);
31581 };
31582
31583 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31584 if (UseSSE41) {
31585 // On SSE41 targets we need to replicate the shift mask in both
31586 // bytes for PBLENDVB.
31587 Amt = DAG.getNode(
31588 ISD::OR, dl, VT,
31589 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31590 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31591 } else {
31592 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31593 }
31594
31595 // r = VSELECT(r, shift(r, 8), a);
31596 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31597 R = SignBitSelect(Amt, M, R);
31598
31599 // a += a
31600 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31601
31602 // r = VSELECT(r, shift(r, 4), a);
31603 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31604 R = SignBitSelect(Amt, M, R);
31605
31606 // a += a
31607 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31608
31609 // r = VSELECT(r, shift(r, 2), a);
31610 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31611 R = SignBitSelect(Amt, M, R);
31612
31613 // a += a
31614 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31615
31616 // return VSELECT(r, shift(r, 1), a);
31617 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31618 R = SignBitSelect(Amt, M, R);
31619 return R;
31620 }
31621
31622 // Decompose 256-bit shifts into 128-bit shifts.
31623 if (VT.is256BitVector())
31624 return splitVectorIntBinary(Op, DAG);
31625
31626 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31627 return splitVectorIntBinary(Op, DAG);
31628
31629 return SDValue();
31630}
31631
31632static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31633 SelectionDAG &DAG) {
31634 MVT VT = Op.getSimpleValueType();
31635   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31636          "Unexpected funnel shift opcode!");
31637
31638 SDLoc DL(Op);
31639 SDValue Op0 = Op.getOperand(0);
31640 SDValue Op1 = Op.getOperand(1);
31641 SDValue Amt = Op.getOperand(2);
31642 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31643 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31644
31645 if (VT.isVector()) {
31646 APInt APIntShiftAmt;
31647 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31648
31649 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31650 if (IsFSHR)
31651 std::swap(Op0, Op1);
31652
31653 if (IsCstSplat) {
31654 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31655 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31656 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31657 {Op0, Op1, Imm}, DAG, Subtarget);
31658 }
31659 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31660 {Op0, Op1, Amt}, DAG, Subtarget);
31661 }
31662     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31663             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31664             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31665            "Unexpected funnel shift type!");
31666
31667     // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31668     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31669 if (IsCstSplat)
31670 return SDValue();
31671
31672 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31673 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31674 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31675
31676 // Constant vXi16 funnel shifts can be efficiently handled by default.
31677 if (IsCst && EltSizeInBits == 16)
31678 return SDValue();
31679
31680 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31681 unsigned NumElts = VT.getVectorNumElements();
31682 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31683 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31684
31685 // Split 256-bit integers on XOP/pre-AVX2 targets.
31686 // Split 512-bit integers on non 512-bit BWI targets.
31687 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31688 !Subtarget.hasAVX2())) ||
31689 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31690 EltSizeInBits < 32)) {
31691 // Pre-mask the amount modulo using the wider vector.
31692 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31693 return splitVectorOp(Op, DAG);
31694 }
31695
31696 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31697 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31698 int ScalarAmtIdx = -1;
31699 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31700 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31701 if (EltSizeInBits == 16)
31702 return SDValue();
31703
31704 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31705 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31706 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31707 ScalarAmtIdx, Subtarget, DAG);
31708 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31709 ScalarAmtIdx, Subtarget, DAG);
31710 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31711 }
31712 }
31713
31714 MVT WideSVT = MVT::getIntegerVT(
31715 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31716 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31717
31718     // If per-element shifts are legal, fall back to generic expansion.
31719 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31720 return SDValue();
31721
31722 // Attempt to fold as:
31723 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31724 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31725 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31726 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31727 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31728 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31729 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31730 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31731 EltSizeInBits, DAG);
31732 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31733 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31734 if (!IsFSHR)
31735 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31736 EltSizeInBits, DAG);
31737 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31738 }
31739
31740 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31741 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31742 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31743 SDValue Z = DAG.getConstant(0, DL, VT);
31744 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31745 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31746 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31747 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31748 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31749 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31750 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31751 }
31752
31753 // Fallback to generic expansion.
31754 return SDValue();
31755 }
31756   assert(
31757       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31758       "Unexpected funnel shift type!");
31759
31760 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31761 bool OptForSize = DAG.shouldOptForSize();
31762 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31763
31764 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31765 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31766 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31767 !isa<ConstantSDNode>(Amt)) {
31768 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31769 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31770 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31771 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31772 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31773 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31774 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31775 if (IsFSHR) {
31776 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31777 } else {
31778 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31779 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31780 }
31781 return DAG.getZExtOrTrunc(Res, DL, VT);
31782 }
31783
31784 if (VT == MVT::i8 || ExpandFunnel)
31785 return SDValue();
31786
31787   // i16 needs the shift amount masked explicitly; i32/i64 have an implicit modulo.
31788 if (VT == MVT::i16) {
31789 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31790 DAG.getConstant(15, DL, Amt.getValueType()));
31791 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31792 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31793 }
31794
31795 return Op;
31796}
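For scalar i8 (and slow-SHLD i16) the code above concatenates the operands into a 32-bit value, performs one shift by the masked amount, and extracts the relevant byte. A standalone sketch of that identity for i8, assuming the usual fshl/fshr semantics (illustrative; the helper names are invented):

#include <cstdint>

// fshl(x,y,z) for i8 via a single widened shift, mirroring the expansion above.
static uint8_t fshl8(uint8_t X, uint8_t Y, uint8_t Z) {
  uint32_t Concat = (uint32_t(X) << 8) | Y;  // (aext(x) << bw) | zext(y)
  Z &= 7;                                    // z & (bw-1)
  return uint8_t((Concat << Z) >> 8);        // take the high byte of the shifted pair
}

static uint8_t fshr8(uint8_t X, uint8_t Y, uint8_t Z) {
  uint32_t Concat = (uint32_t(X) << 8) | Y;
  Z &= 7;
  return uint8_t(Concat >> Z);               // take the low byte after the shift
}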
31797
31798static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31799 SelectionDAG &DAG) {
31800 MVT VT = Op.getSimpleValueType();
31801   assert(VT.isVector() && "Custom lowering only for vector rotates!");
31802
31803 SDLoc DL(Op);
31804 SDValue R = Op.getOperand(0);
31805 SDValue Amt = Op.getOperand(1);
31806 unsigned Opcode = Op.getOpcode();
31807 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31808 int NumElts = VT.getVectorNumElements();
31809 bool IsROTL = Opcode == ISD::ROTL;
31810
31811 // Check for constant splat rotation amount.
31812 APInt CstSplatValue;
31813 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31814
31815 // Check for splat rotate by zero.
31816 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31817 return R;
31818
31819 // AVX512 implicitly uses modulo rotation amounts.
31820 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31821 // Attempt to rotate by immediate.
31822 if (IsCstSplat) {
31823 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31824 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31825 return DAG.getNode(RotOpc, DL, VT, R,
31826 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31827 }
31828
31829 // Else, fall-back on VPROLV/VPRORV.
31830 return Op;
31831 }
31832
31833 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31834 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31835 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31836 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31837 }
31838
31839 SDValue Z = DAG.getConstant(0, DL, VT);
31840
31841 if (!IsROTL) {
31842     // If the ISD::ROTR amount is constant, we're always better off converting
31843     // to ISD::ROTL.
31844 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31845 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31846
31847     // XOP targets always prefer ISD::ROTL.
31848 if (Subtarget.hasXOP())
31849 return DAG.getNode(ISD::ROTL, DL, VT, R,
31850 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31851 }
31852
31853 // Split 256-bit integers on XOP/pre-AVX2 targets.
31854 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31855 return splitVectorIntBinary(Op, DAG);
31856
31857 // XOP has 128-bit vector variable + immediate rotates.
31858 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31859 // XOP implicitly uses modulo rotation amounts.
31860 if (Subtarget.hasXOP()) {
31861     assert(IsROTL && "Only ROTL expected");
31862     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31863
31864 // Attempt to rotate by immediate.
31865 if (IsCstSplat) {
31866 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31867 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31868 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31869 }
31870
31871 // Use general rotate by variable (per-element).
31872 return Op;
31873 }
31874
31875   // Rotate by a uniform constant - expand back to shifts.
31876 if (IsCstSplat)
31877 return SDValue();
31878
31879 // Split 512-bit integers on non 512-bit BWI targets.
31880 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31881 return splitVectorIntBinary(Op, DAG);
31882
31883   assert(
31884       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31885        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31886         Subtarget.hasAVX2()) ||
31887        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31888       "Only vXi32/vXi16/vXi8 vector rotates supported");
31889
31890 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31891 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31892
31893 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31894 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31895
31896 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31897 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31898 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31899 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31900 int BaseRotAmtIdx = -1;
31901 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31902 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31903 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31904 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31905 }
31906 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31907 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31908 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31909 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31910 BaseRotAmtIdx, Subtarget, DAG);
31911 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31912 BaseRotAmtIdx, Subtarget, DAG);
31913 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31914 }
31915 }
31916
31917 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31918 // the amount bit.
31919 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31920 if (EltSizeInBits == 8) {
31921 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31922 MVT WideVT =
31923 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31924 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31925
31926 // Attempt to fold as:
31927 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31928 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31929 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31930 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31931 // If we're rotating by constant, just use default promotion.
31932 if (IsConstAmt)
31933 return SDValue();
31934 // See if we can perform this by widening to vXi16 or vXi32.
31935 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31936 R = DAG.getNode(
31937 ISD::OR, DL, WideVT, R,
31938 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31939 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31940 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31941 if (IsROTL)
31942 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31943 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31944 }
31945
31946 // Attempt to fold as unpack(x,x) << zext(y):
31947 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31948 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31949 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31950 // See if we can perform this by unpacking to lo/hi vXi16.
31951 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31952 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31953 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31954 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31955 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31956 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31957 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31958 }
31959     assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31960
31961 // We don't need ModuloAmt here as we just peek at individual bits.
31962 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31963 if (Subtarget.hasSSE41()) {
31964 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31965 // on the sign bit.
31966 V0 = DAG.getBitcast(VT, V0);
31967 V1 = DAG.getBitcast(VT, V1);
31968 Sel = DAG.getBitcast(VT, Sel);
31969 return DAG.getBitcast(SelVT,
31970 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31971 }
31972 // On pre-SSE41 targets we test for the sign bit by comparing to
31973 // zero - a negative value will set all bits of the lanes to true
31974 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31975 SDValue Z = DAG.getConstant(0, DL, SelVT);
31976 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31977 return DAG.getSelect(DL, SelVT, C, V0, V1);
31978 };
31979
31980 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31981 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31982 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31983 IsROTL = true;
31984 }
31985
31986 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31987 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31988
31989 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31990 // We can safely do this using i16 shifts as we're only interested in
31991 // the 3 lower bits of each byte.
31992 Amt = DAG.getBitcast(ExtVT, Amt);
31993 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31994 Amt = DAG.getBitcast(VT, Amt);
31995
31996 // r = VSELECT(r, rot(r, 4), a);
31997 SDValue M;
31998 M = DAG.getNode(
31999 ISD::OR, DL, VT,
32000 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
32001 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
32002 R = SignBitSelect(VT, Amt, M, R);
32003
32004 // a += a
32005 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32006
32007 // r = VSELECT(r, rot(r, 2), a);
32008 M = DAG.getNode(
32009 ISD::OR, DL, VT,
32010 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
32011 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
32012 R = SignBitSelect(VT, Amt, M, R);
32013
32014 // a += a
32015 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32016
32017 // return VSELECT(r, rot(r, 1), a);
32018 M = DAG.getNode(
32019 ISD::OR, DL, VT,
32020 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
32021 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
32022 return SignBitSelect(VT, Amt, M, R);
32023 }
32024
32025 bool IsSplatAmt = DAG.isSplatValue(Amt);
32026 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32027 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
32028 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
32029
32030 // Fallback for splats + all supported variable shifts.
32031   // Fallback for non-constant AVX2 vXi16 as well.
32032 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32033 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32034 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32035 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32036 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32037 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32038 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32039 }
32040
32041 // Everything below assumes ISD::ROTL.
32042 if (!IsROTL) {
32043 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32044 IsROTL = true;
32045 }
32046
32047 // ISD::ROT* uses modulo rotate amounts.
32048 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32049
32050   assert(IsROTL && "Only ROTL supported");
32051
32052 // As with shifts, attempt to convert the rotation amount to a multiplication
32053 // factor, fallback to general expansion.
32054 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32055 if (!Scale)
32056 return SDValue();
32057
32058 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
32059 if (EltSizeInBits == 16) {
32060 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32061 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32062 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32063 }
32064
32065 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32066 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32067 // that can then be OR'd with the lower 32-bits.
32068   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32069 static const int OddMask[] = {1, -1, 3, -1};
32070 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32071 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32072
32073 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32074 DAG.getBitcast(MVT::v2i64, R),
32075 DAG.getBitcast(MVT::v2i64, Scale));
32076 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32077 DAG.getBitcast(MVT::v2i64, R13),
32078 DAG.getBitcast(MVT::v2i64, Scale13));
32079 Res02 = DAG.getBitcast(VT, Res02);
32080 Res13 = DAG.getBitcast(VT, Res13);
32081
32082 return DAG.getNode(ISD::OR, DL, VT,
32083 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32084 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32085}
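The tail of LowerRotate converts each rotate amount into a scale factor (1 << amt) and forms the result from the low and high halves of an unsigned multiply, which is what the ISD::MUL/ISD::MULHU pair above computes per lane. A standalone 16-bit scalar sketch of the identity (illustrative only; the helper name is invented):

#include <cstdint>

// rotl16(x, n) expressed as low-product OR high-product of x * 2^n, mirroring
// the MUL/MULHU combination used above for v8i16/v16i16.
static uint16_t rotl16ViaMul(uint16_t X, unsigned N) {
  uint32_t Scale = 1u << (N & 15);
  uint32_t Prod = uint32_t(X) * Scale;       // 32-bit product of two 16-bit values
  uint16_t Lo = uint16_t(Prod);              // ISD::MUL result (shifted-out-left bits lost)
  uint16_t Hi = uint16_t(Prod >> 16);        // ISD::MULHU result (the wrapped-around bits)
  return Lo | Hi;
}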
32086
32087/// Returns true if the operand type is exactly twice the native width, and
32088/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32089/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32090/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32091bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32092 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32093
32094 if (OpWidth == 64)
32095 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32096 if (OpWidth == 128)
32097 return Subtarget.canUseCMPXCHG16B();
32098
32099 return false;
32100}
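needsCmpXchgNb asks whether the access is exactly twice the native width and the matching cmpxchg8b/cmpxchg16b is available; only then are oversized atomics expanded inline instead of becoming library calls. A hedged source-level illustration (the type and function names are invented, and the inline expansion assumes cmpxchg16b is available to the compiler):

#include <atomic>
#include <cstdint>

struct Pair { uint64_t A, B; };   // 128-bit, trivially copyable payload

// On x86-64 with cmpxchg16b available, an atomic RMW on a 16-byte object
// (twice the native width) can be expanded through a cmpxchg16b loop rather
// than a __sync/__atomic library call; 8-byte objects on 32-bit targets play
// the analogous role via cmpxchg8b.
Pair swapPair(std::atomic<Pair> &P, Pair New) {
  return P.exchange(New);
}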
32101
32102TargetLoweringBase::AtomicExpansionKind
32103X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32104 Type *MemType = SI->getValueOperand()->getType();
32105
32106 bool NoImplicitFloatOps =
32107 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32108 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32109 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32110 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32111 return AtomicExpansionKind::None;
32112
32113 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32114 : AtomicExpansionKind::None;
32115}
32116
32117// Note: this turns large loads into lock cmpxchg8b/16b.
32118// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32119TargetLowering::AtomicExpansionKind
32120X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32121 Type *MemType = LI->getType();
32122
32123   // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32124 // can use movq to do the load. If we have X87 we can load into an 80-bit
32125 // X87 register and store it to a stack temporary.
32126 bool NoImplicitFloatOps =
32127 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32128 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32129 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32130 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32131 return AtomicExpansionKind::None;
32132
32133 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32134 : AtomicExpansionKind::None;
32135}
32136
32137enum BitTestKind : unsigned {
32138 UndefBit,
32139 ConstantBit,
32140 NotConstantBit,
32141 ShiftBit,
32142 NotShiftBit
32143};
32144
32145static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32146 using namespace llvm::PatternMatch;
32147 BitTestKind BTK = UndefBit;
32148 auto *C = dyn_cast<ConstantInt>(V);
32149 if (C) {
32150     // Check if V is a power of 2 or the NOT of a power of 2.
32151 if (isPowerOf2_64(C->getZExtValue()))
32152 BTK = ConstantBit;
32153 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32154 BTK = NotConstantBit;
32155 return {V, BTK};
32156 }
32157
32158 // Check if V is some power of 2 pattern known to be non-zero
32159 auto *I = dyn_cast<Instruction>(V);
32160 if (I) {
32161 bool Not = false;
32162 // Check if we have a NOT
32163 Value *PeekI;
32164 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32165 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32166 Not = true;
32167 I = dyn_cast<Instruction>(PeekI);
32168
32169       // If I is constant, it will fold and we can evaluate it later. If it's an
32170       // argument or something of that nature, we can't analyze it.
32171 if (I == nullptr)
32172 return {nullptr, UndefBit};
32173 }
32174     // We can only use 1 << X without more sophisticated analysis. C << X, where
32175     // C is a power of 2 but not 1, can result in zero, which cannot be translated
32176     // to a bittest. Likewise, any C >> X (either arithmetic or logical) can be zero.
32177 if (I->getOpcode() == Instruction::Shl) {
32178 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32179 // -X` and some other provable power of 2 patterns that we can use CTZ on
32180 // may be profitable.
32181 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32182 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32183 // be provably a non-zero power of 2.
32184 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32185 // transformable to bittest.
32186 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32187 if (!ShiftVal)
32188 return {nullptr, UndefBit};
32189 if (ShiftVal->equalsInt(1))
32190 BTK = Not ? NotShiftBit : ShiftBit;
32191
32192 if (BTK == UndefBit)
32193 return {nullptr, UndefBit};
32194
32195 Value *BitV = I->getOperand(1);
32196
32197 Value *AndOp;
32198 const APInt *AndC;
32199 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32200         // Read past a shift-mask instruction to find the count.
32201 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32202 BitV = AndOp;
32203 }
32204 return {BitV, BTK};
32205 }
32206 }
32207 return {nullptr, UndefBit};
32208}
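FindSingleBitChange classifies an atomicrmw value operand into the BitTestKind shapes above. At the C++ source level those shapes roughly correspond to the operands produced by idioms like the following (a hedged illustration; the names are invented):

#include <atomic>
#include <cstdint>

void bitKindExamples(std::atomic<uint32_t> &Flags, unsigned N) {
  Flags.fetch_or(0x10u);                  // ConstantBit: constant power of 2
  Flags.fetch_and(~0x10u);                // NotConstantBit: ~(power of 2)
  Flags.fetch_or(1u << (N & 31));         // ShiftBit: 1 << X (the & 31 is read past)
  Flags.fetch_and(~(1u << (N & 31)));     // NotShiftBit: ~(1 << X)
}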
32209
32210TargetLowering::AtomicExpansionKind
32211X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32212 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32213 // prefix to a normal instruction for these operations.
32214 if (AI->use_empty())
32215 return AtomicExpansionKind::None;
32216
32217 // If the atomicrmw's result is used by a single bit AND, we may use
32218 // bts/btr/btc instruction for these operations.
32219 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32220 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32221   // (depending on CC). That rewritten pattern could still use bts/btr/btc, but
32222   // we don't detect it.
32223 Instruction *I = AI->user_back();
32224 auto BitChange = FindSingleBitChange(AI->getValOperand());
32225 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32226 I->getOpcode() != Instruction::And ||
32227 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32228 AI->getParent() != I->getParent())
32229 return AtomicExpansionKind::CmpXChg;
32230
32231 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32232
32233   // This is a redundant AND; it should get cleaned up elsewhere.
32234 if (AI == I->getOperand(OtherIdx))
32235 return AtomicExpansionKind::CmpXChg;
32236
32237   // The following instruction must be an AND with a single bit.
32238 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32239 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32240 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32241 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32242 return AtomicExpansionKind::CmpXChg;
32243 }
32244 if (AI->getOperation() == AtomicRMWInst::And) {
32245 return ~C1->getValue() == C2->getValue()
32246 ? AtomicExpansionKind::BitTestIntrinsic
32247 : AtomicExpansionKind::CmpXChg;
32248 }
32249 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32250 : AtomicExpansionKind::CmpXChg;
32251 }
32252
32253   assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32254
32255 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32256 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32257 return AtomicExpansionKind::CmpXChg;
32258
32259   assert(BitChange.first != nullptr && BitTested.first != nullptr);
32260
32261 // If shift amounts are not the same we can't use BitTestIntrinsic.
32262 if (BitChange.first != BitTested.first)
32263 return AtomicExpansionKind::CmpXChg;
32264
32265   // For an atomic AND, the value must mask all but one bit and the following
32266   // AND must test the one bit that is unset in the mask.
32267 if (AI->getOperation() == AtomicRMWInst::And)
32268 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32269 ? AtomicExpansionKind::BitTestIntrinsic
32270 : AtomicExpansionKind::CmpXChg;
32271
32272   // For an atomic XOR/OR, it must be setting and testing the same bit.
32273 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32274 ? AtomicExpansionKind::BitTestIntrinsic
32275 : AtomicExpansionKind::CmpXChg;
32276}
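shouldExpandLogicAtomicRMWInIR only chooses BitTestIntrinsic when the atomicrmw result feeds a single AND that tests the very bit being changed. A source-level idiom that is expected to match that shape, and hence lower to a single lock bts rather than a cmpxchg loop (a hedged sketch; the names are invented):

#include <atomic>
#include <cstdint>

// Set bit N and report whether it was already set. The atomicrmw result is
// consumed by one AND against the same single-bit mask, which is the shape
// the code above maps onto the x86 bit-test intrinsics.
bool testAndSetBit(std::atomic<uint32_t> &Flags, unsigned N) {
  uint32_t Mask = 1u << (N & 31);
  return (Flags.fetch_or(Mask) & Mask) != 0;
}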
32277
32278void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32279 IRBuilder<> Builder(AI);
32280 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32281 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32282 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32283 switch (AI->getOperation()) {
32284 default:
32285     llvm_unreachable("Unknown atomic operation");
32286 case AtomicRMWInst::Or:
32287 IID_C = Intrinsic::x86_atomic_bts;
32288 IID_I = Intrinsic::x86_atomic_bts_rm;
32289 break;
32290 case AtomicRMWInst::Xor:
32291 IID_C = Intrinsic::x86_atomic_btc;
32292 IID_I = Intrinsic::x86_atomic_btc_rm;
32293 break;
32294 case AtomicRMWInst::And:
32295 IID_C = Intrinsic::x86_atomic_btr;
32296 IID_I = Intrinsic::x86_atomic_btr_rm;
32297 break;
32298 }
32299 Instruction *I = AI->user_back();
32300 LLVMContext &Ctx = AI->getContext();
32301 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32302 Type::getInt8PtrTy(Ctx));
32303 Function *BitTest = nullptr;
32304 Value *Result = nullptr;
32305 auto BitTested = FindSingleBitChange(AI->getValOperand());
32306   assert(BitTested.first != nullptr);
32307
32308 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32309 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32310
32311 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32312
32313 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32314 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32315 } else {
32316 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32317
32318     assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32319
32320 Value *SI = BitTested.first;
32321     assert(SI != nullptr);
32322
32323     // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
32324     // need to mask it.
32325 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32326 Value *BitPos =
32327 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
32328     // Todo(1): In many cases it may be provable that SI is less than
32329     // ShiftBits, in which case this mask is unnecessary.
32330 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32331 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32332 // favor of just a raw BT{S|R|C}.
32333
32334 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32335 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32336
32337     // If the result is only used for zero/non-zero status then we don't need
32338     // to shift the value back. Otherwise do so.
32339 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32340 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32341 if (ICmp->isEquality()) {
32342 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32343 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32344 if (C0 || C1) {
32345             assert(C0 == nullptr || C1 == nullptr);
32346 if ((C0 ? C0 : C1)->isZero())
32347 continue;
32348 }
32349 }
32350 }
32351 Result = Builder.CreateShl(Result, BitPos);
32352 break;
32353 }
32354 }
32355
32356 I->replaceAllUsesWith(Result);
32357 I->eraseFromParent();
32358 AI->eraseFromParent();
32359}
32360
32361static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32362 using namespace llvm::PatternMatch;
32363 if (!AI->hasOneUse())
32364 return false;
32365
32366 Value *Op = AI->getOperand(1);
32367 ICmpInst::Predicate Pred;
32368 Instruction *I = AI->user_back();
32369 AtomicRMWInst::BinOp Opc = AI->getOperation();
32370 if (Opc == AtomicRMWInst::Add) {
32371 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32372 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32373 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32374 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32375 return Pred == CmpInst::ICMP_SLT;
32376 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32377 return Pred == CmpInst::ICMP_SGT;
32378 }
32379 return false;
32380 }
32381 if (Opc == AtomicRMWInst::Sub) {
32382 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32383 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32384 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32385 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32386 return Pred == CmpInst::ICMP_SLT;
32387 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32388 return Pred == CmpInst::ICMP_SGT;
32389 }
32390 return false;
32391 }
32392 if ((Opc == AtomicRMWInst::Or &&
32393 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32394 (Opc == AtomicRMWInst::And &&
32395 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32396 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32397 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32398 Pred == CmpInst::ICMP_SLT;
32399 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32400 return Pred == CmpInst::ICMP_SGT;
32401 return false;
32402 }
32403 if (Opc == AtomicRMWInst::Xor) {
32404 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32405 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32406 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32407 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32408 return Pred == CmpInst::ICMP_SLT;
32409 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32410 return Pred == CmpInst::ICMP_SGT;
32411 }
32412 return false;
32413 }
32414
32415 return false;
32416}
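shouldExpandCmpArithRMWInIR matches atomic arithmetic whose only observable use is a comparison that maps onto a condition code, so the whole sequence can become one locked instruction plus a flag test. The classic reference-count release is one such shape (a hedged sketch; the name is invented):

#include <atomic>

// Decrement a reference count and detect the transition to zero. Comparing
// the old value against the subtracted amount is the sub-and-test shape the
// code above recognizes, allowing a lock sub + ZF test instead of a cmpxchg
// loop.
bool releaseRef(std::atomic<int> &RefCount) {
  return RefCount.fetch_sub(1, std::memory_order_acq_rel) == 1;
}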
32417
32418void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32419 AtomicRMWInst *AI) const {
32420 IRBuilder<> Builder(AI);
32421 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32422 Instruction *TempI = nullptr;
32423 LLVMContext &Ctx = AI->getContext();
32424 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32425 if (!ICI) {
32426 TempI = AI->user_back();
32427     assert(TempI->hasOneUse() && "Must have one use");
32428 ICI = cast<ICmpInst>(TempI->user_back());
32429 }
32430 X86::CondCode CC = X86::COND_INVALID;
32431 ICmpInst::Predicate Pred = ICI->getPredicate();
32432 switch (Pred) {
32433 default:
32434     llvm_unreachable("Not supported Pred");
32435 case CmpInst::ICMP_EQ:
32436 CC = X86::COND_E;
32437 break;
32438 case CmpInst::ICMP_NE:
32439 CC = X86::COND_NE;
32440 break;
32441 case CmpInst::ICMP_SLT:
32442 CC = X86::COND_S;
32443 break;
32444 case CmpInst::ICMP_SGT:
32445 CC = X86::COND_NS;
32446 break;
32447 }
32448 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32449 switch (AI->getOperation()) {
32450 default:
32451     llvm_unreachable("Unknown atomic operation");
32452 case AtomicRMWInst::Add:
32453 IID = Intrinsic::x86_atomic_add_cc;
32454 break;
32455 case AtomicRMWInst::Sub:
32456 IID = Intrinsic::x86_atomic_sub_cc;
32457 break;
32458 case AtomicRMWInst::Or:
32459 IID = Intrinsic::x86_atomic_or_cc;
32460 break;
32461 case AtomicRMWInst::And:
32462 IID = Intrinsic::x86_atomic_and_cc;
32463 break;
32464 case AtomicRMWInst::Xor:
32465 IID = Intrinsic::x86_atomic_xor_cc;
32466 break;
32467 }
32468 Function *CmpArith =
32469 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32470 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32471 Type::getInt8PtrTy(Ctx));
32472 Value *Call = Builder.CreateCall(
32473 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32474 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32475 ICI->replaceAllUsesWith(Result);
32476 ICI->eraseFromParent();
32477 if (TempI)
32478 TempI->eraseFromParent();
32479 AI->eraseFromParent();
32480}
32481
32482TargetLowering::AtomicExpansionKind
32483X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32484 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32485 Type *MemType = AI->getType();
32486
32487 // If the operand is too big, we must see if cmpxchg8/16b is available
32488 // and default to library calls otherwise.
32489 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32490 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32491 : AtomicExpansionKind::None;
32492 }
32493
32494 AtomicRMWInst::BinOp Op = AI->getOperation();
32495 switch (Op) {
32496 case AtomicRMWInst::Xchg:
32497 return AtomicExpansionKind::None;
32498 case AtomicRMWInst::Add:
32499 case AtomicRMWInst::Sub:
32500 if (shouldExpandCmpArithRMWInIR(AI))
32501 return AtomicExpansionKind::CmpArithIntrinsic;
32502 // It's better to use xadd, xsub or xchg for these in other cases.
32503 return AtomicExpansionKind::None;
32504 case AtomicRMWInst::Or:
32505 case AtomicRMWInst::And:
32506 case AtomicRMWInst::Xor:
32507 if (shouldExpandCmpArithRMWInIR(AI))
32508 return AtomicExpansionKind::CmpArithIntrinsic;
32509 return shouldExpandLogicAtomicRMWInIR(AI);
32510 case AtomicRMWInst::Nand:
32511 case AtomicRMWInst::Max:
32512 case AtomicRMWInst::Min:
32513 case AtomicRMWInst::UMax:
32514 case AtomicRMWInst::UMin:
32515 case AtomicRMWInst::FAdd:
32516 case AtomicRMWInst::FSub:
32517 case AtomicRMWInst::FMax:
32518 case AtomicRMWInst::FMin:
32519 case AtomicRMWInst::UIncWrap:
32520 case AtomicRMWInst::UDecWrap:
32521 default:
32522 // These always require a non-trivial set of data operations on x86. We must
32523 // use a cmpxchg loop.
32524 return AtomicExpansionKind::CmpXChg;
32525 }
32526}
32527
32528LoadInst *
32529X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32530 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32531 Type *MemType = AI->getType();
32532 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32533 // there is no benefit in turning such RMWs into loads, and it is actually
32534   // harmful as it introduces an mfence.
32535 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32536 return nullptr;
32537
32538 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32539 // lowering available in lowerAtomicArith.
32540 // TODO: push more cases through this path.
32541 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32542 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32543 AI->use_empty())
32544 return nullptr;
32545
32546 IRBuilder<> Builder(AI);
32547 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32548 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32549 auto SSID = AI->getSyncScopeID();
32550 // We must restrict the ordering to avoid generating loads with Release or
32551 // ReleaseAcquire orderings.
32552 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32553
32554 // Before the load we need a fence. Here is an example lifted from
32555 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32556 // is required:
32557 // Thread 0:
32558 // x.store(1, relaxed);
32559 // r1 = y.fetch_add(0, release);
32560 // Thread 1:
32561 // y.fetch_add(42, acquire);
32562 // r2 = x.load(relaxed);
32563 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32564   // lowered to just a load without a fence. An mfence flushes the store buffer,
32565 // making the optimization clearly correct.
32566 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32567 // otherwise, we might be able to be more aggressive on relaxed idempotent
32568 // rmw. In practice, they do not look useful, so we don't try to be
32569 // especially clever.
32570 if (SSID == SyncScope::SingleThread)
32571 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32572 // the IR level, so we must wrap it in an intrinsic.
32573 return nullptr;
32574
32575 if (!Subtarget.hasMFence())
32576 // FIXME: it might make sense to use a locked operation here but on a
32577 // different cache-line to prevent cache-line bouncing. In practice it
32578 // is probably a small win, and x86 processors without mfence are rare
32579 // enough that we do not bother.
32580 return nullptr;
32581
32582 Function *MFence =
32583 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32584 Builder.CreateCall(MFence, {});
32585
32586 // Finally we can emit the atomic load.
32587 LoadInst *Loaded = Builder.CreateAlignedLoad(
32588 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32589 Loaded->setAtomic(Order, SSID);
32590 AI->replaceAllUsesWith(Loaded);
32591 AI->eraseFromParent();
32592 return Loaded;
32593}
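lowerIdempotentRMWIntoFencedLoad turns a no-op RMW, used only for its ordering and return value, into a fence followed by an ordinary atomic load. A source-level sketch of the idiom it targets and of the conceptual rewrite (illustrative only; std::atomic_thread_fence stands in for the MFENCE emitted above):

#include <atomic>

// An idempotent RMW used purely for its ordering and read-back...
int readWithRMW(std::atomic<int> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);
}

// ...is conceptually rewritten to fence + load, as done above at the IR level.
int readWithFence(std::atomic<int> &X) {
  std::atomic_thread_fence(std::memory_order_seq_cst); // stands in for the mfence
  return X.load(std::memory_order_seq_cst);
}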
32594
32595bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32596 if (!SI.isUnordered())
32597 return false;
32598 return ExperimentalUnorderedISEL;
32599}
32600bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32601 if (!LI.isUnordered())
32602 return false;
32603 return ExperimentalUnorderedISEL;
32604}
32605
32606
32607/// Emit a locked operation on a stack location which does not change any
32608/// memory location, but does involve a lock prefix. Location is chosen to be
32609/// a) very likely accessed only by a single thread to minimize cache traffic,
32610/// and b) definitely dereferenceable. Returns the new Chain result.
32611static SDValue emitLockedStackOp(SelectionDAG &DAG,
32612 const X86Subtarget &Subtarget, SDValue Chain,
32613 const SDLoc &DL) {
32614 // Implementation notes:
32615 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32616 // operations issued by the current processor. As such, the location
32617 // referenced is not relevant for the ordering properties of the instruction.
32618   // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32619 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32620 // 2) Using an immediate operand appears to be the best encoding choice
32621 // here since it doesn't require an extra register.
32622 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32623 // is small enough it might just be measurement noise.)
32624 // 4) When choosing offsets, there are several contributing factors:
32625 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32626 // line aligned stack object to improve this case.)
32627 // b) To minimize our chances of introducing a false dependence, we prefer
32628 // to offset the stack usage from TOS slightly.
32629 // c) To minimize concerns about cross thread stack usage - in particular,
32630 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32631 // captures state in the TOS frame and accesses it from many threads -
32632 // we want to use an offset such that the offset is in a distinct cache
32633 // line from the TOS frame.
32634 //
32635 // For a general discussion of the tradeoffs and benchmark results, see:
32636 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32637
32638 auto &MF = DAG.getMachineFunction();
32639 auto &TFL = *Subtarget.getFrameLowering();
32640 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32641
32642 if (Subtarget.is64Bit()) {
32643 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32644 SDValue Ops[] = {
32645 DAG.getRegister(X86::RSP, MVT::i64), // Base
32646 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32647 DAG.getRegister(0, MVT::i64), // Index
32648 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32649 DAG.getRegister(0, MVT::i16), // Segment.
32650 Zero,
32651 Chain};
32652 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32653 MVT::Other, Ops);
32654 return SDValue(Res, 1);
32655 }
32656
32657 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32658 SDValue Ops[] = {
32659 DAG.getRegister(X86::ESP, MVT::i32), // Base
32660 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32661 DAG.getRegister(0, MVT::i32), // Index
32662 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32663 DAG.getRegister(0, MVT::i16), // Segment.
32664 Zero,
32665 Chain
32666 };
32667 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32668 MVT::Other, Ops);
32669 return SDValue(Res, 1);
32670}
32671
32672static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32673 SelectionDAG &DAG) {
32674 SDLoc dl(Op);
32675 AtomicOrdering FenceOrdering =
32676 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32677 SyncScope::ID FenceSSID =
32678 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32679
32680 // The only fence that needs an instruction is a sequentially-consistent
32681 // cross-thread fence.
32682 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32683 FenceSSID == SyncScope::System) {
32684 if (Subtarget.hasMFence())
32685 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32686
32687 SDValue Chain = Op.getOperand(0);
32688 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32689 }
32690
32691 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32692 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32693}
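LowerATOMIC_FENCE only emits a real instruction, MFENCE or the locked stack OR, for a sequentially consistent cross-thread fence; weaker fences become a compiler-only MEMBARRIER. At the source level the distinction looks roughly like this (illustrative):

#include <atomic>

void fenceExamples() {
  std::atomic_thread_fence(std::memory_order_seq_cst); // needs MFENCE or the locked stack op
  std::atomic_thread_fence(std::memory_order_acquire); // MEMBARRIER: codegens to a no-op
  std::atomic_thread_fence(std::memory_order_release); // MEMBARRIER: codegens to a no-op
}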
32694
32695static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32696 SelectionDAG &DAG) {
32697 MVT T = Op.getSimpleValueType();
32698 SDLoc DL(Op);
32699 unsigned Reg = 0;
32700 unsigned size = 0;
32701 switch(T.SimpleTy) {
32702   default: llvm_unreachable("Invalid value type!");
32703 case MVT::i8: Reg = X86::AL; size = 1; break;
32704 case MVT::i16: Reg = X86::AX; size = 2; break;
32705 case MVT::i32: Reg = X86::EAX; size = 4; break;
32706 case MVT::i64:
32707     assert(Subtarget.is64Bit() && "Node not type legal!");
32708 Reg = X86::RAX; size = 8;
32709 break;
32710 }
32711 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32712 Op.getOperand(2), SDValue());
32713 SDValue Ops[] = { cpIn.getValue(0),
32714 Op.getOperand(1),
32715 Op.getOperand(3),
32716 DAG.getTargetConstant(size, DL, MVT::i8),
32717 cpIn.getValue(1) };
32718 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32719 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32720 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32721 Ops, T, MMO);
32722
32723 SDValue cpOut =
32724 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32725 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32726 MVT::i32, cpOut.getValue(2));
32727 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32728
32729 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32730 cpOut, Success, EFLAGS.getValue(1));
32731}
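
For orientation, the following sketch (not part of the LLVM source) shows the C++-level operation this lowering serves. The compare-exchange maps onto the sequence above: the expected value is copied into AL/AX/EAX/RAX, LCMPXCHG_DAG performs the locked compare-and-swap, and success is recovered from ZF via SETCC(COND_E).

#include <atomic>
#include <cstdint>

// Returns true when Slot held Expected and was replaced by Desired; on
// failure Expected is refreshed from memory (the CopyFromReg of RAX above).
bool tryUpdate(std::atomic<uint64_t> &Slot, uint64_t &Expected,
               uint64_t Desired) {
  return Slot.compare_exchange_strong(Expected, Desired);
}
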
32732
32733// Create MOVMSKB, taking into account whether we need to split for AVX1.
32734static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32735 const X86Subtarget &Subtarget) {
32736 MVT InVT = V.getSimpleValueType();
32737
32738 if (InVT == MVT::v64i8) {
32739 SDValue Lo, Hi;
32740 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32741 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32742 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32743 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32744 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32745 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32746 DAG.getConstant(32, DL, MVT::i8));
32747 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32748 }
32749 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32750 SDValue Lo, Hi;
32751 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32752 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32753 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32754 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32755 DAG.getConstant(16, DL, MVT::i8));
32756 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32757 }
32758
32759 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32760}
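
The same split-and-merge idea, written with SSE2 intrinsics as an illustrative sketch (not part of the LLVM source), assuming the input has already been split into two 128-bit halves as in the v32i8-without-Int256 path above:

#include <cstdint>
#include <emmintrin.h> // SSE2: _mm_movemask_epi8

// Collect the sign bit of each of 32 bytes into one 32-bit mask.
static inline uint32_t movemask32(__m128i Lo, __m128i Hi) {
  uint32_t LoMask = static_cast<uint32_t>(_mm_movemask_epi8(Lo)); // bits 0..15
  uint32_t HiMask = static_cast<uint32_t>(_mm_movemask_epi8(Hi)); // bits 16..31
  return LoMask | (HiMask << 16); // mirrors the SHL-by-16 + OR above
}
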
32761
32762static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32763 SelectionDAG &DAG) {
32764 SDValue Src = Op.getOperand(0);
32765 MVT SrcVT = Src.getSimpleValueType();
32766 MVT DstVT = Op.getSimpleValueType();
32767
32768 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32769 // half to v32i1 and concatenating the result.
32770 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32771 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32772 assert(Subtarget.hasBWI() && "Expected BWI target");
32773 SDLoc dl(Op);
32774 SDValue Lo, Hi;
32775 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32776 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32777 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32778 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32779 }
32780
32781 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32782 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32783 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32784 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32785 SDLoc DL(Op);
32786 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32787 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32788 return DAG.getZExtOrTrunc(V, DL, DstVT);
32789 }
32790
32791 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32792 SrcVT == MVT::i64) && "Unexpected VT!");
32793
32794 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32795 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32796 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32797 // This conversion needs to be expanded.
32798 return SDValue();
32799
32800 SDLoc dl(Op);
32801 if (SrcVT.isVector()) {
32802 // Widen the input vector in the case of MVT::v2i32.
32803 // Example: from MVT::v2i32 to MVT::v4i32.
32804 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32805 SrcVT.getVectorNumElements() * 2);
32806 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32807 DAG.getUNDEF(SrcVT));
32808 } else {
32809 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32810 "Unexpected source type in LowerBITCAST");
32811 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32812 }
32813
32814 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32815 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32816
32817 if (DstVT == MVT::x86mmx)
32818 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32819
32820 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32821 DAG.getIntPtrConstant(0, dl));
32822}
32823
32824/// Compute the horizontal sum of bytes in V for the elements of VT.
32825///
32826/// Requires V to be a byte vector and VT to be an integer vector type with
32827/// wider elements than V's type. The width of the elements of VT determines
32828/// how many bytes of V are summed horizontally to produce each element of the
32829/// result.
32830static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32831 const X86Subtarget &Subtarget,
32832 SelectionDAG &DAG) {
32833 SDLoc DL(V);
32834 MVT ByteVecVT = V.getSimpleValueType();
32835 MVT EltVT = VT.getVectorElementType();
32836 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32837 "Expected value to have byte element type.");
32838 assert(EltVT != MVT::i8 &&
32839 "Horizontal byte sum only makes sense for wider elements!");
32840 unsigned VecSize = VT.getSizeInBits();
32841 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32842
32843 // The PSADBW instruction horizontally adds all bytes and leaves the result
32844 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32845 if (EltVT == MVT::i64) {
32846 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32847 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32848 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32849 return DAG.getBitcast(VT, V);
32850 }
32851
32852 if (EltVT == MVT::i32) {
32853 // We unpack the low half and high half into i32s interleaved with zeros so
32854 // that we can use PSADBW to horizontally sum them. The most useful part of
32855 // this is that it lines up the results of two PSADBW instructions to be
32856 // two v2i64 vectors which concatenated are the 4 population counts. We can
32857 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32858 SDValue Zeros = DAG.getConstant(0, DL, VT);
32859 SDValue V32 = DAG.getBitcast(VT, V);
32860 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32861 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32862
32863 // Do the horizontal sums into two v2i64s.
32864 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32865 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32866 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32867 DAG.getBitcast(ByteVecVT, Low), Zeros);
32868 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32869 DAG.getBitcast(ByteVecVT, High), Zeros);
32870
32871 // Merge them together.
32872 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32873 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32874 DAG.getBitcast(ShortVecVT, Low),
32875 DAG.getBitcast(ShortVecVT, High));
32876
32877 return DAG.getBitcast(VT, V);
32878 }
32879
32880 // The only element type left is i16.
32881 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32882
32883 // To obtain the pop count for each i16 element starting from the pop count
32884 // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift the
32885 // i16s right by 8. It is important to shift as i16s because an i8 vector
32886 // shift isn't directly supported.
32887 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32888 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32889 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32890 DAG.getBitcast(ByteVecVT, V));
32891 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32892}
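
A scalar model of what this function computes, given as an illustrative sketch rather than anything taken from the LLVM source: each destination element of width EltBits is the sum of its EltBits/8 source bytes, which is what the PSADBW, PACKUS, and shift sequences above achieve vector-wide.

#include <cstdint>
#include <vector>

std::vector<uint64_t> horizontalByteSum(const std::vector<uint8_t> &Bytes,
                                        unsigned EltBits) {
  const unsigned BytesPerElt = EltBits / 8;
  std::vector<uint64_t> Out(Bytes.size() / BytesPerElt, 0);
  for (size_t i = 0; i < Bytes.size(); ++i)
    Out[i / BytesPerElt] += Bytes[i]; // group-wise horizontal add
  return Out;
}
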
32893
32894static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32895 const X86Subtarget &Subtarget,
32896 SelectionDAG &DAG) {
32897 MVT VT = Op.getSimpleValueType();
32898 MVT EltVT = VT.getVectorElementType();
32899 int NumElts = VT.getVectorNumElements();
32900 (void)EltVT;
32901 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32902
32903 // Implement a lookup table in register by using an algorithm based on:
32904 // http://wm.ite.pl/articles/sse-popcount.html
32905 //
32906 // The general idea is that every lower byte nibble in the input vector is an
32907 // index into an in-register pre-computed pop count table. We then split up the
32908 // input vector into two new ones: (1) a vector with only the shifted-right
32909 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32910 // masked out higher ones) for each byte. PSHUFB is used separately with both
32911 // to index the in-register table. Next, both are added and the result is an
32912 // i8 vector where each element contains the pop count for its input byte.
32913 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32914 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32915 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32916 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32917
32918 SmallVector<SDValue, 64> LUTVec;
32919 for (int i = 0; i < NumElts; ++i)
32920 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32921 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32922 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32923
32924 // High nibbles
32925 SDValue FourV = DAG.getConstant(4, DL, VT);
32926 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32927
32928 // Low nibbles
32929 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32930
32931 // The input vector is used as the shuffle mask that indexes elements into the
32932 // LUT. After counting low and high nibbles, add the two results to obtain the
32933 // final pop count per i8 element.
32934 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32935 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32936 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32937}
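
A scalar model of the in-register LUT algorithm, offered as an illustrative sketch (not part of the LLVM source). PSHUFB performs the table lookup for all bytes of the vector at once; here the same lookup is shown one byte at a time.

#include <cstdint>

uint8_t popcount8ViaLUT(uint8_t B) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  uint8_t Lo = B & 0x0F;        // low nibble, like the AND with 0x0F above
  uint8_t Hi = (B >> 4) & 0x0F; // high nibble, like the SRL by 4 above
  return LUT[Lo] + LUT[Hi];     // LoPopCnt + HiPopCnt
}
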
32938
32939// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32940// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32941static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32942 SelectionDAG &DAG) {
32943 MVT VT = Op.getSimpleValueType();
32944 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32945 "Unknown CTPOP type to handle");
32946 SDLoc DL(Op.getNode());
32947 SDValue Op0 = Op.getOperand(0);
32948
32949 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32950 if (Subtarget.hasVPOPCNTDQ()) {
32951 unsigned NumElems = VT.getVectorNumElements();
32952 assert((VT.getVectorElementType() == MVT::i8 ||
32953 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32954 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32955 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32956 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32957 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32958 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32959 }
32960 }
32961
32962 // Decompose 256-bit ops into smaller 128-bit ops.
32963 if (VT.is256BitVector() && !Subtarget.hasInt256())
32964 return splitVectorIntUnary(Op, DAG);
32965
32966 // Decompose 512-bit ops into smaller 256-bit ops.
32967 if (VT.is512BitVector() && !Subtarget.hasBWI())
32968 return splitVectorIntUnary(Op, DAG);
32969
32970 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32971 if (VT.getScalarType() != MVT::i8) {
32972 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32973 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32974 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32975 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32976 }
32977
32978 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32979 if (!Subtarget.hasSSSE3())
32980 return SDValue();
32981
32982 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32983}
32984
32985static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32986 SelectionDAG &DAG) {
32987 assert(Op.getSimpleValueType().isVector() &&
32988 "We only do custom lowering for vector population count.");
32989 return LowerVectorCTPOP(Op, Subtarget, DAG);
32990}
32991
32992static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32993 MVT VT = Op.getSimpleValueType();
32994 SDValue In = Op.getOperand(0);
32995 SDLoc DL(Op);
32996
32997 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32998 // perform the BITREVERSE.
32999 if (!VT.isVector()) {
33000 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
33001 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
33002 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
33003 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
33004 DAG.getIntPtrConstant(0, DL));
33005 }
33006
33007 int NumElts = VT.getVectorNumElements();
33008 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
33009
33010 // Decompose 256-bit ops into smaller 128-bit ops.
33011 if (VT.is256BitVector())
33012 return splitVectorIntUnary(Op, DAG);
33013
33014 assert(VT.is128BitVector() &&
33015 "Only 128-bit vector bitreverse lowering supported.");
33016
33017 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
33018 // perform the BSWAP in the shuffle.
33019 // It's best to shuffle using the second operand as this will implicitly allow
33020 // memory folding for multiple vectors.
33021 SmallVector<SDValue, 16> MaskElts;
33022 for (int i = 0; i != NumElts; ++i) {
33023 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
33024 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
33025 int PermuteByte = SourceByte | (2 << 5);
33026 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
33027 }
33028 }
33029
33030 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33031 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33032 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33033 Res, Mask);
33034 return DAG.getBitcast(VT, Res);
33035}
33036
33037static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33038 SelectionDAG &DAG) {
33039 MVT VT = Op.getSimpleValueType();
33040
33041 if (Subtarget.hasXOP() && !VT.is512BitVector())
33042 return LowerBITREVERSE_XOP(Op, DAG);
33043
33044 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33045
33046 SDValue In = Op.getOperand(0);
33047 SDLoc DL(Op);
33048
33049 assert(VT.getScalarType() == MVT::i8 &&
33050 "Only byte vector BITREVERSE supported");
33051
33052 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33053 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33054 return splitVectorIntUnary(Op, DAG);
33055
33056 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33057 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33058 return splitVectorIntUnary(Op, DAG);
33059
33060 unsigned NumElts = VT.getVectorNumElements();
33061
33062 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33063 if (Subtarget.hasGFNI()) {
33064 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33065 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33066 Matrix = DAG.getBitcast(VT, Matrix);
33067 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33068 DAG.getTargetConstant(0, DL, MVT::i8));
33069 }
33070
33071 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
33072 // two nibbles, and a PSHUFB lookup is used to find the bitreverse of each
33073 // 0-15 value (moved to the other nibble).
33074 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33075 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33076 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33077
33078 const int LoLUT[16] = {
33079 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33080 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33081 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33082 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33083 const int HiLUT[16] = {
33084 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33085 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33086 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33087 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
33088
33089 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33090 for (unsigned i = 0; i < NumElts; ++i) {
33091 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33092 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33093 }
33094
33095 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33096 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33097 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33098 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33099 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33100}
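
A scalar model of the PSHUFB byte bit-reverse, again an illustrative sketch rather than code from the LLVM tree: LoLUT maps a low nibble to its reversed value already placed in the high nibble, HiLUT maps a high nibble to its reversed value in the low nibble, and the OR recombines them.

#include <cstdint>

uint8_t bitreverse8ViaLUT(uint8_t B) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                    0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0x0F] | HiLUT[B >> 4]; // e.g. 0x01 -> 0x80, 0x80 -> 0x01
}
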
33101
33102static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33103 SelectionDAG &DAG) {
33104 SDLoc DL(Op);
33105 SDValue X = Op.getOperand(0);
33106 MVT VT = Op.getSimpleValueType();
33107
33108 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
33109 if (VT == MVT::i8 ||
33110 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33111 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33112 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33113 DAG.getConstant(0, DL, MVT::i8));
33114 // Copy the inverse of the parity flag into a register with setcc.
33115 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33116 // Extend to the original type.
33117 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33118 }
33119
33120 // If we have POPCNT, use the default expansion.
33121 if (Subtarget.hasPOPCNT())
33122 return SDValue();
33123
33124 if (VT == MVT::i64) {
33125 // Xor the high and low 32-bit halves together using a 32-bit operation.
33126 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33127 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33128 DAG.getConstant(32, DL, MVT::i8)));
33129 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33130 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33131 }
33132
33133 if (VT != MVT::i16) {
33134 // Xor the high and low 16-bits together using a 32-bit operation.
33135 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33136 DAG.getConstant(16, DL, MVT::i8));
33137 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33138 } else {
33139 // If the input is 16-bits, we need to extend to use an i32 shift below.
33140 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33141 }
33142
33143 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
33144 // This should allow an h-reg to be used to save a shift.
33145 SDValue Hi = DAG.getNode(
33146 ISD::TRUNCATE, DL, MVT::i8,
33147 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33148 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33149 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33150 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33151
33152 // Copy the inverse of the parity flag into a register with setcc.
33153 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33154 // Extend to the original type.
33155 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33156}
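
A scalar model of the XOR-folding, as an illustrative sketch and not the lowering itself: folding a value onto its lower half preserves parity, so the problem shrinks to 8 bits, at which point the hardware version simply reads PF from the flag-setting 8-bit XOR (SETNP yields 1 for odd parity). The scalar model keeps folding instead of reading flags.

#include <cstdint>

uint32_t parity32(uint32_t X) {
  X ^= X >> 16; // fold the two 16-bit halves (the 32-bit XOR above)
  X ^= X >> 8;  // fold the two remaining bytes (the flag-setting 8-bit XOR)
  X ^= X >> 4;  // the lowering stops here and reads PF; keep folding instead
  X ^= X >> 2;
  X ^= X >> 1;
  return X & 1; // 1 if the original value had an odd number of set bits
}
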
33157
33158static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33159 const X86Subtarget &Subtarget) {
33160 unsigned NewOpc = 0;
33161 switch (N->getOpcode()) {
33162 case ISD::ATOMIC_LOAD_ADD:
33163 NewOpc = X86ISD::LADD;
33164 break;
33165 case ISD::ATOMIC_LOAD_SUB:
33166 NewOpc = X86ISD::LSUB;
33167 break;
33168 case ISD::ATOMIC_LOAD_OR:
33169 NewOpc = X86ISD::LOR;
33170 break;
33171 case ISD::ATOMIC_LOAD_XOR:
33172 NewOpc = X86ISD::LXOR;
33173 break;
33174 case ISD::ATOMIC_LOAD_AND:
33175 NewOpc = X86ISD::LAND;
33176 break;
33177 default:
33178 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33179 }
33180
33181 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33182
33183 return DAG.getMemIntrinsicNode(
33184 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33185 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33186 /*MemVT=*/N->getSimpleValueType(0), MMO);
33187}
33188
33189/// Lower atomic_load_ops into LOCK-prefixed operations.
33190static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33191 const X86Subtarget &Subtarget) {
33192 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33193 SDValue Chain = N->getOperand(0);
33194 SDValue LHS = N->getOperand(1);
33195 SDValue RHS = N->getOperand(2);
33196 unsigned Opc = N->getOpcode();
33197 MVT VT = N->getSimpleValueType(0);
33198 SDLoc DL(N);
33199
33200 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33201 // can only be lowered when the result is unused. They should have already
33202 // been transformed into a cmpxchg loop in AtomicExpand.
33203 if (N->hasAnyUseOfValue(0)) {
33204 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33205 // select LXADD if LOCK_SUB can't be selected.
33206 if (Opc == ISD::ATOMIC_LOAD_SUB) {
33207 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33208 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
33209 RHS, AN->getMemOperand());
33210 }
33211 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33212 "Used AtomicRMW ops other than Add should have been expanded!");
33213 return N;
33214 }
33215
33216 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33217 // The core idea here is that since the memory location isn't actually
33218 // changing, all we need is a lowering for the *ordering* impacts of the
33219 // atomicrmw. As such, we can choose a different operation and memory
33220 // location to minimize impact on other code.
33221 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33222 // On X86, the only ordering which actually requires an instruction is
33223 // seq_cst which isn't SingleThread; everything else just needs to be preserved
33224 // during codegen and then dropped. Note that we expect (but don't assume)
33225 // that orderings other than seq_cst and acq_rel have been canonicalized to
33226 // a store or load.
33227 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33228 AN->getSyncScopeID() == SyncScope::System) {
33229 // Prefer a locked operation against a stack location to minimize cache
33230 // traffic. This assumes that stack locations are very likely to be
33231 // accessed only by the owning thread.
33232 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33233 assert(!N->hasAnyUseOfValue(0));
33234 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33235 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33236 DAG.getUNDEF(VT), NewChain);
33237 }
33238 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33239 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33240 assert(!N->hasAnyUseOfValue(0));
33241 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33242 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33243 DAG.getUNDEF(VT), NewChain);
33244 }
33245
33246 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33247 // RAUW the chain, but don't worry about the result, as it's unused.
33248 assert(!N->hasAnyUseOfValue(0));
33249 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33250 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33251 DAG.getUNDEF(VT), LockOp.getValue(1));
33252}
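
The idempotent-RMW path above is aimed at source patterns like the following. This is an illustrative sketch, not something from the LLVM source, and whether a given frontend and pass pipeline actually reaches this exact lowering depends on earlier transforms; the point is that a seq_cst OR of 0 with an unused result only matters for ordering, so the backend can satisfy it with a locked operation on the thread's own stack instead of touching Flag's cache line.

#include <atomic>

void orderingOnlyRMW(std::atomic<unsigned> &Flag) {
  // Result discarded: only the seq_cst ordering effect of the RMW remains.
  (void)Flag.fetch_or(0u, std::memory_order_seq_cst);
}
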
33253
33254static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33255 const X86Subtarget &Subtarget) {
33256 auto *Node = cast<AtomicSDNode>(Op.getNode());
33257 SDLoc dl(Node);
33258 EVT VT = Node->getMemoryVT();
33259
33260 bool IsSeqCst =
33261 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33262 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33263
33264 // If this store is not sequentially consistent and the type is legal
33265 // we can just keep it.
33266 if (!IsSeqCst && IsTypeLegal)
33267 return Op;
33268
33269 if (VT == MVT::i64 && !IsTypeLegal) {
33270 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33271 // is enabled.
33272 bool NoImplicitFloatOps =
33273 DAG.getMachineFunction().getFunction().hasFnAttribute(
33274 Attribute::NoImplicitFloat);
33275 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33276 SDValue Chain;
33277 if (Subtarget.hasSSE1()) {
33278 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33279 Node->getOperand(2));
33280 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33281 SclToVec = DAG.getBitcast(StVT, SclToVec);
33282 SDVTList Tys = DAG.getVTList(MVT::Other);
33283 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33284 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33285 MVT::i64, Node->getMemOperand());
33286 } else if (Subtarget.hasX87()) {
33287 // First load this into an 80-bit X87 register using a stack temporary.
33288 // This will put the whole integer into the significand.
33289 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33290 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33291 MachinePointerInfo MPI =
33292 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33293 Chain =
33294 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33295 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33296 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33297 SDValue LdOps[] = {Chain, StackPtr};
33298 SDValue Value = DAG.getMemIntrinsicNode(
33299 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33300 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33301 Chain = Value.getValue(1);
33302
33303 // Now use an FIST to do the atomic store.
33304 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33305 Chain =
33306 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33307 StoreOps, MVT::i64, Node->getMemOperand());
33308 }
33309
33310 if (Chain) {
33311 // If this is a sequentially consistent store, also emit an appropriate
33312 // barrier.
33313 if (IsSeqCst)
33314 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33315
33316 return Chain;
33317 }
33318 }
33319 }
33320
33321 // Convert seq_cst store -> xchg
33322 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33323 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33324 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33325 Node->getMemoryVT(),
33326 Node->getOperand(0),
33327 Node->getOperand(1), Node->getOperand(2),
33328 Node->getMemOperand());
33329 return Swap.getValue(1);
33330}
33331
33332static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
33333 SDNode *N = Op.getNode();
33334 MVT VT = N->getSimpleValueType(0);
33335 unsigned Opc = Op.getOpcode();
33336
33337 // Let legalize expand this if it isn't a legal type yet.
33338 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33339 return SDValue();
33340
33341 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33342 SDLoc DL(N);
33343
33344 // Set the carry flag.
33345 SDValue Carry = Op.getOperand(2);
33346 EVT CarryVT = Carry.getValueType();
33347 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33348 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33349
33350 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
33351 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33352 Op.getOperand(0), Op.getOperand(1),
33353 Carry.getValue(1));
33354
33355 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33356 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33357 Sum.getValue(1), DL, DAG);
33358 if (N->getValueType(1) == MVT::i1)
33359 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33360
33361 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33362}
33363
33364static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33365 SelectionDAG &DAG) {
33366 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33367
33368 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33369 // which returns the values as { float, float } (in XMM0) or
33370 // { double, double } (which is returned in XMM0, XMM1).
33371 SDLoc dl(Op);
33372 SDValue Arg = Op.getOperand(0);
33373 EVT ArgVT = Arg.getValueType();
33374 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33375
33376 TargetLowering::ArgListTy Args;
33377 TargetLowering::ArgListEntry Entry;
33378
33379 Entry.Node = Arg;
33380 Entry.Ty = ArgTy;
33381 Entry.IsSExt = false;
33382 Entry.IsZExt = false;
33383 Args.push_back(Entry);
33384
33385 bool isF64 = ArgVT == MVT::f64;
33386 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33387 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33388 // the results are returned via SRet in memory.
33389 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33390 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33391 const char *LibcallName = TLI.getLibcallName(LC);
33392 SDValue Callee =
33393 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33394
33395 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33396 : (Type *)FixedVectorType::get(ArgTy, 4);
33397
33398 TargetLowering::CallLoweringInfo CLI(DAG);
33399 CLI.setDebugLoc(dl)
33400 .setChain(DAG.getEntryNode())
33401 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33402
33403 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33404
33405 if (isF64)
33406 // Returned in xmm0 and xmm1.
33407 return CallResult.first;
33408
33409 // Returned in bits 0:31 and 32:63 of xmm0.
33410 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33411 CallResult.first, DAG.getIntPtrConstant(0, dl));
33412 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33413 CallResult.first, DAG.getIntPtrConstant(1, dl));
33414 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33415 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33416}
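
The source-level shape this lowering serves is computing both sin and cos of the same argument, as in the sketch below (illustrative only, not taken from the LLVM source). When the target is 64-bit Darwin, the two calls may be combined into a single __sincos_stret call, after which the code above extracts the sine from element 0 and the cosine from element 1 of the returned value.

#include <cmath>
#include <utility>

std::pair<float, float> sinAndCos(float X) {
  // Two libm calls at the source level; the backend may merge them into one
  // __sincos_stret call on x86-64 Darwin, as handled by LowerFSINCOS above.
  return {std::sin(X), std::cos(X)};
}
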
33417
33418/// Widen a vector input to a vector of NVT. The
33419/// input vector must have the same element type as NVT.
33420static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33421 bool FillWithZeroes = false) {
33422 // Check if InOp already has the right width.
33423 MVT InVT = InOp.getSimpleValueType();
33424 if (InVT == NVT)
33425 return InOp;
33426
33427 if (InOp.isUndef())
33428 return DAG.getUNDEF(NVT);
33429
33430 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33431 "input and widen element type must match");
33432
33433 unsigned InNumElts = InVT.getVectorNumElements();
33434 unsigned WidenNumElts = NVT.getVectorNumElements();
33435 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33436 "Unexpected request for vector widening");
33437
33438 SDLoc dl(InOp);
33439 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33440 InOp.getNumOperands() == 2) {
33441 SDValue N1 = InOp.getOperand(1);
33442 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33443 N1.isUndef()) {
33444 InOp = InOp.getOperand(0);
33445 InVT = InOp.getSimpleValueType();
33446 InNumElts = InVT.getVectorNumElements();
33447 }
33448 }
33449 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33450 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33451 SmallVector<SDValue, 16> Ops;
33452 for (unsigned i = 0; i < InNumElts; ++i)
33453 Ops.push_back(InOp.getOperand(i));
33454
33455 EVT EltVT = InOp.getOperand(0).getValueType();
33456
33457 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33458 DAG.getUNDEF(EltVT);
33459 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33460 Ops.push_back(FillVal);
33461 return DAG.getBuildVector(NVT, dl, Ops);
33462 }
33463 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33464 DAG.getUNDEF(NVT);
33465 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33466 InOp, DAG.getIntPtrConstant(0, dl));
33467}
33468
33469static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33470 SelectionDAG &DAG) {
33471 assert(Subtarget.hasAVX512() &&
33472 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33473
33474 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33475 SDValue Src = N->getValue();
33476 MVT VT = Src.getSimpleValueType();
33477 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33478 SDLoc dl(Op);
33479
33480 SDValue Scale = N->getScale();
33481 SDValue Index = N->getIndex();
33482 SDValue Mask = N->getMask();
33483 SDValue Chain = N->getChain();
33484 SDValue BasePtr = N->getBasePtr();
33485
33486 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33487 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33488 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33489 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33491 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33492 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33493 SDVTList VTs = DAG.getVTList(MVT::Other);
33494 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33495 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33496 N->getMemoryVT(), N->getMemOperand());
33497 }
33498 return SDValue();
33499 }
33500
33501 MVT IndexVT = Index.getSimpleValueType();
33502
33503 // If the index is v2i32, we're being called by type legalization and we
33504 // should just let the default handling take care of it.
33505 if (IndexVT == MVT::v2i32)
33506 return SDValue();
33507
33508 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33509 // we need to widen until one is.
33510 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33511 !Index.getSimpleValueType().is512BitVector()) {
33512 // Determine how much we need to widen by to get a 512-bit type.
33513 unsigned Factor = std::min(512/VT.getSizeInBits(),
33514 512/IndexVT.getSizeInBits());
33515 unsigned NumElts = VT.getVectorNumElements() * Factor;
33516
33517 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33518 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33519 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33520
33521 Src = ExtendToType(Src, VT, DAG);
33522 Index = ExtendToType(Index, IndexVT, DAG);
33523 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33524 }
33525
33526 SDVTList VTs = DAG.getVTList(MVT::Other);
33527 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33528 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33529 N->getMemoryVT(), N->getMemOperand());
33530}
33531
33532static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33533 SelectionDAG &DAG) {
33534
33535 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33536 MVT VT = Op.getSimpleValueType();
33537 MVT ScalarVT = VT.getScalarType();
33538 SDValue Mask = N->getMask();
33539 MVT MaskVT = Mask.getSimpleValueType();
33540 SDValue PassThru = N->getPassThru();
33541 SDLoc dl(Op);
33542
33543 // Handle AVX masked loads which don't support passthru other than 0.
33544 if (MaskVT.getVectorElementType() != MVT::i1) {
33545 // We also allow undef in the isel pattern.
33546 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33547 return Op;
33548
33549 SDValue NewLoad = DAG.getMaskedLoad(
33550 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33551 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33552 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33553 N->isExpandingLoad());
33554 // Emit a blend.
33555 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33556 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33557 }
33558
33559 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33560 "Expanding masked load is supported on AVX-512 target only!");
33561
33562 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33563 "Expanding masked load is supported for 32 and 64-bit types only!");
33564
33565 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33566 "Cannot lower masked load op.");
33567
33568 assert((ScalarVT.getSizeInBits() >= 32 ||
33569 (Subtarget.hasBWI() &&
33570 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33571 "Unsupported masked load op.");
33572
33573 // This operation is legal for targets with VLX, but without
33574 // VLX the vector should be widened to 512 bits.
33575 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33576 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33577 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33578
33579 // Mask element has to be i1.
33580 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33581 "Unexpected mask type");
33582
33583 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33584
33585 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33586 SDValue NewLoad = DAG.getMaskedLoad(
33587 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33588 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33589 N->getExtensionType(), N->isExpandingLoad());
33590
33591 SDValue Extract =
33592 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33593 DAG.getIntPtrConstant(0, dl));
33594 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33595 return DAG.getMergeValues(RetOps, dl);
33596}
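
A scalar model of the AVX masked-load-with-passthru handling above, given as an illustrative sketch and not as LLVM code: the hardware masked load produces zero in unselected lanes, and the explicit VSELECT blend re-introduces the passthru values in exactly those lanes.

#include <cstddef>
#include <vector>

std::vector<int> maskedLoadWithPassThru(const std::vector<int> &Mem,
                                        const std::vector<bool> &Mask,
                                        const std::vector<int> &PassThru) {
  std::vector<int> Out(Mask.size());
  for (size_t i = 0; i < Mask.size(); ++i)
    Out[i] = Mask[i] ? Mem[i] : PassThru[i]; // zeroing load + blend, fused
  return Out;
}
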
33597
33598static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33599 SelectionDAG &DAG) {
33600 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33601 SDValue DataToStore = N->getValue();
33602 MVT VT = DataToStore.getSimpleValueType();
33603 MVT ScalarVT = VT.getScalarType();
33604 SDValue Mask = N->getMask();
33605 SDLoc dl(Op);
33606
33607 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33608 "Expanding masked load is supported on AVX-512 target only!");
33609
33610 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33611 "Expanding masked load is supported for 32 and 64-bit types only!");
33612
33613 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33614 "Cannot lower masked store op.");
33615
33616 assert((ScalarVT.getSizeInBits() >= 32 ||
33617 (Subtarget.hasBWI() &&
33618 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33619 "Unsupported masked store op.");
33620
33621 // This operation is legal for targets with VLX, but without
33622 // VLX the vector should be widened to 512 bits.
33623 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33624 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33625
33626 // Mask element has to be i1.
33627 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33628 "Unexpected mask type");
33629
33630 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33631
33632 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33633 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33634 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33635 N->getOffset(), Mask, N->getMemoryVT(),
33636 N->getMemOperand(), N->getAddressingMode(),
33637 N->isTruncatingStore(), N->isCompressingStore());
33638}
33639
33640static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33641 SelectionDAG &DAG) {
33642 assert(Subtarget.hasAVX2() &&
33643 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33644
33645 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33646 SDLoc dl(Op);
33647 MVT VT = Op.getSimpleValueType();
33648 SDValue Index = N->getIndex();
33649 SDValue Mask = N->getMask();
33650 SDValue PassThru = N->getPassThru();
33651 MVT IndexVT = Index.getSimpleValueType();
33652
33653 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33654
33655 // If the index is v2i32, we're being called by type legalization.
33656 if (IndexVT == MVT::v2i32)
33657 return SDValue();
33658
33659 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33660 // we need to widen until one is.
33661 MVT OrigVT = VT;
33662 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33663 !IndexVT.is512BitVector()) {
33664 // Determine how much we need to widen by to get a 512-bit type.
33665 unsigned Factor = std::min(512/VT.getSizeInBits(),
33666 512/IndexVT.getSizeInBits());
33667
33668 unsigned NumElts = VT.getVectorNumElements() * Factor;
33669
33670 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33671 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33672 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33673
33674 PassThru = ExtendToType(PassThru, VT, DAG);
33675 Index = ExtendToType(Index, IndexVT, DAG);
33676 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33677 }
33678
33679 // Break dependency on the data register.
33680 if (PassThru.isUndef())
33681 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33682
33683 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33684 N->getScale() };
33685 SDValue NewGather = DAG.getMemIntrinsicNode(
33686 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33687 N->getMemOperand());
33688 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33689 NewGather, DAG.getIntPtrConstant(0, dl));
33690 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33691}
33692
33693static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33694 SDLoc dl(Op);
33695 SDValue Src = Op.getOperand(0);
33696 MVT DstVT = Op.getSimpleValueType();
33697
33698 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33699 unsigned SrcAS = N->getSrcAddressSpace();
33700
33701 assert(SrcAS != N->getDestAddressSpace() &&
33702 "addrspacecast must be between different address spaces");
33703
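// ptr32_uptr (unsigned 32-bit pointer) sources are zero-extended to 64 bits;
// any other 32-bit pointer is sign-extended, and 64-bit to 32-bit casts
// simply truncate.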
33704 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33705 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33706 } else if (DstVT == MVT::i64) {
33707 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33708 } else if (DstVT == MVT::i32) {
33709 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33710 } else {
33711 report_fatal_error("Bad address space in addrspacecast");
33712 }
33713 return Op;
33714}
33715
33716SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33717 SelectionDAG &DAG) const {
33718 // TODO: Eventually, the lowering of these nodes should be informed by or
33719 // deferred to the GC strategy for the function in which they appear. For
33720 // now, however, they must be lowered to something. Since they are logically
33721 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33722 // require special handling for these nodes), lower them as literal NOOPs for
33723 // the time being.
33724 SmallVector<SDValue, 2> Ops;
33725 Ops.push_back(Op.getOperand(0));
33726 if (Op->getGluedNode())
33727 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33728
33729 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33730 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33731}
33732
33733// Custom split CVTPS2PH with wide types.
33734static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33735 SDLoc dl(Op);
33736 EVT VT = Op.getValueType();
33737 SDValue Lo, Hi;
33738 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33739 EVT LoVT, HiVT;
33740 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33741 SDValue RC = Op.getOperand(1);
33742 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33743 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33744 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33745}
33746
33747static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33748 unsigned OpNo) {
33749 const APInt Operand(32, OpNo);
33750 std::string OpNoStr = llvm::toString(Operand, 10, false);
33751 std::string Str(" $");
33752
33753 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33754 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33755
33756 auto I = StringRef::npos;
33757 for (auto &AsmStr : AsmStrs) {
33758 // Match the OpNo string exactly so that we don't match a sub-string,
33759 // e.g. "$12" contains "$1".
33760 if (AsmStr.endswith(OpNoStr1))
33761 I = AsmStr.size() - OpNoStr1.size();
33762
33763 // Get the index of operand in AsmStr.
33764 if (I == StringRef::npos)
33765 I = AsmStr.find(OpNoStr1 + ",");
33766 if (I == StringRef::npos)
33767 I = AsmStr.find(OpNoStr2);
33768
33769 if (I == StringRef::npos)
33770 continue;
33771
33772 assert(I > 0 && "Unexpected inline asm string!");
33773 // Remove the operand string and label (if they exist).
33774 // For example:
33775 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33776 // ==>
33777 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33778 // ==>
33779 // "call dword ptr "
33780 auto TmpStr = AsmStr.substr(0, I);
33781 I = TmpStr.rfind(':');
33782 if (I == StringRef::npos)
33783 return TmpStr;
33784
33785 assert(I < TmpStr.size() && "Unexpected inline asm string!");
33786 auto Asm = TmpStr.drop_front(I + 1);
33787 return Asm;
33788 }
33789
33790 return StringRef();
33791}
33792
33793bool X86TargetLowering::isInlineAsmTargetBranch(
33794 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33795 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33796
33797 if (InstrStr.contains("call"))
33798 return true;
33799
33800 return false;
33801}
33802
33803/// Provide custom lowering hooks for some operations.
33804SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33805 switch (Op.getOpcode()) {
33806 default: llvm_unreachable("Should not custom lower this!");
33807 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33808 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33809 return LowerCMP_SWAP(Op, Subtarget, DAG);
33810 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33811 case ISD::ATOMIC_LOAD_ADD:
33812 case ISD::ATOMIC_LOAD_SUB:
33813 case ISD::ATOMIC_LOAD_OR:
33814 case ISD::ATOMIC_LOAD_XOR:
33815 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33816 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33817 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33818 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33819 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33820 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33821 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33822 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33823 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33824 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33825 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33826 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33827 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33828 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33829 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33830 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33831 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33832 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33833 case ISD::SHL_PARTS:
33834 case ISD::SRA_PARTS:
33835 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33836 case ISD::FSHL:
33837 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33838 case ISD::STRICT_SINT_TO_FP:
33839 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33840 case ISD::STRICT_UINT_TO_FP:
33841 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33842 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33843 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33844 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33845 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33846 case ISD::ZERO_EXTEND_VECTOR_INREG:
33847 case ISD::SIGN_EXTEND_VECTOR_INREG:
33848 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33849 case ISD::FP_TO_SINT:
33850 case ISD::STRICT_FP_TO_SINT:
33851 case ISD::FP_TO_UINT:
33852 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33853 case ISD::FP_TO_SINT_SAT:
33854 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33855 case ISD::FP_EXTEND:
33856 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33857 case ISD::FP_ROUND:
33858 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33859 case ISD::FP16_TO_FP:
33860 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33861 case ISD::FP_TO_FP16:
33862 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33863 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33864 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33865 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33866 case ISD::FADD:
33867 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33868 case ISD::FROUND: return LowerFROUND(Op, DAG);
33869 case ISD::FABS:
33870 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33871 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33872 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33873 case ISD::LRINT:
33874 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33875 case ISD::SETCC:
33876 case ISD::STRICT_FSETCC:
33877 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33878 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33879 case ISD::SELECT: return LowerSELECT(Op, DAG);
33880 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33881 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33882 case ISD::VASTART: return LowerVASTART(Op, DAG);
33883 case ISD::VAARG: return LowerVAARG(Op, DAG);
33884 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33885 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33886 case ISD::INTRINSIC_VOID:
33887 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33888 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33889 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33890 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33891 case ISD::FRAME_TO_ARGS_OFFSET:
33892 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33893 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33894 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33895 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33896 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33897 case ISD::EH_SJLJ_SETUP_DISPATCH:
33898 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33899 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33900 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33901 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33902 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33903 case ISD::CTLZ:
33904 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33905 case ISD::CTTZ:
33906 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33907 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33908 case ISD::MULHS:
33909 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33910 case ISD::ROTL:
33911 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33912 case ISD::SRA:
33913 case ISD::SRL:
33914 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33915 case ISD::SADDO:
33916 case ISD::UADDO:
33917 case ISD::SSUBO:
33918 case ISD::USUBO: return LowerXALUO(Op, DAG);
33919 case ISD::SMULO:
33920 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33921 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33922 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33923 case ISD::SADDO_CARRY:
33924 case ISD::SSUBO_CARRY:
33925 case ISD::ADDCARRY:
33926 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33927 case ISD::ADD:
33928 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33929 case ISD::UADDSAT:
33930 case ISD::SADDSAT:
33931 case ISD::USUBSAT:
33932 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33933 case ISD::SMAX:
33934 case ISD::SMIN:
33935 case ISD::UMAX:
33936 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33937 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33938 case ISD::ABDS:
33939 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33940 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33941 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33942 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33943 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33944 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33945 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33946 case ISD::GC_TRANSITION_START:
33947 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33948 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33949 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33950 }
33951}
33952
33953/// Replace a node with an illegal result type with a new node built out of
33954/// custom code.
33955void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33956 SmallVectorImpl<SDValue>&Results,
33957 SelectionDAG &DAG) const {
33958 SDLoc dl(N);
33959 switch (N->getOpcode()) {
33960 default:
33961#ifndef NDEBUG
33962 dbgs() << "ReplaceNodeResults: ";
33963 N->dump(&DAG);
33964#endif
33965 llvm_unreachable("Do not know how to custom type legalize this operation!");
33966 case X86ISD::CVTPH2PS: {
33967 EVT VT = N->getValueType(0);
33968 SDValue Lo, Hi;
33969 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33970 EVT LoVT, HiVT;
33971 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33972 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33973 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33974 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33975 Results.push_back(Res);
33976 return;
33977 }
33978 case X86ISD::STRICT_CVTPH2PS: {
33979 EVT VT = N->getValueType(0);
33980 SDValue Lo, Hi;
33981 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33982 EVT LoVT, HiVT;
33983 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33984 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33985 {N->getOperand(0), Lo});
33986 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33987 {N->getOperand(0), Hi});
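// Merge the output chains of the two strict halves so later chained nodes
// stay ordered after both conversions.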
33988 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33989 Lo.getValue(1), Hi.getValue(1));
33990 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33991 Results.push_back(Res);
33992 Results.push_back(Chain);
33993 return;
33994 }
33995 case X86ISD::CVTPS2PH:
33996 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33997 return;
33998 case ISD::CTPOP: {
33999 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34000 // Use a v2i64 if possible.
34001 bool NoImplicitFloatOps =
34002 DAG.getMachineFunction().getFunction().hasFnAttribute(
34003 Attribute::NoImplicitFloat);
34004 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
34005 SDValue Wide =
34006 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
34007 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
34008 // The bit count fits in 32 bits, so extract it as i32 and then zero
34009 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
34010 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
34011 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
34012 DAG.getIntPtrConstant(0, dl));
34013 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
34014 Results.push_back(Wide);
34015 }
34016 return;
34017 }
34018 case ISD::MUL: {
34019 EVT VT = N->getValueType(0);
34020 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34021 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
34022 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
34023 // elements are needed.
34024 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
34025 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
34026 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
34027 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
34028 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
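// Pad the narrow result with undef subvectors so the final value has the
// legal v16i8 type.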
34029 unsigned NumConcats = 16 / VT.getVectorNumElements();
34030 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34031 ConcatOps[0] = Res;
34032 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34033 Results.push_back(Res);
34034 return;
34035 }
34036 case ISD::SMULO:
34037 case ISD::UMULO: {
34038 EVT VT = N->getValueType(0);
34039 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34040 VT == MVT::v2i32 && "Unexpected VT!");
34041 bool IsSigned = N->getOpcode() == ISD::SMULO;
34042 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34043 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34044 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34045 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34046 // Extract the high 32 bits from each result using PSHUFD.
34047 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34048 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34049 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34050 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34051 DAG.getIntPtrConstant(0, dl));
34052
34053 // Truncate the low bits of the result. This will become PSHUFD.
34054 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34055
34056 SDValue HiCmp;
34057 if (IsSigned) {
34058 // SMULO overflows if the high bits don't match the sign of the low.
34059 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34060 } else {
34061 // UMULO overflows if the high bits are non-zero.
34062 HiCmp = DAG.getConstant(0, dl, VT);
34063 }
34064 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34065
34066 // Widen the result by padding with undef.
34067 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34068 DAG.getUNDEF(VT));
34069 Results.push_back(Res);
34070 Results.push_back(Ovf);
34071 return;
34072 }
34073 case X86ISD::VPMADDWD: {
34074 // Legalize types for X86ISD::VPMADDWD by widening.
34075 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34076
34077 EVT VT = N->getValueType(0);
34078 EVT InVT = N->getOperand(0).getValueType();
34079 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34080 "Expected a VT that divides into 128 bits.");
34081 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34082 "Unexpected type action!");
34083 unsigned NumConcat = 128 / InVT.getSizeInBits();
34084
34085 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34086 InVT.getVectorElementType(),
34087 NumConcat * InVT.getVectorNumElements());
34088 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34089 VT.getVectorElementType(),
34090 NumConcat * VT.getVectorNumElements());
34091
34092 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34093 Ops[0] = N->getOperand(0);
34094 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34095 Ops[0] = N->getOperand(1);
34096 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34097
34098 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34099 Results.push_back(Res);
34100 return;
34101 }
34102 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34103 case X86ISD::FMINC:
34104 case X86ISD::FMIN:
34105 case X86ISD::FMAXC:
34106 case X86ISD::FMAX: {
34107 EVT VT = N->getValueType(0);
34108 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34109 SDValue UNDEF = DAG.getUNDEF(VT);
34110 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34111 N->getOperand(0), UNDEF);
34112 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34113 N->getOperand(1), UNDEF);
34114 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34115 return;
34116 }
34117 case ISD::SDIV:
34118 case ISD::UDIV:
34119 case ISD::SREM:
34120 case ISD::UREM: {
34121 EVT VT = N->getValueType(0);
34122 if (VT.isVector()) {
34123 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34124 "Unexpected type action!");
34125 // If this RHS is a constant splat vector we can widen this and let
34126 // division/remainder by constant optimize it.
34127 // TODO: Can we do something for non-splat?
34128 APInt SplatVal;
34129 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34130 unsigned NumConcats = 128 / VT.getSizeInBits();
34131 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34132 Ops0[0] = N->getOperand(0);
34133 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34134 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34135 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34136 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34137 Results.push_back(Res);
34138 }
34139 return;
34140 }
34141
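// Scalar (non-vector) division/remainder falls through to the Win64 i128
// helper.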
34142 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34143 Results.push_back(V);
34144 return;
34145 }
34146 case ISD::TRUNCATE: {
34147 MVT VT = N->getSimpleValueType(0);
34148 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34149 return;
34150
34151 // The generic legalizer will try to widen the input type to the same
34152 // number of elements as the widened result type. But this isn't always
34153 // the best thing so do some custom legalization to avoid some cases.
34154 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34155 SDValue In = N->getOperand(0);
34156 EVT InVT = In.getValueType();
34157
34158 unsigned InBits = InVT.getSizeInBits();
34159 if (128 % InBits == 0) {
34160 // 128-bit and smaller inputs should avoid truncate altogether and
34161 // just use a build_vector that will become a shuffle.
34162 // TODO: Widen and use a shuffle directly?
34163 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34164 EVT EltVT = VT.getVectorElementType();
34165 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34166 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34167 // Use the original element count so we don't do more scalar opts than
34168 // necessary.
34169 unsigned MinElts = VT.getVectorNumElements();
34170 for (unsigned i=0; i < MinElts; ++i) {
34171 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34172 DAG.getIntPtrConstant(i, dl));
34173 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34174 }
34175 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34176 return;
34177 }
34178 // With AVX512 there are some cases that can use a target specific
34179 // truncate node to go from 256/512 to less than 128 with zeros in the
34180 // upper elements of the 128 bit result.
34181 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34182 // We can use VTRUNC directly for 256 bits with VLX, or for any 512-bit input.
34183 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34184 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34185 return;
34186 }
34187 // There's one case we can widen to 512 bits and use VTRUNC.
34188 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34189 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34190 DAG.getUNDEF(MVT::v4i64));
34191 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34192 return;
34193 }
34194 }
34195 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34196 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34197 isTypeLegal(MVT::v4i64)) {
34198 // Input needs to be split and output needs to be widened. Let's use two
34199 // VTRUNCs, and shuffle their results together into the wider type.
34200 SDValue Lo, Hi;
34201 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34202
34203 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34204 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34205 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34206 { 0, 1, 2, 3, 16, 17, 18, 19,
34207 -1, -1, -1, -1, -1, -1, -1, -1 });
34208 Results.push_back(Res);
34209 return;
34210 }
34211
34212 return;
34213 }
34214 case ISD::ANY_EXTEND:
34215 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34216 // It's intended to custom handle the input type.
34217 assert(N->getValueType(0) == MVT::v8i8 &&
34218 "Do not know how to legalize this Node");
34219 return;
34220 case ISD::SIGN_EXTEND:
34221 case ISD::ZERO_EXTEND: {
34222 EVT VT = N->getValueType(0);
34223 SDValue In = N->getOperand(0);
34224 EVT InVT = In.getValueType();
34225 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34226 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34227 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34228 "Unexpected type action!");
34229 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34230 // Custom split this so we can extend i8/i16->i32 invec. This is better
34231 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34232 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34233 // we allow the sra from the extend to i32 to be shared by the split.
34234 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34235
34236 // Fill a vector with sign bits for each element.
34237 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34238 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34239
34240 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34241 // to v2i64.
34242 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34243 {0, 4, 1, 5});
34244 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34245 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34246 {2, 6, 3, 7});
34247 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34248
34249 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34250 Results.push_back(Res);
34251 return;
34252 }
34253
34254 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34255 if (!InVT.is128BitVector()) {
34256 // Not a 128 bit vector, but maybe type legalization will promote
34257 // it to 128 bits.
34258 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34259 return;
34260 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34261 if (!InVT.is128BitVector())
34262 return;
34263
34264 // Promote the input to 128 bits. Type legalization will turn this into
34265 // zext_inreg/sext_inreg.
34266 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34267 }
34268
34269 // Perform custom splitting instead of the two stage extend we would get
34270 // by default.
34271 EVT LoVT, HiVT;
34272 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34273 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34274
34275 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34276
34277 // We need to shift the input over by half the number of elements.
34278 unsigned NumElts = InVT.getVectorNumElements();
34279 unsigned HalfNumElts = NumElts / 2;
34280 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34281 for (unsigned i = 0; i != HalfNumElts; ++i)
34282 ShufMask[i] = i + HalfNumElts;
34283
34284 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34285 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34286
34287 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34288 Results.push_back(Res);
34289 }
34290 return;
34291 }
34292 case ISD::FP_TO_SINT:
34293 case ISD::STRICT_FP_TO_SINT:
34294 case ISD::FP_TO_UINT:
34295 case ISD::STRICT_FP_TO_UINT: {
34296 bool IsStrict = N->isStrictFPOpcode();
34297 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34298 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34299 EVT VT = N->getValueType(0);
34300 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34301 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34302 EVT SrcVT = Src.getValueType();
34303
34304 SDValue Res;
34305 if (isSoftFP16(SrcVT)) {
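// Soft f16 sources are extended to f32 (or a vector of f32) first, and the
// FP-to-integer conversion is performed on the extended value.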
34306 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34307 if (IsStrict) {
34308 Res =
34309 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34310 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34311 {NVT, MVT::Other}, {Chain, Src})});
34312 Chain = Res.getValue(1);
34313 } else {
34314 Res = DAG.getNode(N->getOpcode(), dl, VT,
34315 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34316 }
34317 Results.push_back(Res);
34318 if (IsStrict)
34319 Results.push_back(Chain);
34320
34321 return;
34322 }
34323
34324 if (VT.isVector() && Subtarget.hasFP16() &&
34325 SrcVT.getVectorElementType() == MVT::f16) {
34326 EVT EleVT = VT.getVectorElementType();
34327 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34328
34329 if (SrcVT != MVT::v8f16) {
34330 SDValue Tmp =
34331 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34332 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34333 Ops[0] = Src;
34334 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34335 }
34336
34337 if (IsStrict) {
34338 unsigned Opc =
34339 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34340 Res =
34341 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34342 Chain = Res.getValue(1);
34343 } else {
34344 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34345 Res = DAG.getNode(Opc, dl, ResVT, Src);
34346 }
34347
34348 // TODO: Need to add exception check code for strict FP.
34349 if (EleVT.getSizeInBits() < 16) {
34350 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34351 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34352
34353 // Now widen to 128 bits.
34354 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34355 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34356 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34357 ConcatOps[0] = Res;
34358 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34359 }
34360
34361 Results.push_back(Res);
34362 if (IsStrict)
34363 Results.push_back(Chain);
34364
34365 return;
34366 }
34367
34368 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34369 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34370 "Unexpected type action!");
34371
34372 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34373 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34374 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34375 VT.getVectorNumElements());
34376 SDValue Res;
34377 SDValue Chain;
34378 if (IsStrict) {
34379 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34380 {N->getOperand(0), Src});
34381 Chain = Res.getValue(1);
34382 } else
34383 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34384
34385 // Preserve what we know about the size of the original result. If the
34386 // result is v2i32, we have to manually widen the assert.
34387 if (PromoteVT == MVT::v2i32)
34388 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34389 DAG.getUNDEF(MVT::v2i32));
34390
34391 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34392 Res.getValueType(), Res,
34393 DAG.getValueType(VT.getVectorElementType()));
34394
34395 if (PromoteVT == MVT::v2i32)
34396 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34397 DAG.getIntPtrConstant(0, dl));
34398
34399 // Truncate back to the original width.
34400 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34401
34402 // Now widen to 128 bits.
34403 unsigned NumConcats = 128 / VT.getSizeInBits();
34404 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34405 VT.getVectorNumElements() * NumConcats);
34406 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34407 ConcatOps[0] = Res;
34408 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34409 Results.push_back(Res);
34410 if (IsStrict)
34411 Results.push_back(Chain);
34412 return;
34413 }
34414
34415
34416 if (VT == MVT::v2i32) {
34417 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34418 "Strict unsigned conversion requires AVX512");
34419 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34420 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34421 "Unexpected type action!");
34422 if (Src.getValueType() == MVT::v2f64) {
34423 if (!IsSigned && !Subtarget.hasAVX512()) {
34424 SDValue Res =
34425 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34426 Results.push_back(Res);
34427 return;
34428 }
34429
34430 unsigned Opc;
34431 if (IsStrict)
34432 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34433 else
34434 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34435
34436 // If we have VLX we can emit a target specific FP_TO_UINT node.
34437 if (!IsSigned && !Subtarget.hasVLX()) {
34438 // Otherwise we can defer to the generic legalizer which will widen
34439 // the input as well. This will be further widened during op
34440 // legalization to v8i32<-v8f64.
34441 // For strict nodes we'll need to widen ourselves.
34442 // FIXME: Fix the type legalizer to safely widen strict nodes?
34443 if (!IsStrict)
34444 return;
34445 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34446 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34447 Opc = N->getOpcode();
34448 }
34449 SDValue Res;
34450 SDValue Chain;
34451 if (IsStrict) {
34452 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34453 {N->getOperand(0), Src});
34454 Chain = Res.getValue(1);
34455 } else {
34456 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34457 }
34458 Results.push_back(Res);
34459 if (IsStrict)
34460 Results.push_back(Chain);
34461 return;
34462 }
34463
34464 // Custom widen strict v2f32->v2i32 by padding with zeros.
34465 // FIXME: Should generic type legalizer do this?
34466 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34467 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34468 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34469 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34470 {N->getOperand(0), Src});
34471 Results.push_back(Res);
34472 Results.push_back(Res.getValue(1));
34473 return;
34474 }
34475
34476 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34477 // so early out here.
34478 return;
34479 }
34480
34481 assert(!VT.isVector() && "Vectors should have been handled above!");
34482
34483 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34484 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34485 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34486 assert(!Subtarget.is64Bit() && "i64 should be legal");
34487 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34488 // If we use a 128-bit result we might need to use a target specific node.
34489 unsigned SrcElts =
34490 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34491 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34492 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34493 unsigned Opc = N->getOpcode();
34494 if (NumElts != SrcElts) {
34495 if (IsStrict)
34496 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34497 else
34498 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34499 }
34500
34501 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34502 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34503 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34504 ZeroIdx);
34505 SDValue Chain;
34506 if (IsStrict) {
34507 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34508 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34509 Chain = Res.getValue(1);
34510 } else
34511 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34512 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34513 Results.push_back(Res);
34514 if (IsStrict)
34515 Results.push_back(Chain);
34516 return;
34517 }
34518
34519 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34520 SDValue Chain;
34521 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34522 Results.push_back(V);
34523 if (IsStrict)
34524 Results.push_back(Chain);
34525 return;
34526 }
34527
34528 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34529 Results.push_back(V);
34530 if (IsStrict)
34531 Results.push_back(Chain);
34532 }
34533 return;
34534 }
34535 case ISD::LRINT:
34536 case ISD::LLRINT: {
34537 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34538 Results.push_back(V);
34539 return;
34540 }
34541
34542 case ISD::SINT_TO_FP:
34543 case ISD::STRICT_SINT_TO_FP:
34544 case ISD::UINT_TO_FP:
34545 case ISD::STRICT_UINT_TO_FP: {
34546 bool IsStrict = N->isStrictFPOpcode();
34547 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34548 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34549 EVT VT = N->getValueType(0);
34550 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34551 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34552 Subtarget.hasVLX()) {
34553 if (Src.getValueType().getVectorElementType() == MVT::i16)
34554 return;
34555
34556 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34557 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34558 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34559 : DAG.getUNDEF(MVT::v2i32));
34560 if (IsStrict) {
34561 unsigned Opc =
34562 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34563 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34564 {N->getOperand(0), Src});
34565 Results.push_back(Res);
34566 Results.push_back(Res.getValue(1));
34567 } else {
34568 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34569 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34570 }
34571 return;
34572 }
34573 if (VT != MVT::v2f32)
34574 return;
34575 EVT SrcVT = Src.getValueType();
34576 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34577 if (IsStrict) {
34578 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34579 : X86ISD::STRICT_CVTUI2P;
34580 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34581 {N->getOperand(0), Src});
34582 Results.push_back(Res);
34583 Results.push_back(Res.getValue(1));
34584 } else {
34585 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34586 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34587 }
34588 return;
34589 }
34590 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34591 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
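// For inputs with the sign bit set, halve the value (folding the low bit
// back in so rounding is preserved), convert as signed, and double the
// result; values that fit in a signed i64 are converted directly.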
34592 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34593 SDValue One = DAG.getConstant(1, dl, SrcVT);
34594 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34595 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34596 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34597 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34598 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34599 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34600 for (int i = 0; i != 2; ++i) {
34601 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34602 SignSrc, DAG.getIntPtrConstant(i, dl));
34603 if (IsStrict)
34604 SignCvts[i] =
34605 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34606 {N->getOperand(0), Elt});
34607 else
34608 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34609 };
34610 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34611 SDValue Slow, Chain;
34612 if (IsStrict) {
34613 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34614 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34615 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34616 {Chain, SignCvt, SignCvt});
34617 Chain = Slow.getValue(1);
34618 } else {
34619 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34620 }
34621 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34622 IsNeg =
34623 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34624 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34625 Results.push_back(Cvt);
34626 if (IsStrict)
34627 Results.push_back(Chain);
34628 return;
34629 }
34630
34631 if (SrcVT != MVT::v2i32)
34632 return;
34633
34634 if (IsSigned || Subtarget.hasAVX512()) {
34635 if (!IsStrict)
34636 return;
34637
34638 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34639 // FIXME: Should generic type legalizer do this?
34640 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34641 DAG.getConstant(0, dl, MVT::v2i32));
34642 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34643 {N->getOperand(0), Src});
34644 Results.push_back(Res);
34645 Results.push_back(Res.getValue(1));
34646 return;
34647 }
34648
34649 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
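// 0x4330000000000000 is the IEEE-754 bit pattern of 2^52: OR-ing the
// zero-extended i32 into its low mantissa bits produces the double 2^52 + x
// exactly, so subtracting the bias back out yields x as f64 before the
// final round to f32.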
34650 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34651 SDValue VBias = DAG.getConstantFP(
34652 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34653 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34654 DAG.getBitcast(MVT::v2i64, VBias));
34655 Or = DAG.getBitcast(MVT::v2f64, Or);
34656 if (IsStrict) {
34657 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34658 {N->getOperand(0), Or, VBias});
34659 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34660 {MVT::v4f32, MVT::Other},
34661 {Sub.getValue(1), Sub});
34662 Results.push_back(Res);
34663 Results.push_back(Res.getValue(1));
34664 } else {
34665 // TODO: Are there any fast-math-flags to propagate here?
34666 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34667 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34668 }
34669 return;
34670 }
34671 case ISD::STRICT_FP_ROUND:
34672 case ISD::FP_ROUND: {
34673 bool IsStrict = N->isStrictFPOpcode();
34674 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34675 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34676 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34677 EVT SrcVT = Src.getValueType();
34678 EVT VT = N->getValueType(0);
34679 SDValue V;
34680 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
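// Pad the widened vector with zero rather than undef for the strict form so
// the extra lanes cannot raise spurious FP exceptions.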
34681 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34682 : DAG.getUNDEF(MVT::v2f32);
34683 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34684 }
34685 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34686 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34687 if (SrcVT.getVectorElementType() != MVT::f32)
34688 return;
34689
34690 if (IsStrict)
34691 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34692 {Chain, Src, Rnd});
34693 else
34694 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34695
34696 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34697 if (IsStrict)
34698 Results.push_back(V.getValue(1));
34699 return;
34700 }
34701 if (!isTypeLegal(Src.getValueType()))
34702 return;
34703 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34704 if (IsStrict)
34705 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34706 {Chain, Src});
34707 else
34708 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34709 Results.push_back(V);
34710 if (IsStrict)
34711 Results.push_back(V.getValue(1));
34712 return;
34713 }
34714 case ISD::FP_EXTEND:
34715 case ISD::STRICT_FP_EXTEND: {
34716 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34717 // No other ValueType for FP_EXTEND should reach this point.
34718 assert(N->getValueType(0) == MVT::v2f32 &&
34719 "Do not know how to legalize this Node");
34720 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34721 return;
34722 bool IsStrict = N->isStrictFPOpcode();
34723 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34724 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34725 : DAG.getUNDEF(MVT::v2f16);
34726 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34727 if (IsStrict)
34728 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34729 {N->getOperand(0), V});
34730 else
34731 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34732 Results.push_back(V);
34733 if (IsStrict)
34734 Results.push_back(V.getValue(1));
34735 return;
34736 }
34737 case ISD::INTRINSIC_W_CHAIN: {
34738 unsigned IntNo = N->getConstantOperandVal(1);
34739 switch (IntNo) {
34740 default : llvm_unreachable("Do not know how to custom type "
34741 "legalize this intrinsic operation!");
34742 case Intrinsic::x86_rdtsc:
34743 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34744 Results);
34745 case Intrinsic::x86_rdtscp:
34746 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34747 Results);
34748 case Intrinsic::x86_rdpmc:
34749 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34750 Results);
34751 return;
34752 case Intrinsic::x86_rdpru:
34753 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34754 Results);
34755 return;
34756 case Intrinsic::x86_xgetbv:
34757 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34758 Results);
34759 return;
34760 }
34761 }
34762 case ISD::READCYCLECOUNTER: {
34763 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34764 }
34765 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34766 EVT T = N->getValueType(0);
34767 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34768 bool Regs64bit = T == MVT::i128;
34769 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34770 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34771 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
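// CMPXCHG8B/CMPXCHG16B expect the comparand in EDX:EAX (RDX:RAX) and the
// replacement value in ECX:EBX (RCX:RBX), so split both operands into
// halves and copy them into those registers before emitting the pseudo.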
34772 SDValue cpInL, cpInH;
34773 std::tie(cpInL, cpInH) =
34774 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34775 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34776 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34777 cpInH =
34778 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34779 cpInH, cpInL.getValue(1));
34780 SDValue swapInL, swapInH;
34781 std::tie(swapInL, swapInH) =
34782 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34783 swapInH =
34784 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34785 swapInH, cpInH.getValue(1));
34786
34787 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34788 // until later. So we keep the RBX input in a vreg and use a custom
34789 // inserter.
34790 // Since RBX will be a reserved register, the register allocator will not
34791 // ensure that its value is properly saved and restored around this
34792 // live range.
34793 SDValue Result;
34794 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34795 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34796 if (Regs64bit) {
34797 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34798 swapInH.getValue(1)};
34799 Result =
34800 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34801 } else {
34802 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34803 swapInH.getValue(1));
34804 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34805 swapInL.getValue(1)};
34806 Result =
34807 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34808 }
34809
34810 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34811 Regs64bit ? X86::RAX : X86::EAX,
34812 HalfT, Result.getValue(1));
34813 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34814 Regs64bit ? X86::RDX : X86::EDX,
34815 HalfT, cpOutL.getValue(2));
34816 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34817
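// CMPXCHG sets ZF when the exchange succeeded, so read EFLAGS and convert
// COND_E into the boolean success result expected by the node.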
34818 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34819 MVT::i32, cpOutH.getValue(2));
34820 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34821 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34822
34823 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34824 Results.push_back(Success);
34825 Results.push_back(EFLAGS.getValue(1));
34826 return;
34827 }
34828 case ISD::ATOMIC_LOAD: {
34829 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34830 bool NoImplicitFloatOps =
34831 DAG.getMachineFunction().getFunction().hasFnAttribute(
34832 Attribute::NoImplicitFloat);
34833 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34834 auto *Node = cast<AtomicSDNode>(N);
34835 if (Subtarget.hasSSE1()) {
34836 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34837 // Then extract the lower 64-bits.
34838 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34839 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34840 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34841 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34842 MVT::i64, Node->getMemOperand());
34843 if (Subtarget.hasSSE2()) {
34844 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34845 DAG.getIntPtrConstant(0, dl));
34846 Results.push_back(Res);
34847 Results.push_back(Ld.getValue(1));
34848 return;
34849 }
34850 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34851 // then casts to i64. This avoids a 128-bit stack temporary being
34852 // created by type legalization if we were to cast v4f32->v2i64.
34853 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34854 DAG.getIntPtrConstant(0, dl));
34855 Res = DAG.getBitcast(MVT::i64, Res);
34856 Results.push_back(Res);
34857 Results.push_back(Ld.getValue(1));
34858 return;
34859 }
34860 if (Subtarget.hasX87()) {
34861 // First load this into an 80-bit X87 register. This will put the whole
34862 // integer into the significand.
34863 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34864 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34865 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34866 dl, Tys, Ops, MVT::i64,
34867 Node->getMemOperand());
34868 SDValue Chain = Result.getValue(1);
34869
34870 // Now store the X87 register to a stack temporary and convert to i64.
34871 // This store is not atomic and doesn't need to be.
34872 // FIXME: We don't need a stack temporary if the result of the load
34873 // is already being stored. We could just directly store there.
34874 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34875 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34876 MachinePointerInfo MPI =
34877 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34878 SDValue StoreOps[] = { Chain, Result, StackPtr };
34879 Chain = DAG.getMemIntrinsicNode(
34880 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34881 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34882
34883 // Finally load the value back from the stack temporary and return it.
34884 // This load is not atomic and doesn't need to be.
34885 // This load will be further type legalized.
34886 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34887 Results.push_back(Result);
34888 Results.push_back(Result.getValue(1));
34889 return;
34890 }
34891 }
34892 // TODO: Use MOVLPS when SSE1 is available?
34893 // Delegate to generic TypeLegalization. Situations we can really handle
34894 // should have already been dealt with by AtomicExpandPass.cpp.
34895 break;
34896 }
34897 case ISD::ATOMIC_SWAP:
34898 case ISD::ATOMIC_LOAD_ADD:
34899 case ISD::ATOMIC_LOAD_SUB:
34900 case ISD::ATOMIC_LOAD_AND:
34901 case ISD::ATOMIC_LOAD_OR:
34902 case ISD::ATOMIC_LOAD_XOR:
34903 case ISD::ATOMIC_LOAD_NAND:
34904 case ISD::ATOMIC_LOAD_MIN:
34905 case ISD::ATOMIC_LOAD_MAX:
34906 case ISD::ATOMIC_LOAD_UMIN:
34907 case ISD::ATOMIC_LOAD_UMAX:
34908 // Delegate to generic TypeLegalization. Situations we can really handle
34909 // should have already been dealt with by AtomicExpandPass.cpp.
34910 break;
34911
34912 case ISD::BITCAST: {
34913 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34914 EVT DstVT = N->getValueType(0);
34915 EVT SrcVT = N->getOperand(0).getValueType();
34916
34917 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34918 // we can split using the k-register rather than memory.
34919 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34920 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34921 SDValue Lo, Hi;
34922 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34923 Lo = DAG.getBitcast(MVT::i32, Lo);
34924 Hi = DAG.getBitcast(MVT::i32, Hi);
34925 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34926 Results.push_back(Res);
34927 return;
34928 }
34929
34930 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34931 // FIXME: Use v4f32 for SSE1?
34932 assert(Subtarget.hasSSE2() && "Requires SSE2");
34933 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34934 "Unexpected type action!");
34935 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34936 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34937 N->getOperand(0));
34938 Res = DAG.getBitcast(WideVT, Res);
34939 Results.push_back(Res);
34940 return;
34941 }
34942
34943 return;
34944 }
34945 case ISD::MGATHER: {
34946 EVT VT = N->getValueType(0);
34947 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34948 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34949 auto *Gather = cast<MaskedGatherSDNode>(N);
34950 SDValue Index = Gather->getIndex();
34951 if (Index.getValueType() != MVT::v2i64)
34952 return;
34953 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34954 "Unexpected type action!");
34955 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34956 SDValue Mask = Gather->getMask();
34957 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34958 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34959 Gather->getPassThru(),
34960 DAG.getUNDEF(VT));
34961 if (!Subtarget.hasVLX()) {
34962 // We need to widen the mask, but the instruction will only use 2
34963 // of its elements. So we can use undef.
34964 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34965 DAG.getUNDEF(MVT::v2i1));
34966 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34967 }
34968 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34969 Gather->getBasePtr(), Index, Gather->getScale() };
34970 SDValue Res = DAG.getMemIntrinsicNode(
34971 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34972 Gather->getMemoryVT(), Gather->getMemOperand());
34973 Results.push_back(Res);
34974 Results.push_back(Res.getValue(1));
34975 return;
34976 }
34977 return;
34978 }
34979 case ISD::LOAD: {
34980 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34981 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34982 // cast since type legalization will try to use an i64 load.
34983 MVT VT = N->getSimpleValueType(0);
34984 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34985 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34986 "Unexpected type action!");
34987 if (!ISD::isNON_EXTLoad(N))
34988 return;
34989 auto *Ld = cast<LoadSDNode>(N);
34990 if (Subtarget.hasSSE2()) {
34991 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34992 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34993 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34994 Ld->getMemOperand()->getFlags());
34995 SDValue Chain = Res.getValue(1);
34996 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34997 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34998 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34999 Res = DAG.getBitcast(WideVT, Res);
35000 Results.push_back(Res);
35001 Results.push_back(Chain);
35002 return;
35003 }
35004 assert(Subtarget.hasSSE1() && "Expected SSE");
35005 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
35006 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
35007 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35008 MVT::i64, Ld->getMemOperand());
35009 Results.push_back(Res);
35010 Results.push_back(Res.getValue(1));
35011 return;
35012 }
35013 case ISD::ADDRSPACECAST: {
35014 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
35015 Results.push_back(V);
35016 return;
35017 }
35018 case ISD::BITREVERSE: {
35019 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35020 assert(Subtarget.hasXOP() && "Expected XOP");
35021 // We can use VPPERM by copying to a vector register and back. We'll need
35022 // to move the scalar in two i32 pieces.
35023 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
35024 return;
35025 }
35026 case ISD::EXTRACT_VECTOR_ELT: {
35027 // f16 = extract vXf16 %vec, i64 %idx
35028 assert(N->getSimpleValueType(0) == MVT::f16 &&
35029 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
35030 assert(Subtarget.hasFP16() && "Expected FP16");
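// Extract the element as an i16 through an integer-typed vector and bitcast
// it back, so no f16-typed extract is needed.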
35031 SDValue VecOp = N->getOperand(0);
35032 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35033 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35034 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35035 N->getOperand(1));
35036 Split = DAG.getBitcast(MVT::f16, Split);
35037 Results.push_back(Split);
35038 return;
35039 }
35040 }
35041}
35042
35043const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35044 switch ((X86ISD::NodeType)Opcode) {
35045 case X86ISD::FIRST_NUMBER: break;
35046#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35047 NODE_NAME_CASE(BSF)
35048 NODE_NAME_CASE(BSR)
35049 NODE_NAME_CASE(FSHL)
35050 NODE_NAME_CASE(FSHR)
35051 NODE_NAME_CASE(FAND)
35052 NODE_NAME_CASE(FANDN)
35053 NODE_NAME_CASE(FOR)
35054 NODE_NAME_CASE(FXOR)
35055 NODE_NAME_CASE(FILD)
35056 NODE_NAME_CASE(FIST)
35057 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35058 NODE_NAME_CASE(FLD)
35059 NODE_NAME_CASE(FST)
35060 NODE_NAME_CASE(CALL)
35061 NODE_NAME_CASE(CALL_RVMARKER)
35062 NODE_NAME_CASE(BT)
35063 NODE_NAME_CASE(CMP)
35064 NODE_NAME_CASE(FCMP)
35065 NODE_NAME_CASE(STRICT_FCMP)
35066 NODE_NAME_CASE(STRICT_FCMPS)
35067 NODE_NAME_CASE(COMI)
35068 NODE_NAME_CASE(UCOMI)
35069 NODE_NAME_CASE(CMPM)
35070 NODE_NAME_CASE(CMPMM)
35071 NODE_NAME_CASE(STRICT_CMPM)
35072 NODE_NAME_CASE(CMPMM_SAE)
35073 NODE_NAME_CASE(SETCC)
35074 NODE_NAME_CASE(SETCC_CARRY)
35075 NODE_NAME_CASE(FSETCC)
35076 NODE_NAME_CASE(FSETCCM)
35077 NODE_NAME_CASE(FSETCCM_SAE)
35078 NODE_NAME_CASE(CMOV)
35079 NODE_NAME_CASE(BRCOND)
35080 NODE_NAME_CASE(RET_GLUE)
35081 NODE_NAME_CASE(IRET)
35082 NODE_NAME_CASE(REP_STOS)
35083 NODE_NAME_CASE(REP_MOVS)
35084 NODE_NAME_CASE(GlobalBaseReg)
35085 NODE_NAME_CASE(Wrapper)
35086 NODE_NAME_CASE(WrapperRIP)
35087 NODE_NAME_CASE(MOVQ2DQ)
35088 NODE_NAME_CASE(MOVDQ2Q)
35089 NODE_NAME_CASE(MMX_MOVD2W)
35090 NODE_NAME_CASE(MMX_MOVW2D)
35091 NODE_NAME_CASE(PEXTRB)
35092 NODE_NAME_CASE(PEXTRW)
35093 NODE_NAME_CASE(INSERTPS)
35094 NODE_NAME_CASE(PINSRB)
35095 NODE_NAME_CASE(PINSRW)
35096 NODE_NAME_CASE(PSHUFB)
35097 NODE_NAME_CASE(ANDNP)
35098 NODE_NAME_CASE(BLENDI)
35099 NODE_NAME_CASE(BLENDV)
35100 NODE_NAME_CASE(HADD)
35101 NODE_NAME_CASE(HSUB)
35102 NODE_NAME_CASE(FHADD)
35103 NODE_NAME_CASE(FHSUB)
35104 NODE_NAME_CASE(CONFLICT)
35105 NODE_NAME_CASE(FMAX)
35106 NODE_NAME_CASE(FMAXS)
35107 NODE_NAME_CASE(FMAX_SAE)
35108 NODE_NAME_CASE(FMAXS_SAE)
35109 NODE_NAME_CASE(FMIN)
35110 NODE_NAME_CASE(FMINS)
35111 NODE_NAME_CASE(FMIN_SAE)
35112 NODE_NAME_CASE(FMINS_SAE)
35113 NODE_NAME_CASE(FMAXC)
35114 NODE_NAME_CASE(FMINC)
35115 NODE_NAME_CASE(FRSQRT)
35116 NODE_NAME_CASE(FRCP)
35117 NODE_NAME_CASE(EXTRQI)
35118 NODE_NAME_CASE(INSERTQI)
35119 NODE_NAME_CASE(TLSADDR)
35120 NODE_NAME_CASE(TLSBASEADDR)
35121 NODE_NAME_CASE(TLSCALL)
35122 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35123 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35124 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35125 NODE_NAME_CASE(EH_RETURN)
35126 NODE_NAME_CASE(TC_RETURN)
35127 NODE_NAME_CASE(FNSTCW16m)
35128 NODE_NAME_CASE(FLDCW16m)
35129 NODE_NAME_CASE(LCMPXCHG_DAG)
35130 NODE_NAME_CASE(LCMPXCHG8_DAG)
35131 NODE_NAME_CASE(LCMPXCHG16_DAG)
35132 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35133 NODE_NAME_CASE(LADD)
35134 NODE_NAME_CASE(LSUB)
35135 NODE_NAME_CASE(LOR)
35136 NODE_NAME_CASE(LXOR)
35137 NODE_NAME_CASE(LAND)
35138 NODE_NAME_CASE(LBTS)
35139 NODE_NAME_CASE(LBTC)
35140 NODE_NAME_CASE(LBTR)
35141 NODE_NAME_CASE(LBTS_RM)
35142 NODE_NAME_CASE(LBTC_RM)
35143 NODE_NAME_CASE(LBTR_RM)
35144 NODE_NAME_CASE(AADD)
35145 NODE_NAME_CASE(AOR)
35146 NODE_NAME_CASE(AXOR)
35147 NODE_NAME_CASE(AAND)
35148 NODE_NAME_CASE(VZEXT_MOVL)
35149 NODE_NAME_CASE(VZEXT_LOAD)
35150 NODE_NAME_CASE(VEXTRACT_STORE)
35151 NODE_NAME_CASE(VTRUNC)
35152 NODE_NAME_CASE(VTRUNCS)
35153 NODE_NAME_CASE(VTRUNCUS)
35154 NODE_NAME_CASE(VMTRUNC)
35155 NODE_NAME_CASE(VMTRUNCS)
35156 NODE_NAME_CASE(VMTRUNCUS)
35157 NODE_NAME_CASE(VTRUNCSTORES)
35158 NODE_NAME_CASE(VTRUNCSTOREUS)
35159 NODE_NAME_CASE(VMTRUNCSTORES)
35160 NODE_NAME_CASE(VMTRUNCSTOREUS)
35161 NODE_NAME_CASE(VFPEXT)
35162 NODE_NAME_CASE(STRICT_VFPEXT)
35163 NODE_NAME_CASE(VFPEXT_SAE)
35164 NODE_NAME_CASE(VFPEXTS)
35165 NODE_NAME_CASE(VFPEXTS_SAE)
35166 NODE_NAME_CASE(VFPROUND)
35167 NODE_NAME_CASE(STRICT_VFPROUND)
35168 NODE_NAME_CASE(VMFPROUND)
35169 NODE_NAME_CASE(VFPROUND_RND)
35170 NODE_NAME_CASE(VFPROUNDS)
35171 NODE_NAME_CASE(VFPROUNDS_RND)
35172 NODE_NAME_CASE(VSHLDQ)
35173 NODE_NAME_CASE(VSRLDQ)
35174 NODE_NAME_CASE(VSHL)
35175 NODE_NAME_CASE(VSRL)
35176 NODE_NAME_CASE(VSRA)
35177 NODE_NAME_CASE(VSHLI)
35178 NODE_NAME_CASE(VSRLI)
35179 NODE_NAME_CASE(VSRAI)
35180 NODE_NAME_CASE(VSHLV)
35181 NODE_NAME_CASE(VSRLV)
35182 NODE_NAME_CASE(VSRAV)
35183 NODE_NAME_CASE(VROTLI)
35184 NODE_NAME_CASE(VROTRI)
35185 NODE_NAME_CASE(VPPERM)
35186 NODE_NAME_CASE(CMPP)
35187 NODE_NAME_CASE(STRICT_CMPP)
35188 NODE_NAME_CASE(PCMPEQ)
35189 NODE_NAME_CASE(PCMPGT)
35190 NODE_NAME_CASE(PHMINPOS)
35191 NODE_NAME_CASE(ADD)
35192 NODE_NAME_CASE(SUB)
35193 NODE_NAME_CASE(ADC)
35194 NODE_NAME_CASE(SBB)
35195 NODE_NAME_CASE(SMUL)
35196 NODE_NAME_CASE(UMUL)
35197 NODE_NAME_CASE(OR)
35198 NODE_NAME_CASE(XOR)
35199 NODE_NAME_CASE(AND)
35200 NODE_NAME_CASE(BEXTR)
35201 NODE_NAME_CASE(BEXTRI)
35202 NODE_NAME_CASE(BZHI)
35203 NODE_NAME_CASE(PDEP)
35204 NODE_NAME_CASE(PEXT)
35205 NODE_NAME_CASE(MUL_IMM)
35206 NODE_NAME_CASE(MOVMSK)
35207 NODE_NAME_CASE(PTEST)
35208 NODE_NAME_CASE(TESTP)
35209 NODE_NAME_CASE(KORTEST)
35210 NODE_NAME_CASE(KTEST)
35211 NODE_NAME_CASE(KADD)
35212 NODE_NAME_CASE(KSHIFTL)
35213 NODE_NAME_CASE(KSHIFTR)
35214 NODE_NAME_CASE(PACKSS)
35215 NODE_NAME_CASE(PACKUS)
35216 NODE_NAME_CASE(PALIGNR)
35217 NODE_NAME_CASE(VALIGN)
35218 NODE_NAME_CASE(VSHLD)
35219 NODE_NAME_CASE(VSHRD)
35220 NODE_NAME_CASE(VSHLDV)
35221 NODE_NAME_CASE(VSHRDV)
35222 NODE_NAME_CASE(PSHUFD)
35223 NODE_NAME_CASE(PSHUFHW)
35224 NODE_NAME_CASE(PSHUFLW)
35225 NODE_NAME_CASE(SHUFP)
35226 NODE_NAME_CASE(SHUF128)
35227 NODE_NAME_CASE(MOVLHPS)
35228 NODE_NAME_CASE(MOVHLPS)
35229 NODE_NAME_CASE(MOVDDUP)
35230 NODE_NAME_CASE(MOVSHDUP)
35231 NODE_NAME_CASE(MOVSLDUP)
35232 NODE_NAME_CASE(MOVSD)
35233 NODE_NAME_CASE(MOVSS)
35234 NODE_NAME_CASE(MOVSH)
35235 NODE_NAME_CASE(UNPCKL)
35236 NODE_NAME_CASE(UNPCKH)
35237 NODE_NAME_CASE(VBROADCAST)
35238 NODE_NAME_CASE(VBROADCAST_LOAD)
35239 NODE_NAME_CASE(VBROADCASTM)
35240 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35241 NODE_NAME_CASE(VPERMILPV)
35242 NODE_NAME_CASE(VPERMILPI)
35243 NODE_NAME_CASE(VPERM2X128)
35244 NODE_NAME_CASE(VPERMV)
35245 NODE_NAME_CASE(VPERMV3)
35246 NODE_NAME_CASE(VPERMI)
35247 NODE_NAME_CASE(VPTERNLOG)
35248 NODE_NAME_CASE(VFIXUPIMM)
35249 NODE_NAME_CASE(VFIXUPIMM_SAE)
35250 NODE_NAME_CASE(VFIXUPIMMS)
35251 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35252 NODE_NAME_CASE(VRANGE)
35253 NODE_NAME_CASE(VRANGE_SAE)
35254 NODE_NAME_CASE(VRANGES)
35255 NODE_NAME_CASE(VRANGES_SAE)
35256 NODE_NAME_CASE(PMULUDQ)
35257 NODE_NAME_CASE(PMULDQ)
35258 NODE_NAME_CASE(PSADBW)
35259 NODE_NAME_CASE(DBPSADBW)
35260 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35261 NODE_NAME_CASE(VAARG_64)
35262 NODE_NAME_CASE(VAARG_X32)
35263 NODE_NAME_CASE(DYN_ALLOCA)
35264 NODE_NAME_CASE(MFENCE)
35265 NODE_NAME_CASE(SEG_ALLOCA)
35266 NODE_NAME_CASE(PROBED_ALLOCA)
35267 NODE_NAME_CASE(RDRAND)
35268 NODE_NAME_CASE(RDSEED)
35269 NODE_NAME_CASE(RDPKRU)
35270 NODE_NAME_CASE(WRPKRU)
35271 NODE_NAME_CASE(VPMADDUBSW)
35272 NODE_NAME_CASE(VPMADDWD)
35273 NODE_NAME_CASE(VPSHA)
35274 NODE_NAME_CASE(VPSHL)
35275 NODE_NAME_CASE(VPCOM)
35276 NODE_NAME_CASE(VPCOMU)
35277 NODE_NAME_CASE(VPERMIL2)
35278 NODE_NAME_CASE(FMSUB)
35279 NODE_NAME_CASE(STRICT_FMSUB)
35280 NODE_NAME_CASE(FNMADD)
35281 NODE_NAME_CASE(STRICT_FNMADD)
35282 NODE_NAME_CASE(FNMSUB)
35283 NODE_NAME_CASE(STRICT_FNMSUB)
35284 NODE_NAME_CASE(FMADDSUB)
35285 NODE_NAME_CASE(FMSUBADD)
35286 NODE_NAME_CASE(FMADD_RND)
35287 NODE_NAME_CASE(FNMADD_RND)
35288 NODE_NAME_CASE(FMSUB_RND)
35289 NODE_NAME_CASE(FNMSUB_RND)
35290 NODE_NAME_CASE(FMADDSUB_RND)
35291 NODE_NAME_CASE(FMSUBADD_RND)
35292 NODE_NAME_CASE(VFMADDC)
35293 NODE_NAME_CASE(VFMADDC_RND)
35294 NODE_NAME_CASE(VFCMADDC)
35295 NODE_NAME_CASE(VFCMADDC_RND)
35296 NODE_NAME_CASE(VFMULC)
35297 NODE_NAME_CASE(VFMULC_RND)
35298 NODE_NAME_CASE(VFCMULC)
35299 NODE_NAME_CASE(VFCMULC_RND)
35300 NODE_NAME_CASE(VFMULCSH)
35301 NODE_NAME_CASE(VFMULCSH_RND)
35302 NODE_NAME_CASE(VFCMULCSH)
35303 NODE_NAME_CASE(VFCMULCSH_RND)
35304 NODE_NAME_CASE(VFMADDCSH)
35305 NODE_NAME_CASE(VFMADDCSH_RND)
35306 NODE_NAME_CASE(VFCMADDCSH)
35307 NODE_NAME_CASE(VFCMADDCSH_RND)
35308 NODE_NAME_CASE(VPMADD52H)
35309 NODE_NAME_CASE(VPMADD52L)
35310 NODE_NAME_CASE(VRNDSCALE)
35311 NODE_NAME_CASE(STRICT_VRNDSCALE)
35312 NODE_NAME_CASE(VRNDSCALE_SAE)
35313 NODE_NAME_CASE(VRNDSCALES)
35314 NODE_NAME_CASE(VRNDSCALES_SAE)
35315 NODE_NAME_CASE(VREDUCE)
35316 NODE_NAME_CASE(VREDUCE_SAE)
35317 NODE_NAME_CASE(VREDUCES)
35318 NODE_NAME_CASE(VREDUCES_SAE)
35319 NODE_NAME_CASE(VGETMANT)
35320 NODE_NAME_CASE(VGETMANT_SAE)
35321 NODE_NAME_CASE(VGETMANTS)
35322 NODE_NAME_CASE(VGETMANTS_SAE)
35323 NODE_NAME_CASE(PCMPESTR)
35324 NODE_NAME_CASE(PCMPISTR)
35325 NODE_NAME_CASE(XTEST)
35326 NODE_NAME_CASE(COMPRESS)
35327 NODE_NAME_CASE(EXPAND)
35328 NODE_NAME_CASE(SELECTS)
35329 NODE_NAME_CASE(ADDSUB)
35330 NODE_NAME_CASE(RCP14)
35331 NODE_NAME_CASE(RCP14S)
35332 NODE_NAME_CASE(RCP28)
35333 NODE_NAME_CASE(RCP28_SAE)
35334 NODE_NAME_CASE(RCP28S)
35335 NODE_NAME_CASE(RCP28S_SAE)
35336 NODE_NAME_CASE(EXP2)
35337 NODE_NAME_CASE(EXP2_SAE)
35338 NODE_NAME_CASE(RSQRT14)
35339 NODE_NAME_CASE(RSQRT14S)
35340 NODE_NAME_CASE(RSQRT28)
35341 NODE_NAME_CASE(RSQRT28_SAE)
35342 NODE_NAME_CASE(RSQRT28S)
35343 NODE_NAME_CASE(RSQRT28S_SAE)
35344 NODE_NAME_CASE(FADD_RND)
35345 NODE_NAME_CASE(FADDS)
35346 NODE_NAME_CASE(FADDS_RND)
35347 NODE_NAME_CASE(FSUB_RND)
35348 NODE_NAME_CASE(FSUBS)
35349 NODE_NAME_CASE(FSUBS_RND)
35350 NODE_NAME_CASE(FMUL_RND)
35351 NODE_NAME_CASE(FMULS)
35352 NODE_NAME_CASE(FMULS_RND)
35353 NODE_NAME_CASE(FDIV_RND)
35354 NODE_NAME_CASE(FDIVS)
35355 NODE_NAME_CASE(FDIVS_RND)
35356 NODE_NAME_CASE(FSQRT_RND)
35357 NODE_NAME_CASE(FSQRTS)
35358 NODE_NAME_CASE(FSQRTS_RND)
35359 NODE_NAME_CASE(FGETEXP)
35360 NODE_NAME_CASE(FGETEXP_SAE)
35361 NODE_NAME_CASE(FGETEXPS)
35362 NODE_NAME_CASE(FGETEXPS_SAE)
35363 NODE_NAME_CASE(SCALEF)
35364 NODE_NAME_CASE(SCALEF_RND)
35365 NODE_NAME_CASE(SCALEFS)
35366 NODE_NAME_CASE(SCALEFS_RND)
35367 NODE_NAME_CASE(MULHRS)
35368 NODE_NAME_CASE(SINT_TO_FP_RND)
35369 NODE_NAME_CASE(UINT_TO_FP_RND)
35370 NODE_NAME_CASE(CVTTP2SI)
35371 NODE_NAME_CASE(CVTTP2UI)
35372 NODE_NAME_CASE(STRICT_CVTTP2SI)
35373 NODE_NAME_CASE(STRICT_CVTTP2UI)
35374 NODE_NAME_CASE(MCVTTP2SI)
35375 NODE_NAME_CASE(MCVTTP2UI)
35376 NODE_NAME_CASE(CVTTP2SI_SAE)
35377 NODE_NAME_CASE(CVTTP2UI_SAE)
35378 NODE_NAME_CASE(CVTTS2SI)
35379 NODE_NAME_CASE(CVTTS2UI)
35380 NODE_NAME_CASE(CVTTS2SI_SAE)
35381 NODE_NAME_CASE(CVTTS2UI_SAE)
35382 NODE_NAME_CASE(CVTSI2P)
35383 NODE_NAME_CASE(CVTUI2P)
35384 NODE_NAME_CASE(STRICT_CVTSI2P)
35385 NODE_NAME_CASE(STRICT_CVTUI2P)
35386 NODE_NAME_CASE(MCVTSI2P)
35387 NODE_NAME_CASE(MCVTUI2P)
35388 NODE_NAME_CASE(VFPCLASS)
35389 NODE_NAME_CASE(VFPCLASSS)
35390 NODE_NAME_CASE(MULTISHIFT)
35391 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35392 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35393 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35394 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35395 NODE_NAME_CASE(CVTPS2PH)
35396 NODE_NAME_CASE(STRICT_CVTPS2PH)
35397 NODE_NAME_CASE(CVTPS2PH_SAE)
35398 NODE_NAME_CASE(MCVTPS2PH)
35399 NODE_NAME_CASE(MCVTPS2PH_SAE)
35400 NODE_NAME_CASE(CVTPH2PS)
35401 NODE_NAME_CASE(STRICT_CVTPH2PS)
35402 NODE_NAME_CASE(CVTPH2PS_SAE)
35403 NODE_NAME_CASE(CVTP2SI)
35404 NODE_NAME_CASE(CVTP2UI)
35405 NODE_NAME_CASE(MCVTP2SI)
35406 NODE_NAME_CASE(MCVTP2UI)
35407 NODE_NAME_CASE(CVTP2SI_RND)
35408 NODE_NAME_CASE(CVTP2UI_RND)
35409 NODE_NAME_CASE(CVTS2SI)
35410 NODE_NAME_CASE(CVTS2UI)
35411 NODE_NAME_CASE(CVTS2SI_RND)
35412 NODE_NAME_CASE(CVTS2UI_RND)
35413 NODE_NAME_CASE(CVTNE2PS2BF16)
35414 NODE_NAME_CASE(CVTNEPS2BF16)
35415 NODE_NAME_CASE(MCVTNEPS2BF16)
35416 NODE_NAME_CASE(DPBF16PS)
35417 NODE_NAME_CASE(LWPINS)
35418 NODE_NAME_CASE(MGATHER)
35419 NODE_NAME_CASE(MSCATTER)
35420 NODE_NAME_CASE(VPDPBUSD)
35421 NODE_NAME_CASE(VPDPBUSDS)
35422 NODE_NAME_CASE(VPDPWSSD)
35423 NODE_NAME_CASE(VPDPWSSDS)
35424 NODE_NAME_CASE(VPSHUFBITQMB)
35425 NODE_NAME_CASE(GF2P8MULB)
35426 NODE_NAME_CASE(GF2P8AFFINEQB)
35427 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35428 NODE_NAME_CASE(NT_CALL)
35429 NODE_NAME_CASE(NT_BRIND)
35430 NODE_NAME_CASE(UMWAIT)
35431 NODE_NAME_CASE(TPAUSE)
35432 NODE_NAME_CASE(ENQCMD)
35433 NODE_NAME_CASE(ENQCMDS)
35434 NODE_NAME_CASE(VP2INTERSECT)
35435 NODE_NAME_CASE(VPDPBSUD)
35436 NODE_NAME_CASE(VPDPBSUDS)
35437 NODE_NAME_CASE(VPDPBUUD)
35438 NODE_NAME_CASE(VPDPBUUDS)
35439 NODE_NAME_CASE(VPDPBSSD)
35440 NODE_NAME_CASE(VPDPBSSDS)
35441 NODE_NAME_CASE(AESENC128KL)
35442 NODE_NAME_CASE(AESDEC128KL)
35443 NODE_NAME_CASE(AESENC256KL)
35444 NODE_NAME_CASE(AESDEC256KL)
35445 NODE_NAME_CASE(AESENCWIDE128KL)
35446 NODE_NAME_CASE(AESDECWIDE128KL)
35447 NODE_NAME_CASE(AESENCWIDE256KL)
35448 NODE_NAME_CASE(AESDECWIDE256KL)
35449 NODE_NAME_CASE(CMPCCXADD)
35450 NODE_NAME_CASE(TESTUI)
35451 NODE_NAME_CASE(FP80_ADD)
35452 NODE_NAME_CASE(STRICT_FP80_ADD)
35453 }
35454 return nullptr;
35455#undef NODE_NAME_CASE
35456}
35457
35458/// Return true if the addressing mode represented by AM is legal for this
35459/// target, for a load/store of the specified type.
35460bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35461 const AddrMode &AM, Type *Ty,
35462 unsigned AS,
35463 Instruction *I) const {
35464 // X86 supports extremely general addressing modes.
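// i.e. BaseGV + BaseReg + Scale*IndexReg + Disp, subject to the checks below.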
35465 CodeModel::Model M = getTargetMachine().getCodeModel();
35466
35467 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35468 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35469 return false;
35470
35471 if (AM.BaseGV) {
35472 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35473
35474 // If a reference to this global requires an extra load, we can't fold it.
35475 if (isGlobalStubReference(GVFlags))
35476 return false;
35477
35478 // If BaseGV requires a register for the PIC base, we cannot also have a
35479 // BaseReg specified.
35480 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35481 return false;
35482
35483 // If lower 4G is not available, then we must use rip-relative addressing.
35484 if ((M != CodeModel::Small || isPositionIndependent()) &&
35485 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35486 return false;
35487 }
35488
35489 switch (AM.Scale) {
35490 case 0:
35491 case 1:
35492 case 2:
35493 case 4:
35494 case 8:
35495 // These scales always work.
35496 break;
35497 case 3:
35498 case 5:
35499 case 9:
35500 // These scales are formed with basereg+scalereg. Only accept if there is
35501 // no basereg yet.
35502 if (AM.HasBaseReg)
35503 return false;
35504 break;
35505 default: // Other stuff never works.
35506 return false;
35507 }
35508
35509 return true;
35510}
35511
35512bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35513 unsigned Bits = Ty->getScalarSizeInBits();
35514
35515 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35516 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35517 if (Subtarget.hasXOP() &&
35518 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35519 return false;
35520
35521 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35522 // shifts just as cheap as scalar ones.
35523 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35524 return false;
35525
35526 // AVX512BW has shifts such as vpsllvw.
35527 if (Subtarget.hasBWI() && Bits == 16)
35528 return false;
35529
35530 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35531 // fully general vector.
35532 return true;
35533}
35534
35535bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35536 switch (Opcode) {
35537 // These are non-commutative binops.
35538 // TODO: Add more X86ISD opcodes once we have test coverage.
35539 case X86ISD::ANDNP:
35540 case X86ISD::PCMPGT:
35541 case X86ISD::FMAX:
35542 case X86ISD::FMIN:
35543 case X86ISD::FANDN:
35544 case X86ISD::VPSHA:
35545 case X86ISD::VPSHL:
35546 case X86ISD::VSHLV:
35547 case X86ISD::VSRLV:
35548 case X86ISD::VSRAV:
35549 return true;
35550 }
35551
35552 return TargetLoweringBase::isBinOp(Opcode);
35553}
35554
35555bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35556 switch (Opcode) {
35557 // TODO: Add more X86ISD opcodes once we have test coverage.
35558 case X86ISD::PCMPEQ:
35559 case X86ISD::PMULDQ:
35560 case X86ISD::PMULUDQ:
35561 case X86ISD::FMAXC:
35562 case X86ISD::FMINC:
35563 case X86ISD::FAND:
35564 case X86ISD::FOR:
35565 case X86ISD::FXOR:
35566 return true;
35567 }
35568
35569 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35570}
35571
35572bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35573 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35574 return false;
35575 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35576 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35577 return NumBits1 > NumBits2;
35578}
35579
35580bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35581 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35582 return false;
35583
35584 if (!isTypeLegal(EVT::getEVT(Ty1)))
35585 return false;
35586
35587 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35588
35589 // Assuming the caller doesn't have a zeroext or signext return parameter,
35590 // truncation all the way down to i1 is valid.
35591 return true;
35592}
35593
35594bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35595 return isInt<32>(Imm);
35596}
35597
35598bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35599 // Can also use sub to handle negated immediates.
35600 return isInt<32>(Imm);
35601}
35602
35603bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35604 return isInt<32>(Imm);
35605}
35606
35607bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35608 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35609 return false;
35610 unsigned NumBits1 = VT1.getSizeInBits();
35611 unsigned NumBits2 = VT2.getSizeInBits();
35612 return NumBits1 > NumBits2;
35613}
35614
35615bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35616 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35617 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35618}
35619
35620bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35621 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35622 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35623}
35624
35625bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35626 EVT VT1 = Val.getValueType();
35627 if (isZExtFree(VT1, VT2))
35628 return true;
35629
35630 if (Val.getOpcode() != ISD::LOAD)
35631 return false;
35632
35633 if (!VT1.isSimple() || !VT1.isInteger() ||
35634 !VT2.isSimple() || !VT2.isInteger())
35635 return false;
35636
35637 switch (VT1.getSimpleVT().SimpleTy) {
35638 default: break;
35639 case MVT::i8:
35640 case MVT::i16:
35641 case MVT::i32:
35642 // X86 has 8, 16, and 32-bit zero-extending loads.
35643 return true;
35644 }
35645
35646 return false;
35647}
35648
35649bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35650 SmallVectorImpl<Use *> &Ops) const {
35651 using namespace llvm::PatternMatch;
35652
35653 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35654 if (!VTy)
35655 return false;
35656
35657 if (I->getOpcode() == Instruction::Mul &&
35658 VTy->getElementType()->isIntegerTy(64)) {
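// For 64-bit element multiplies, sinking the 32-bit extend patterns next to
// the multiply lets instruction selection see them in the same block and
// form PMULDQ/PMULUDQ instead of a general 64-bit multiply.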
35659 for (auto &Op : I->operands()) {
35660 // Make sure we are not already sinking this operand
35661 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35662 continue;
35663
35664 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35665 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35666 if (Subtarget.hasSSE41() &&
35667 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35668 m_SpecificInt(32)))) {
35669 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35670 Ops.push_back(&Op);
35671 } else if (Subtarget.hasSSE2() &&
35672 match(Op.get(),
35673 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35674 Ops.push_back(&Op);
35675 }
35676 }
35677
35678 return !Ops.empty();
35679 }
35680
35681 // A uniform shift amount in a vector shift or funnel shift may be much
35682 // cheaper than a generic variable vector shift, so make that pattern visible
35683 // to SDAG by sinking the shuffle instruction next to the shift.
35684 int ShiftAmountOpNum = -1;
35685 if (I->isShift())
35686 ShiftAmountOpNum = 1;
35687 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35688 if (II->getIntrinsicID() == Intrinsic::fshl ||
35689 II->getIntrinsicID() == Intrinsic::fshr)
35690 ShiftAmountOpNum = 2;
35691 }
35692
35693 if (ShiftAmountOpNum == -1)
35694 return false;
35695
35696 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35697 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35698 isVectorShiftByScalarCheap(I->getType())) {
35699 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35700 return true;
35701 }
35702
35703 return false;
35704}
35705
35706bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35707 if (!Subtarget.is64Bit())
35708 return false;
35709 return TargetLowering::shouldConvertPhiType(From, To);
35710}
35711
35712bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35713 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35714 return false;
35715
35716 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35717
35718 // There is no extending load for vXi1.
35719 if (SrcVT.getScalarType() == MVT::i1)
35720 return false;
35721
35722 return true;
35723}
35724
35725bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35726 EVT VT) const {
35727 if (!Subtarget.hasAnyFMA())
35728 return false;
35729
35730 VT = VT.getScalarType();
35731
35732 if (!VT.isSimple())
35733 return false;
35734
35735 switch (VT.getSimpleVT().SimpleTy) {
35736 case MVT::f16:
35737 return Subtarget.hasFP16();
35738 case MVT::f32:
35739 case MVT::f64:
35740 return true;
35741 default:
35742 break;
35743 }
35744
35745 return false;
35746}
35747
35748bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35749 // i16 instructions are longer (0x66 prefix) and potentially slower.
35750 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35751}
35752
35753bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35754 EVT VT) const {
35755 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35756 // benefit. The transform may also be profitable for scalar code.
35757 if (!Subtarget.hasAVX512())
35758 return false;
35759 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35760 return false;
35761 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35762 return false;
35763
35764 return true;
35765}
35766
35767/// Targets can use this to indicate that they only support *some*
35768/// VECTOR_SHUFFLE operations, those with specific masks.
35769/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35770/// are assumed to be legal.
35771bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35772 if (!VT.isSimple())
35773 return false;
35774
35775 // Not for i1 vectors
35776 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35777 return false;
35778
35779 // Very little shuffling can be done for 64-bit vectors right now.
35780 if (VT.getSimpleVT().getSizeInBits() == 64)
35781 return false;
35782
35783 // We only care that the types being shuffled are legal. The lowering can
35784 // handle any possible shuffle mask that results.
35785 return isTypeLegal(VT.getSimpleVT());
35786}
35787
35788bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35789 EVT VT) const {
35790 // Don't convert an 'and' into a shuffle that we don't directly support.
35791 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35792 if (!Subtarget.hasAVX2())
35793 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35794 return false;
35795
35796 // Just delegate to the generic legality, clear masks aren't special.
35797 return isShuffleMaskLegal(Mask, VT);
35798}
35799
35800bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35801 // If the subtarget is using thunks, we need to not generate jump tables.
35802 if (Subtarget.useIndirectThunkBranches())
35803 return false;
35804
35805 // Otherwise, fallback on the generic logic.
35806 return TargetLowering::areJTsAllowed(Fn);
35807}
35808
35809MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35810 EVT ConditionVT) const {
35811 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35812 // zero-extensions.
35813 if (ConditionVT.getSizeInBits() < 32)
35814 return MVT::i32;
35815 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35816 ConditionVT);
35817}
35818
35819//===----------------------------------------------------------------------===//
35820// X86 Scheduler Hooks
35821//===----------------------------------------------------------------------===//
35822
35823 // Returns true if EFLAGS is consumed after this iterator in the rest of the
35824// basic block or any successors of the basic block.
35825static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35826 MachineBasicBlock *BB) {
35827 // Scan forward through BB for a use/def of EFLAGS.
35828 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35829 if (mi.readsRegister(X86::EFLAGS))
35830 return true;
35831 // If we found a def, we can stop searching.
35832 if (mi.definesRegister(X86::EFLAGS))
35833 return false;
35834 }
35835
35836 // If we hit the end of the block, check whether EFLAGS is live into a
35837 // successor.
35838 for (MachineBasicBlock *Succ : BB->successors())
35839 if (Succ->isLiveIn(X86::EFLAGS))
35840 return true;
35841
35842 return false;
35843}
35844
35845/// Utility function to emit xbegin specifying the start of an RTM region.
35846static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35847 const TargetInstrInfo *TII) {
35848 const DebugLoc &DL = MI.getDebugLoc();
35849
35850 const BasicBlock *BB = MBB->getBasicBlock();
35851 MachineFunction::iterator I = ++MBB->getIterator();
35852
35853 // For the v = xbegin(), we generate
35854 //
35855 // thisMBB:
35856 // xbegin sinkMBB
35857 //
35858 // mainMBB:
35859 // s0 = -1
35860 //
35861 // fallBB:
35862 // eax = # XABORT_DEF
35863 // s1 = eax
35864 //
35865 // sinkMBB:
35866 // v = phi(s0/mainBB, s1/fallBB)
35867
35868 MachineBasicBlock *thisMBB = MBB;
35869 MachineFunction *MF = MBB->getParent();
35870 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35871 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35872 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35873 MF->insert(I, mainMBB);
35874 MF->insert(I, fallMBB);
35875 MF->insert(I, sinkMBB);
35876
35877 if (isEFLAGSLiveAfter(MI, MBB)) {
35878 mainMBB->addLiveIn(X86::EFLAGS);
35879 fallMBB->addLiveIn(X86::EFLAGS);
35880 sinkMBB->addLiveIn(X86::EFLAGS);
35881 }
35882
35883 // Transfer the remainder of BB and its successor edges to sinkMBB.
35884 sinkMBB->splice(sinkMBB->begin(), MBB,
35885 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35886 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35887
35888 MachineRegisterInfo &MRI = MF->getRegInfo();
35889 Register DstReg = MI.getOperand(0).getReg();
35890 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35891 Register mainDstReg = MRI.createVirtualRegister(RC);
35892 Register fallDstReg = MRI.createVirtualRegister(RC);
35893
35894 // thisMBB:
35895 // xbegin fallMBB
35896 // # fallthrough to mainMBB
35897 // # abort to fallMBB
35898 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35899 thisMBB->addSuccessor(mainMBB);
35900 thisMBB->addSuccessor(fallMBB);
35901
35902 // mainMBB:
35903 // mainDstReg := -1
35904 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35905 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35906 mainMBB->addSuccessor(sinkMBB);
35907
35908 // fallMBB:
35909 // ; pseudo instruction to model hardware's definition from XABORT
35910 // EAX := XABORT_DEF
35911 // fallDstReg := EAX
35912 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35913 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35914 .addReg(X86::EAX);
35915 fallMBB->addSuccessor(sinkMBB);
35916
35917 // sinkMBB:
35918 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35919 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35920 .addReg(mainDstReg).addMBB(mainMBB)
35921 .addReg(fallDstReg).addMBB(fallMBB);
35922
35923 MI.eraseFromParent();
35924 return sinkMBB;
35925}
35926
35927MachineBasicBlock *
35928X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35929 MachineBasicBlock *MBB) const {
35930 // Emit va_arg instruction on X86-64.
35931
35932 // Operands to this pseudo-instruction:
35933 // 0 ) Output : destination address (reg)
35934 // 1-5) Input : va_list address (addr, i64mem)
35935 // 6 ) ArgSize : Size (in bytes) of vararg type
35936 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35937 // 8 ) Align : Alignment of type
35938 // 9 ) EFLAGS (implicit-def)
35939
35940 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35941 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35942
35943 Register DestReg = MI.getOperand(0).getReg();
35944 MachineOperand &Base = MI.getOperand(1);
35945 MachineOperand &Scale = MI.getOperand(2);
35946 MachineOperand &Index = MI.getOperand(3);
35947 MachineOperand &Disp = MI.getOperand(4);
35948 MachineOperand &Segment = MI.getOperand(5);
35949 unsigned ArgSize = MI.getOperand(6).getImm();
35950 unsigned ArgMode = MI.getOperand(7).getImm();
35951 Align Alignment = Align(MI.getOperand(8).getImm());
35952
35953 MachineFunction *MF = MBB->getParent();
35954
35955 // Memory Reference
35956 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35957
35958 MachineMemOperand *OldMMO = MI.memoperands().front();
35959
35960 // Clone the MMO into two separate MMOs for loading and storing
35961 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35962 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35963 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35964 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35965
35966 // Machine Information
35967 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35968 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35969 const TargetRegisterClass *AddrRegClass =
35970 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35971 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35972 const DebugLoc &DL = MI.getDebugLoc();
35973
35974 // struct va_list {
35975 // i32 gp_offset
35976 // i32 fp_offset
35977 // i64 overflow_area (address)
35978 // i64 reg_save_area (address)
35979 // }
35980 // sizeof(va_list) = 24
35981 // alignment(va_list) = 8
35982
35983 unsigned TotalNumIntRegs = 6;
35984 unsigned TotalNumXMMRegs = 8;
35985 bool UseGPOffset = (ArgMode == 1);
35986 bool UseFPOffset = (ArgMode == 2);
35987 unsigned MaxOffset = TotalNumIntRegs * 8 +
35988 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
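// The register save area holds 6 * 8 bytes of GP registers followed by
// 8 * 16 bytes of XMM registers, so the offset limit is 48 for gp_offset
// and 176 for fp_offset (SysV x86-64 ABI).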
35989
35990 /* Align ArgSize to a multiple of 8 */
35991 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35992 bool NeedsAlign = (Alignment > 8);
35993
35994 MachineBasicBlock *thisMBB = MBB;
35995 MachineBasicBlock *overflowMBB;
35996 MachineBasicBlock *offsetMBB;
35997 MachineBasicBlock *endMBB;
35998
35999 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
36000 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
36001 unsigned OffsetReg = 0;
36002
36003 if (!UseGPOffset && !UseFPOffset) {
36004 // If we only pull from the overflow region, we don't create a branch.
36005 // We don't need to alter control flow.
36006 OffsetDestReg = 0; // unused
36007 OverflowDestReg = DestReg;
36008
36009 offsetMBB = nullptr;
36010 overflowMBB = thisMBB;
36011 endMBB = thisMBB;
36012 } else {
36013 // First emit code to check if gp_offset (or fp_offset) is below the bound.
36014 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
36015 // If not, pull from overflow_area. (branch to overflowMBB)
36016 //
36017 // thisMBB
36018 // | .
36019 // | .
36020 // offsetMBB overflowMBB
36021 // | .
36022 // | .
36023 // endMBB
36024
36025 // Registers for the PHI in endMBB
36026 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
36027 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
36028
36029 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36030 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36031 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36032 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36033
36034 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36035
36036 // Insert the new basic blocks
36037 MF->insert(MBBIter, offsetMBB);
36038 MF->insert(MBBIter, overflowMBB);
36039 MF->insert(MBBIter, endMBB);
36040
36041 // Transfer the remainder of MBB and its successor edges to endMBB.
36042 endMBB->splice(endMBB->begin(), thisMBB,
36043 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36044 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36045
36046 // Make offsetMBB and overflowMBB successors of thisMBB
36047 thisMBB->addSuccessor(offsetMBB);
36048 thisMBB->addSuccessor(overflowMBB);
36049
36050 // endMBB is a successor of both offsetMBB and overflowMBB
36051 offsetMBB->addSuccessor(endMBB);
36052 overflowMBB->addSuccessor(endMBB);
36053
36054 // Load the offset value into a register
36055 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36056 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36057 .add(Base)
36058 .add(Scale)
36059 .add(Index)
36060 .addDisp(Disp, UseFPOffset ? 4 : 0)
36061 .add(Segment)
36062 .setMemRefs(LoadOnlyMMO);
36063
36064 // Check if there is enough room left to pull this argument.
36065 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36066 .addReg(OffsetReg)
36067 .addImm(MaxOffset + 8 - ArgSizeA8);
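  // For example, for an 8-byte integer argument in GP mode (ArgSizeA8 = 8) the
  // bound is 48 + 8 - 8 = 48, so gp_offset values 0..40 fall through to
  // offsetMBB while 48 and above branch to overflowMBB.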
36068
36069 // Branch to "overflowMBB" if offset >= max
36070 // Fall through to "offsetMBB" otherwise
36071 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36072 .addMBB(overflowMBB).addImm(X86::COND_AE);
36073 }
36074
36075 // In offsetMBB, emit code to use the reg_save_area.
36076 if (offsetMBB) {
36077     assert(OffsetReg != 0);
36078
36079 // Read the reg_save_area address.
36080 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36081 BuildMI(
36082 offsetMBB, DL,
36083 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36084 RegSaveReg)
36085 .add(Base)
36086 .add(Scale)
36087 .add(Index)
36088 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36089 .add(Segment)
36090 .setMemRefs(LoadOnlyMMO);
36091
36092 if (Subtarget.isTarget64BitLP64()) {
36093 // Zero-extend the offset
36094 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36095 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36096 .addImm(0)
36097 .addReg(OffsetReg)
36098 .addImm(X86::sub_32bit);
36099
36100 // Add the offset to the reg_save_area to get the final address.
36101 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36102 .addReg(OffsetReg64)
36103 .addReg(RegSaveReg);
36104 } else {
36105 // Add the offset to the reg_save_area to get the final address.
36106 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36107 .addReg(OffsetReg)
36108 .addReg(RegSaveReg);
36109 }
36110
36111 // Compute the offset for the next argument
36112 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36113 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36114 .addReg(OffsetReg)
36115 .addImm(UseFPOffset ? 16 : 8);
36116
36117 // Store it back into the va_list.
36118 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36119 .add(Base)
36120 .add(Scale)
36121 .add(Index)
36122 .addDisp(Disp, UseFPOffset ? 4 : 0)
36123 .add(Segment)
36124 .addReg(NextOffsetReg)
36125 .setMemRefs(StoreOnlyMMO);
36126
36127 // Jump to endMBB
36128 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36129 .addMBB(endMBB);
36130 }
36131
36132 //
36133 // Emit code to use overflow area
36134 //
36135
36136 // Load the overflow_area address into a register.
36137 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36138 BuildMI(overflowMBB, DL,
36139 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36140 OverflowAddrReg)
36141 .add(Base)
36142 .add(Scale)
36143 .add(Index)
36144 .addDisp(Disp, 8)
36145 .add(Segment)
36146 .setMemRefs(LoadOnlyMMO);
36147
36148 // If we need to align it, do so. Otherwise, just copy the address
36149 // to OverflowDestReg.
36150 if (NeedsAlign) {
36151 // Align the overflow address
36152 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36153
36154 // aligned_addr = (addr + (align-1)) & ~(align-1)
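    // e.g. addr = 0x1004 with align = 16: (0x1004 + 0xF) & ~0xF = 0x1010.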
36155 BuildMI(
36156 overflowMBB, DL,
36157 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36158 TmpReg)
36159 .addReg(OverflowAddrReg)
36160 .addImm(Alignment.value() - 1);
36161
36162 BuildMI(
36163 overflowMBB, DL,
36164 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36165 OverflowDestReg)
36166 .addReg(TmpReg)
36167 .addImm(~(uint64_t)(Alignment.value() - 1));
36168 } else {
36169 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36170 .addReg(OverflowAddrReg);
36171 }
36172
36173 // Compute the next overflow address after this argument.
36174 // (the overflow address should be kept 8-byte aligned)
36175 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36176 BuildMI(
36177 overflowMBB, DL,
36178 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36179 NextAddrReg)
36180 .addReg(OverflowDestReg)
36181 .addImm(ArgSizeA8);
36182
36183 // Store the new overflow address.
36184 BuildMI(overflowMBB, DL,
36185 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36186 .add(Base)
36187 .add(Scale)
36188 .add(Index)
36189 .addDisp(Disp, 8)
36190 .add(Segment)
36191 .addReg(NextAddrReg)
36192 .setMemRefs(StoreOnlyMMO);
36193
36194 // If we branched, emit the PHI to the front of endMBB.
36195 if (offsetMBB) {
36196 BuildMI(*endMBB, endMBB->begin(), DL,
36197 TII->get(X86::PHI), DestReg)
36198 .addReg(OffsetDestReg).addMBB(offsetMBB)
36199 .addReg(OverflowDestReg).addMBB(overflowMBB);
36200 }
36201
36202 // Erase the pseudo instruction
36203 MI.eraseFromParent();
36204
36205 return endMBB;
36206}
36207
36208// The EFLAGS operand of SelectItr might be missing a kill marker
36209// because there were multiple uses of EFLAGS, and ISel didn't know
36210// which to mark. Figure out whether SelectItr should have had a
36211// kill marker, and set it if it should. Returns the correct kill
36212// marker value.
36213static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36214 MachineBasicBlock* BB,
36215 const TargetRegisterInfo* TRI) {
36216 if (isEFLAGSLiveAfter(SelectItr, BB))
36217 return false;
36218
36219 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36220 // out. SelectMI should have a kill flag on EFLAGS.
36221 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36222 return true;
36223}
36224
36225// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36226// together with other CMOV pseudo-opcodes into a single basic-block with
36227// a conditional jump around it.
36228static bool isCMOVPseudo(MachineInstr &MI) {
36229 switch (MI.getOpcode()) {
36230 case X86::CMOV_FR16:
36231 case X86::CMOV_FR16X:
36232 case X86::CMOV_FR32:
36233 case X86::CMOV_FR32X:
36234 case X86::CMOV_FR64:
36235 case X86::CMOV_FR64X:
36236 case X86::CMOV_GR8:
36237 case X86::CMOV_GR16:
36238 case X86::CMOV_GR32:
36239 case X86::CMOV_RFP32:
36240 case X86::CMOV_RFP64:
36241 case X86::CMOV_RFP80:
36242 case X86::CMOV_VR64:
36243 case X86::CMOV_VR128:
36244 case X86::CMOV_VR128X:
36245 case X86::CMOV_VR256:
36246 case X86::CMOV_VR256X:
36247 case X86::CMOV_VR512:
36248 case X86::CMOV_VK1:
36249 case X86::CMOV_VK2:
36250 case X86::CMOV_VK4:
36251 case X86::CMOV_VK8:
36252 case X86::CMOV_VK16:
36253 case X86::CMOV_VK32:
36254 case X86::CMOV_VK64:
36255 return true;
36256
36257 default:
36258 return false;
36259 }
36260}
36261
36262// Helper function, which inserts PHI functions into SinkMBB:
36263// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36264// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36265// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36266// the last PHI function inserted.
36267static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36268 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36269 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36270 MachineBasicBlock *SinkMBB) {
36271 MachineFunction *MF = TrueMBB->getParent();
36272 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36273 const DebugLoc &DL = MIItBegin->getDebugLoc();
36274
36275 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36276 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36277
36278 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36279
36280 // As we are creating the PHIs, we have to be careful if there is more than
36281 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36282 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36283 // That also means that PHI construction must work forward from earlier to
36284 // later, and that the code must maintain a mapping from earlier PHI's
36285 // destination registers, and the registers that went into the PHI.
36286 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36287 MachineInstrBuilder MIB;
36288
36289 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36290 Register DestReg = MIIt->getOperand(0).getReg();
36291 Register Op1Reg = MIIt->getOperand(1).getReg();
36292 Register Op2Reg = MIIt->getOperand(2).getReg();
36293
36294 // If this CMOV we are generating is the opposite condition from
36295 // the jump we generated, then we have to swap the operands for the
36296 // PHI that is going to be generated.
36297 if (MIIt->getOperand(3).getImm() == OppCC)
36298 std::swap(Op1Reg, Op2Reg);
36299
36300 if (RegRewriteTable.contains(Op1Reg))
36301 Op1Reg = RegRewriteTable[Op1Reg].first;
36302
36303 if (RegRewriteTable.contains(Op2Reg))
36304 Op2Reg = RegRewriteTable[Op2Reg].second;
36305
36306 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36307 .addReg(Op1Reg)
36308 .addMBB(FalseMBB)
36309 .addReg(Op2Reg)
36310 .addMBB(TrueMBB);
36311
36312 // Add this PHI to the rewrite table.
36313 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36314 }
36315
36316 return MIB;
36317}
36318
36319// Lower cascaded selects in the form of (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36320MachineBasicBlock *
36321X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36322 MachineInstr &SecondCascadedCMOV,
36323 MachineBasicBlock *ThisMBB) const {
36324 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36325 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36326
36327 // We lower cascaded CMOVs such as
36328 //
36329 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36330 //
36331 // to two successive branches.
36332 //
36333 // Without this, we would add a PHI between the two jumps, which ends up
36334 // creating a few copies all around. For instance, for
36335 //
36336 // (sitofp (zext (fcmp une)))
36337 //
36338 // we would generate:
36339 //
36340 // ucomiss %xmm1, %xmm0
36341 // movss <1.0f>, %xmm0
36342 // movaps %xmm0, %xmm1
36343 // jne .LBB5_2
36344 // xorps %xmm1, %xmm1
36345 // .LBB5_2:
36346 // jp .LBB5_4
36347 // movaps %xmm1, %xmm0
36348 // .LBB5_4:
36349 // retq
36350 //
36351 // because this custom-inserter would have generated:
36352 //
36353 // A
36354 // | \
36355 // | B
36356 // | /
36357 // C
36358 // | \
36359 // | D
36360 // | /
36361 // E
36362 //
36363 // A: X = ...; Y = ...
36364 // B: empty
36365 // C: Z = PHI [X, A], [Y, B]
36366 // D: empty
36367 // E: PHI [X, C], [Z, D]
36368 //
36369 // If we lower both CMOVs in a single step, we can instead generate:
36370 //
36371 // A
36372 // | \
36373 // | C
36374 // | /|
36375 // |/ |
36376 // | |
36377 // | D
36378 // | /
36379 // E
36380 //
36381 // A: X = ...; Y = ...
36382 // D: empty
36383 // E: PHI [X, A], [X, C], [Y, D]
36384 //
36385 // Which, in our sitofp/fcmp example, gives us something like:
36386 //
36387 // ucomiss %xmm1, %xmm0
36388 // movss <1.0f>, %xmm0
36389 // jne .LBB5_4
36390 // jp .LBB5_4
36391 // xorps %xmm0, %xmm0
36392 // .LBB5_4:
36393 // retq
36394 //
36395
36396 // We lower cascaded CMOV into two successive branches to the same block.
36397 // EFLAGS is used by both, so mark it as live in the second.
36398 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36399 MachineFunction *F = ThisMBB->getParent();
36400 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36401 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36402 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36403
36404 MachineFunction::iterator It = ++ThisMBB->getIterator();
36405 F->insert(It, FirstInsertedMBB);
36406 F->insert(It, SecondInsertedMBB);
36407 F->insert(It, SinkMBB);
36408
36409 // For a cascaded CMOV, we lower it to two successive branches to
36410 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36411 // the FirstInsertedMBB.
36412 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36413
36414 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36415 // live into the sink and copy blocks.
36416 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36417 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36418 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36419 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36420 SinkMBB->addLiveIn(X86::EFLAGS);
36421 }
36422
36423 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36424 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36425 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36426 ThisMBB->end());
36427 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36428
36429 // Fallthrough block for ThisMBB.
36430 ThisMBB->addSuccessor(FirstInsertedMBB);
36431 // The true block target of the first branch is always SinkMBB.
36432 ThisMBB->addSuccessor(SinkMBB);
36433 // Fallthrough block for FirstInsertedMBB.
36434 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36435 // The true block for the branch of FirstInsertedMBB.
36436 FirstInsertedMBB->addSuccessor(SinkMBB);
36437 // This is fallthrough.
36438 SecondInsertedMBB->addSuccessor(SinkMBB);
36439
36440 // Create the conditional branch instructions.
36441 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36442 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36443
36444 X86::CondCode SecondCC =
36445 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36446 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36447
36448 // SinkMBB:
36449 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36450 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36451 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36452 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36453 MachineInstrBuilder MIB =
36454 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36455 .addReg(Op1Reg)
36456 .addMBB(SecondInsertedMBB)
36457 .addReg(Op2Reg)
36458 .addMBB(ThisMBB);
36459
36460   // The edge from FirstInsertedMBB provides the same incoming value as the
36461   // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
36462 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36463
36464 // Now remove the CMOVs.
36465 FirstCMOV.eraseFromParent();
36466 SecondCascadedCMOV.eraseFromParent();
36467
36468 return SinkMBB;
36469}
36470
36471MachineBasicBlock *
36472X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36473 MachineBasicBlock *ThisMBB) const {
36474 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36475 const DebugLoc &DL = MI.getDebugLoc();
36476
36477 // To "insert" a SELECT_CC instruction, we actually have to insert the
36478 // diamond control-flow pattern. The incoming instruction knows the
36479 // destination vreg to set, the condition code register to branch on, the
36480 // true/false values to select between and a branch opcode to use.
36481
36482 // ThisMBB:
36483 // ...
36484 // TrueVal = ...
36485 // cmpTY ccX, r1, r2
36486 // bCC copy1MBB
36487 // fallthrough --> FalseMBB
36488
36489 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36490 // as described above, by inserting a BB, and then making a PHI at the join
36491 // point to select the true and false operands of the CMOV in the PHI.
36492 //
36493 // The code also handles two different cases of multiple CMOV opcodes
36494 // in a row.
36495 //
36496 // Case 1:
36497 // In this case, there are multiple CMOVs in a row, all which are based on
36498 // the same condition setting (or the exact opposite condition setting).
36499 // In this case we can lower all the CMOVs using a single inserted BB, and
36500 // then make a number of PHIs at the join point to model the CMOVs. The only
36501   // trickiness here is that in a case like:
36502 //
36503 // t2 = CMOV cond1 t1, f1
36504 // t3 = CMOV cond1 t2, f2
36505 //
36506 // when rewriting this into PHIs, we have to perform some renaming on the
36507 // temps since you cannot have a PHI operand refer to a PHI result earlier
36508 // in the same block. The "simple" but wrong lowering would be:
36509 //
36510 // t2 = PHI t1(BB1), f1(BB2)
36511 // t3 = PHI t2(BB1), f2(BB2)
36512 //
36513 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36514 // renaming is to note that on the path through BB1, t2 is really just a
36515 // copy of t1, and do that renaming, properly generating:
36516 //
36517 // t2 = PHI t1(BB1), f1(BB2)
36518 // t3 = PHI t1(BB1), f2(BB2)
36519 //
36520 // Case 2:
36521 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36522 // function - EmitLoweredCascadedSelect.
36523
36524 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36525 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36526 MachineInstr *LastCMOV = &MI;
36527 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36528
36529 // Check for case 1, where there are multiple CMOVs with the same condition
36530 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36531 // number of jumps the most.
36532
36533 if (isCMOVPseudo(MI)) {
36534 // See if we have a string of CMOVS with the same condition. Skip over
36535 // intervening debug insts.
36536 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36537 (NextMIIt->getOperand(3).getImm() == CC ||
36538 NextMIIt->getOperand(3).getImm() == OppCC)) {
36539 LastCMOV = &*NextMIIt;
36540 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36541 }
36542 }
36543
36544   // This checks for case 2, but only if we didn't already find case 1, as
36545   // indicated by LastCMOV still pointing at MI.
36546 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36547 NextMIIt->getOpcode() == MI.getOpcode() &&
36548 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36549 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36550 NextMIIt->getOperand(1).isKill()) {
36551 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36552 }
36553
36554 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36555 MachineFunction *F = ThisMBB->getParent();
36556 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36557 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36558
36559 MachineFunction::iterator It = ++ThisMBB->getIterator();
36560 F->insert(It, FalseMBB);
36561 F->insert(It, SinkMBB);
36562
36563 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36564 // live into the sink and copy blocks.
36565 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36566 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36567 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36568 FalseMBB->addLiveIn(X86::EFLAGS);
36569 SinkMBB->addLiveIn(X86::EFLAGS);
36570 }
36571
36572 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36573 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36574 MachineBasicBlock::iterator(LastCMOV));
36575 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36576 if (MI.isDebugInstr())
36577 SinkMBB->push_back(MI.removeFromParent());
36578
36579 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36580 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36581 std::next(MachineBasicBlock::iterator(LastCMOV)),
36582 ThisMBB->end());
36583 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36584
36585 // Fallthrough block for ThisMBB.
36586 ThisMBB->addSuccessor(FalseMBB);
36587   // The true block target of the first (or only) branch is always SinkMBB.
36588 ThisMBB->addSuccessor(SinkMBB);
36589 // Fallthrough block for FalseMBB.
36590 FalseMBB->addSuccessor(SinkMBB);
36591
36592 // Create the conditional branch instruction.
36593 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36594
36595 // SinkMBB:
36596 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36597 // ...
36598 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36599 MachineBasicBlock::iterator MIItEnd =
36600 std::next(MachineBasicBlock::iterator(LastCMOV));
36601 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36602
36603 // Now remove the CMOV(s).
36604 ThisMBB->erase(MIItBegin, MIItEnd);
36605
36606 return SinkMBB;
36607}
36608
36609static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36610 if (IsLP64) {
36611 if (isInt<8>(Imm))
36612 return X86::SUB64ri8;
36613 return X86::SUB64ri32;
36614 } else {
36615 if (isInt<8>(Imm))
36616 return X86::SUB32ri8;
36617 return X86::SUB32ri;
36618 }
36619}
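// For example, getSUBriOpcode(true, 64) yields X86::SUB64ri8 because 64 fits in
// a signed 8-bit immediate, while getSUBriOpcode(true, 4096) yields
// X86::SUB64ri32.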
36620
36621MachineBasicBlock *
36622X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36623 MachineBasicBlock *MBB) const {
36624 MachineFunction *MF = MBB->getParent();
36625 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36626 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36627 const DebugLoc &DL = MI.getDebugLoc();
36628 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36629
36630 const unsigned ProbeSize = getStackProbeSize(*MF);
36631
36632 MachineRegisterInfo &MRI = MF->getRegInfo();
36633 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36634 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36635 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36636
36637 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36638 MF->insert(MBBIter, testMBB);
36639 MF->insert(MBBIter, blockMBB);
36640 MF->insert(MBBIter, tailMBB);
36641
36642 Register sizeVReg = MI.getOperand(1).getReg();
36643
36644 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36645
36646 Register TmpStackPtr = MRI.createVirtualRegister(
36647 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36648 Register FinalStackPtr = MRI.createVirtualRegister(
36649 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36650
36651 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36652 .addReg(physSPReg);
36653 {
36654 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36655 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36656 .addReg(TmpStackPtr)
36657 .addReg(sizeVReg);
36658 }
36659
36660 // test rsp size
36661
36662 BuildMI(testMBB, DL,
36663 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36664 .addReg(FinalStackPtr)
36665 .addReg(physSPReg);
36666
36667 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36668 .addMBB(tailMBB)
36669 .addImm(X86::COND_GE);
36670 testMBB->addSuccessor(blockMBB);
36671 testMBB->addSuccessor(tailMBB);
36672
36673   // Touch the block, then extend it. This is done in the opposite order from a
36674   // static probe, where we allocate and then touch, to avoid having to probe
36675   // the tail of the static alloca. Possible scenarios are:
36676 //
36677 // + ---- <- ------------ <- ------------- <- ------------ +
36678 // | |
36679 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36680 // | |
36681 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36682 //
36683 // The property we want to enforce is to never have more than [page alloc] between two probes.
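  // For example, assuming ProbeSize = 4096, a 10000-byte request runs blockMBB
  // three times (touch the stack, drop the stack pointer by 4096, repeat until
  // 12288 >= 10000), so no page is allocated without being touched.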
36684
36685 const unsigned XORMIOpc =
36686 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36687 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36688 .addImm(0);
36689
36690 BuildMI(blockMBB, DL,
36691 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36692 .addReg(physSPReg)
36693 .addImm(ProbeSize);
36694
36695
36696 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36697 blockMBB->addSuccessor(testMBB);
36698
36699 // Replace original instruction by the expected stack ptr
36700 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36701 .addReg(FinalStackPtr);
36702
36703 tailMBB->splice(tailMBB->end(), MBB,
36704 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36705 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36706 MBB->addSuccessor(testMBB);
36707
36708 // Delete the original pseudo instruction.
36709 MI.eraseFromParent();
36710
36711 // And we're done.
36712 return tailMBB;
36713}
36714
36715MachineBasicBlock *
36716X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36717 MachineBasicBlock *BB) const {
36718 MachineFunction *MF = BB->getParent();
36719 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36720 const DebugLoc &DL = MI.getDebugLoc();
36721 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36722
36723   assert(MF->shouldSplitStack());
36724
36725 const bool Is64Bit = Subtarget.is64Bit();
36726 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36727
36728 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36729 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
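  // With these values the stack-limit compare below reads, e.g., %fs:0x70 on
  // LP64 targets and %gs:0x30 on 32-bit targets.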
36730
36731 // BB:
36732 // ... [Till the alloca]
36733 // If stacklet is not large enough, jump to mallocMBB
36734 //
36735 // bumpMBB:
36736 // Allocate by subtracting from RSP
36737 // Jump to continueMBB
36738 //
36739 // mallocMBB:
36740 // Allocate by call to runtime
36741 //
36742 // continueMBB:
36743 // ...
36744 // [rest of original BB]
36745 //
36746
36747 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36748 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36749 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36750
36751 MachineRegisterInfo &MRI = MF->getRegInfo();
36752 const TargetRegisterClass *AddrRegClass =
36753 getRegClassFor(getPointerTy(MF->getDataLayout()));
36754
36755 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36756 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36757 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36758 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36759 sizeVReg = MI.getOperand(1).getReg(),
36760 physSPReg =
36761 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36762
36763 MachineFunction::iterator MBBIter = ++BB->getIterator();
36764
36765 MF->insert(MBBIter, bumpMBB);
36766 MF->insert(MBBIter, mallocMBB);
36767 MF->insert(MBBIter, continueMBB);
36768
36769 continueMBB->splice(continueMBB->begin(), BB,
36770 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36771 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36772
36773 // Add code to the main basic block to check if the stack limit has been hit,
36774 // and if so, jump to mallocMBB otherwise to bumpMBB.
36775 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36776 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36777 .addReg(tmpSPVReg).addReg(sizeVReg);
36778 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36779 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36780 .addReg(SPLimitVReg);
36781 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36782
36783 // bumpMBB simply decreases the stack pointer, since we know the current
36784 // stacklet has enough space.
36785 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36786 .addReg(SPLimitVReg);
36787 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36788 .addReg(SPLimitVReg);
36789 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36790
36791 // Calls into a routine in libgcc to allocate more space from the heap.
36792 const uint32_t *RegMask =
36793 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36794 if (IsLP64) {
36795 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36796 .addReg(sizeVReg);
36797 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36798 .addExternalSymbol("__morestack_allocate_stack_space")
36799 .addRegMask(RegMask)
36800 .addReg(X86::RDI, RegState::Implicit)
36801 .addReg(X86::RAX, RegState::ImplicitDefine);
36802 } else if (Is64Bit) {
36803 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36804 .addReg(sizeVReg);
36805 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36806 .addExternalSymbol("__morestack_allocate_stack_space")
36807 .addRegMask(RegMask)
36808 .addReg(X86::EDI, RegState::Implicit)
36809 .addReg(X86::EAX, RegState::ImplicitDefine);
36810 } else {
36811 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36812 .addImm(12);
36813 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36814 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36815 .addExternalSymbol("__morestack_allocate_stack_space")
36816 .addRegMask(RegMask)
36817 .addReg(X86::EAX, RegState::ImplicitDefine);
36818 }
36819
36820 if (!Is64Bit)
36821 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36822 .addImm(16);
36823
36824 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36825 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36826 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36827
36828 // Set up the CFG correctly.
36829 BB->addSuccessor(bumpMBB);
36830 BB->addSuccessor(mallocMBB);
36831 mallocMBB->addSuccessor(continueMBB);
36832 bumpMBB->addSuccessor(continueMBB);
36833
36834 // Take care of the PHI nodes.
36835 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36836 MI.getOperand(0).getReg())
36837 .addReg(mallocPtrVReg)
36838 .addMBB(mallocMBB)
36839 .addReg(bumpSPPtrVReg)
36840 .addMBB(bumpMBB);
36841
36842 // Delete the original pseudo instruction.
36843 MI.eraseFromParent();
36844
36845 // And we're done.
36846 return continueMBB;
36847}
36848
36849MachineBasicBlock *
36850X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36851 MachineBasicBlock *BB) const {
36852 MachineFunction *MF = BB->getParent();
36853 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36854 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36855 const DebugLoc &DL = MI.getDebugLoc();
36856
36857   assert(!isAsynchronousEHPersonality(
36858              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36859          "SEH does not use catchret!");
36860
36861 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36862 if (!Subtarget.is32Bit())
36863 return BB;
36864
36865 // C++ EH creates a new target block to hold the restore code, and wires up
36866 // the new block to the return destination with a normal JMP_4.
36867 MachineBasicBlock *RestoreMBB =
36868 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36869   assert(BB->succ_size() == 1);
36870 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36871 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36872 BB->addSuccessor(RestoreMBB);
36873 MI.getOperand(0).setMBB(RestoreMBB);
36874
36875 // Marking this as an EH pad but not a funclet entry block causes PEI to
36876 // restore stack pointers in the block.
36877 RestoreMBB->setIsEHPad(true);
36878
36879 auto RestoreMBBI = RestoreMBB->begin();
36880 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36881 return BB;
36882}
36883
36884MachineBasicBlock *
36885X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36886 MachineBasicBlock *BB) const {
36887 // So, here we replace TLSADDR with the sequence:
36888 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36889 // We need this because TLSADDR is lowered into calls
36890 // inside MC, therefore without the two markers shrink-wrapping
36891   // may push the prologue/epilogue past them.
36892 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36893 const DebugLoc &DL = MI.getDebugLoc();
36894 MachineFunction &MF = *BB->getParent();
36895
36896 // Emit CALLSEQ_START right before the instruction.
36897 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36898 MachineInstrBuilder CallseqStart =
36899 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36900 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36901
36902 // Emit CALLSEQ_END right after the instruction.
36903 // We don't call erase from parent because we want to keep the
36904 // original instruction around.
36905 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36906 MachineInstrBuilder CallseqEnd =
36907 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36908 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36909
36910 return BB;
36911}
36912
36913MachineBasicBlock *
36914X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36915 MachineBasicBlock *BB) const {
36916 // This is pretty easy. We're taking the value that we received from
36917 // our load from the relocation, sticking it in either RDI (x86-64)
36918 // or EAX and doing an indirect call. The return value will then
36919 // be in the normal return register.
36920 MachineFunction *F = BB->getParent();
36921 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36922 const DebugLoc &DL = MI.getDebugLoc();
36923
36924   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36925   assert(MI.getOperand(3).isGlobal() && "This should be a global");
36926
36927 // Get a register mask for the lowered call.
36928 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36929 // proper register mask.
36930 const uint32_t *RegMask =
36931 Subtarget.is64Bit() ?
36932 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36933 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36934 if (Subtarget.is64Bit()) {
36935 MachineInstrBuilder MIB =
36936 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36937 .addReg(X86::RIP)
36938 .addImm(0)
36939 .addReg(0)
36940 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36941 MI.getOperand(3).getTargetFlags())
36942 .addReg(0);
36943 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36944 addDirectMem(MIB, X86::RDI);
36945 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36946 } else if (!isPositionIndependent()) {
36947 MachineInstrBuilder MIB =
36948 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36949 .addReg(0)
36950 .addImm(0)
36951 .addReg(0)
36952 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36953 MI.getOperand(3).getTargetFlags())
36954 .addReg(0);
36955 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36956 addDirectMem(MIB, X86::EAX);
36957 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36958 } else {
36959 MachineInstrBuilder MIB =
36960 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36961 .addReg(TII->getGlobalBaseReg(F))
36962 .addImm(0)
36963 .addReg(0)
36964 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36965 MI.getOperand(3).getTargetFlags())
36966 .addReg(0);
36967 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36968 addDirectMem(MIB, X86::EAX);
36969 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36970 }
36971
36972 MI.eraseFromParent(); // The pseudo instruction is gone now.
36973 return BB;
36974}
36975
36976static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36977 switch (RPOpc) {
36978 case X86::INDIRECT_THUNK_CALL32:
36979 return X86::CALLpcrel32;
36980 case X86::INDIRECT_THUNK_CALL64:
36981 return X86::CALL64pcrel32;
36982 case X86::INDIRECT_THUNK_TCRETURN32:
36983 return X86::TCRETURNdi;
36984 case X86::INDIRECT_THUNK_TCRETURN64:
36985 return X86::TCRETURNdi64;
36986 }
36987   llvm_unreachable("not indirect thunk opcode");
36988}
36989
36990static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36991 unsigned Reg) {
36992 if (Subtarget.useRetpolineExternalThunk()) {
36993 // When using an external thunk for retpolines, we pick names that match the
36994 // names GCC happens to use as well. This helps simplify the implementation
36995 // of the thunks for kernels where they have no easy ability to create
36996 // aliases and are doing non-trivial configuration of the thunk's body. For
36997 // example, the Linux kernel will do boot-time hot patching of the thunk
36998 // bodies and cannot easily export aliases of these to loaded modules.
36999 //
37000 // Note that at any point in the future, we may need to change the semantics
37001 // of how we implement retpolines and at that time will likely change the
37002 // name of the called thunk. Essentially, there is no hard guarantee that
37003 // LLVM will generate calls to specific thunks, we merely make a best-effort
37004 // attempt to help out kernels and other systems where duplicating the
37005 // thunks is costly.
37006 switch (Reg) {
37007 case X86::EAX:
37008       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37009 return "__x86_indirect_thunk_eax";
37010 case X86::ECX:
37011       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37012 return "__x86_indirect_thunk_ecx";
37013 case X86::EDX:
37014       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37015 return "__x86_indirect_thunk_edx";
37016 case X86::EDI:
37017       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37018 return "__x86_indirect_thunk_edi";
37019 case X86::R11:
37020       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37021 return "__x86_indirect_thunk_r11";
37022 }
37023     llvm_unreachable("unexpected reg for external indirect thunk");
37024 }
37025
37026 if (Subtarget.useRetpolineIndirectCalls() ||
37027 Subtarget.useRetpolineIndirectBranches()) {
37028 // When targeting an internal COMDAT thunk use an LLVM-specific name.
37029 switch (Reg) {
37030 case X86::EAX:
37031       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37032 return "__llvm_retpoline_eax";
37033 case X86::ECX:
37034       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37035 return "__llvm_retpoline_ecx";
37036 case X86::EDX:
37037       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37038 return "__llvm_retpoline_edx";
37039 case X86::EDI:
37040       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37041 return "__llvm_retpoline_edi";
37042 case X86::R11:
37043       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37044 return "__llvm_retpoline_r11";
37045 }
37046     llvm_unreachable("unexpected reg for retpoline");
37047 }
37048
37049 if (Subtarget.useLVIControlFlowIntegrity()) {
37050     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37051 return "__llvm_lvi_thunk_r11";
37052 }
37053   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
37054}
37055
37056MachineBasicBlock *
37057X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37058 MachineBasicBlock *BB) const {
37059 // Copy the virtual register into the R11 physical register and
37060 // call the retpoline thunk.
37061 const DebugLoc &DL = MI.getDebugLoc();
37062 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37063 Register CalleeVReg = MI.getOperand(0).getReg();
37064 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37065
37066 // Find an available scratch register to hold the callee. On 64-bit, we can
37067 // just use R11, but we scan for uses anyway to ensure we don't generate
37068 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37069 // already a register use operand to the call to hold the callee. If none
37070 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37071 // register and ESI is the base pointer to realigned stack frames with VLAs.
37072 SmallVector<unsigned, 3> AvailableRegs;
37073 if (Subtarget.is64Bit())
37074 AvailableRegs.push_back(X86::R11);
37075 else
37076 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37077
37078 // Zero out any registers that are already used.
37079 for (const auto &MO : MI.operands()) {
37080 if (MO.isReg() && MO.isUse())
37081 for (unsigned &Reg : AvailableRegs)
37082 if (Reg == MO.getReg())
37083 Reg = 0;
37084 }
37085
37086 // Choose the first remaining non-zero available register.
37087 unsigned AvailableReg = 0;
37088 for (unsigned MaybeReg : AvailableRegs) {
37089 if (MaybeReg) {
37090 AvailableReg = MaybeReg;
37091 break;
37092 }
37093 }
37094 if (!AvailableReg)
37095 report_fatal_error("calling convention incompatible with retpoline, no "
37096 "available registers");
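  // For example, a 32-bit indirect call that already uses ECX as an argument
  // has its ECX slot zeroed above, so EAX (or EDX/EDI) is selected instead.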
37097
37098 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37099
37100 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37101 .addReg(CalleeVReg);
37102 MI.getOperand(0).ChangeToES(Symbol);
37103 MI.setDesc(TII->get(Opc));
37104 MachineInstrBuilder(*BB->getParent(), &MI)
37105 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37106 return BB;
37107}
37108
37109/// SetJmp implies future control flow change upon calling the corresponding
37110/// LongJmp.
37111/// Instead of using the 'return' instruction, the long jump fixes the stack and
37112/// performs an indirect branch. To do so it uses the registers that were stored
37113/// in the jump buffer (when calling SetJmp).
37114/// In case the shadow stack is enabled we need to fix it as well, because some
37115/// return addresses will be skipped.
37116/// The function will save the SSP for future fixing in the function
37117/// emitLongJmpShadowStackFix.
37118/// \sa emitLongJmpShadowStackFix
37119/// \param [in] MI The temporary Machine Instruction for the builtin.
37120/// \param [in] MBB The Machine Basic Block that will be modified.
37121void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37122 MachineBasicBlock *MBB) const {
37123 const DebugLoc &DL = MI.getDebugLoc();
37124 MachineFunction *MF = MBB->getParent();
37125 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37126 MachineRegisterInfo &MRI = MF->getRegInfo();
37127 MachineInstrBuilder MIB;
37128
37129 // Memory Reference.
37130 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37131 MI.memoperands_end());
37132
37133 // Initialize a register with zero.
37134 MVT PVT = getPointerTy(MF->getDataLayout());
37135 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37136 Register ZReg = MRI.createVirtualRegister(PtrRC);
37137 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37138 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37139 .addDef(ZReg)
37140 .addReg(ZReg, RegState::Undef)
37141 .addReg(ZReg, RegState::Undef);
37142
37143 // Read the current SSP Register value to the zeroed register.
37144 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37145 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37146 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37147
37148 // Write the SSP register value to offset 3 in input memory buffer.
37149 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37150 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37151 const int64_t SSPOffset = 3 * PVT.getStoreSize();
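  // With PVT == MVT::i64 this stores the SSP at buf + 24, and with MVT::i32 at
  // buf + 12 (slot 3 of the setjmp buffer either way).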
37152 const unsigned MemOpndSlot = 1;
37153 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37154 if (i == X86::AddrDisp)
37155 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37156 else
37157 MIB.add(MI.getOperand(MemOpndSlot + i));
37158 }
37159 MIB.addReg(SSPCopyReg);
37160 MIB.setMemRefs(MMOs);
37161}
37162
37163MachineBasicBlock *
37164X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37165 MachineBasicBlock *MBB) const {
37166 const DebugLoc &DL = MI.getDebugLoc();
37167 MachineFunction *MF = MBB->getParent();
37168 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37169 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37170 MachineRegisterInfo &MRI = MF->getRegInfo();
37171
37172 const BasicBlock *BB = MBB->getBasicBlock();
37173 MachineFunction::iterator I = ++MBB->getIterator();
37174
37175 // Memory Reference
37176 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37177 MI.memoperands_end());
37178
37179 unsigned DstReg;
37180 unsigned MemOpndSlot = 0;
37181
37182 unsigned CurOp = 0;
37183
37184 DstReg = MI.getOperand(CurOp++).getReg();
37185 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37186   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37187 (void)TRI;
37188 Register mainDstReg = MRI.createVirtualRegister(RC);
37189 Register restoreDstReg = MRI.createVirtualRegister(RC);
37190
37191 MemOpndSlot = CurOp;
37192
37193 MVT PVT = getPointerTy(MF->getDataLayout());
37194   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37195          "Invalid Pointer Size!");
37196
37197 // For v = setjmp(buf), we generate
37198 //
37199 // thisMBB:
37200 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37201 // SjLjSetup restoreMBB
37202 //
37203 // mainMBB:
37204 // v_main = 0
37205 //
37206 // sinkMBB:
37207 // v = phi(main, restore)
37208 //
37209 // restoreMBB:
37210 // if base pointer being used, load it from frame
37211 // v_restore = 1
37212
37213 MachineBasicBlock *thisMBB = MBB;
37214 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37215 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37216 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37217 MF->insert(I, mainMBB);
37218 MF->insert(I, sinkMBB);
37219 MF->push_back(restoreMBB);
37220 restoreMBB->setMachineBlockAddressTaken();
37221
37222 MachineInstrBuilder MIB;
37223
37224 // Transfer the remainder of BB and its successor edges to sinkMBB.
37225 sinkMBB->splice(sinkMBB->begin(), MBB,
37226 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37227 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37228
37229 // thisMBB:
37230 unsigned PtrStoreOpc = 0;
37231 unsigned LabelReg = 0;
37232 const int64_t LabelOffset = 1 * PVT.getStoreSize();
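  // LabelOffset is 8 bytes on 64-bit targets and 4 bytes on 32-bit targets,
  // i.e. the resume address below is written into slot 1 of the setjmp buffer.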
37233 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37234 !isPositionIndependent();
37235
37236 // Prepare IP either in reg or imm.
37237 if (!UseImmLabel) {
37238 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37239 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37240 LabelReg = MRI.createVirtualRegister(PtrRC);
37241 if (Subtarget.is64Bit()) {
37242 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37243 .addReg(X86::RIP)
37244 .addImm(0)
37245 .addReg(0)
37246 .addMBB(restoreMBB)
37247 .addReg(0);
37248 } else {
37249 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37250 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37251 .addReg(XII->getGlobalBaseReg(MF))
37252 .addImm(0)
37253 .addReg(0)
37254 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37255 .addReg(0);
37256 }
37257 } else
37258 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37259 // Store IP
37260 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37261 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37262 if (i == X86::AddrDisp)
37263 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37264 else
37265 MIB.add(MI.getOperand(MemOpndSlot + i));
37266 }
37267 if (!UseImmLabel)
37268 MIB.addReg(LabelReg);
37269 else
37270 MIB.addMBB(restoreMBB);
37271 MIB.setMemRefs(MMOs);
37272
37273 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37274 emitSetJmpShadowStackFix(MI, thisMBB);
37275 }
37276
37277 // Setup
37278 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37279 .addMBB(restoreMBB);
37280
37281 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37282 MIB.addRegMask(RegInfo->getNoPreservedMask());
37283 thisMBB->addSuccessor(mainMBB);
37284 thisMBB->addSuccessor(restoreMBB);
37285
37286 // mainMBB:
37287 // EAX = 0
37288 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37289 mainMBB->addSuccessor(sinkMBB);
37290
37291 // sinkMBB:
37292 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37293 TII->get(X86::PHI), DstReg)
37294 .addReg(mainDstReg).addMBB(mainMBB)
37295 .addReg(restoreDstReg).addMBB(restoreMBB);
37296
37297 // restoreMBB:
37298 if (RegInfo->hasBasePointer(*MF)) {
37299 const bool Uses64BitFramePtr =
37300 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37301 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37302 X86FI->setRestoreBasePointer(MF);
37303 Register FramePtr = RegInfo->getFrameRegister(*MF);
37304 Register BasePtr = RegInfo->getBaseRegister();
37305 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37306 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37307 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37308 .setMIFlag(MachineInstr::FrameSetup);
37309 }
37310 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37311 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37312 restoreMBB->addSuccessor(sinkMBB);
37313
37314 MI.eraseFromParent();
37315 return sinkMBB;
37316}
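// Editor's note (illustrative, not part of the original source): the blocks
// built above implement the usual setjmp return-value contract. Sketched in
// C-like pseudocode, with names chosen only for illustration:
//
//   int v = __builtin_setjmp(buf);   // fall-through path: mainMBB sets v = 0
//   ...                              // a later longjmp lands in restoreMBB,
//                                    // which sets v = 1 and jumps to sinkMBB
//   // sinkMBB's PHI merges the two values into DstReg.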
37317
37318/// Fix the shadow stack using the previously saved SSP pointer.
37319/// \sa emitSetJmpShadowStackFix
37320/// \param [in] MI The temporary Machine Instruction for the builtin.
37321/// \param [in] MBB The Machine Basic Block that will be modified.
37322/// \return The sink MBB that will perform the future indirect branch.
37323MachineBasicBlock *
37324X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37325 MachineBasicBlock *MBB) const {
37326 const DebugLoc &DL = MI.getDebugLoc();
37327 MachineFunction *MF = MBB->getParent();
37328 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37329 MachineRegisterInfo &MRI = MF->getRegInfo();
37330
37331 // Memory Reference
37332 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37333 MI.memoperands_end());
37334
37335 MVT PVT = getPointerTy(MF->getDataLayout());
37336 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37337
37338 // checkSspMBB:
37339 // xor vreg1, vreg1
37340 // rdssp vreg1
37341 // test vreg1, vreg1
37342 // je sinkMBB # Jump if Shadow Stack is not supported
37343 // fallMBB:
37344 // mov buf+24/12(%rip), vreg2
37345 // sub vreg1, vreg2
37346 // jbe sinkMBB # No need to fix the Shadow Stack
37347 // fixShadowMBB:
37348 // shr 3/2, vreg2
37349 // incssp vreg2 # fix the SSP according to the lower 8 bits
37350 // shr 8, vreg2
37351 // je sinkMBB
37352 // fixShadowLoopPrepareMBB:
37353 // shl vreg2
37354 // mov 128, vreg3
37355 // fixShadowLoopMBB:
37356 // incssp vreg3
37357 // dec vreg2
37358 // jne fixShadowLoopMBB # Iterate until you finish fixing
37359 // # the Shadow Stack
37360 // sinkMBB:
37361
37362 MachineFunction::iterator I = ++MBB->getIterator();
37363 const BasicBlock *BB = MBB->getBasicBlock();
37364
37365 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37366 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37367 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37368 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37369 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37370 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37371 MF->insert(I, checkSspMBB);
37372 MF->insert(I, fallMBB);
37373 MF->insert(I, fixShadowMBB);
37374 MF->insert(I, fixShadowLoopPrepareMBB);
37375 MF->insert(I, fixShadowLoopMBB);
37376 MF->insert(I, sinkMBB);
37377
37378 // Transfer the remainder of BB and its successor edges to sinkMBB.
37379 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37380 MBB->end());
37381 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37382
37383 MBB->addSuccessor(checkSspMBB);
37384
37385 // Initialize a register with zero.
37386 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37387 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37388
37389 if (PVT == MVT::i64) {
37390 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37391 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37392 .addImm(0)
37393 .addReg(ZReg)
37394 .addImm(X86::sub_32bit);
37395 ZReg = TmpZReg;
37396 }
37397
37398 // Read the current SSP Register value to the zeroed register.
37399 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37400 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37401 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37402
37403 // Check whether the result of the SSP register is zero and jump directly
37404 // to the sink.
37405 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37406 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37407 .addReg(SSPCopyReg)
37408 .addReg(SSPCopyReg);
37409 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37410 checkSspMBB->addSuccessor(sinkMBB);
37411 checkSspMBB->addSuccessor(fallMBB);
37412
37413 // Reload the previously saved SSP register value.
37414 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37415 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37416 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37417 MachineInstrBuilder MIB =
37418 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37419 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37420 const MachineOperand &MO = MI.getOperand(i);
37421 if (i == X86::AddrDisp)
37422 MIB.addDisp(MO, SPPOffset);
37423 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37424 // preserve kill flags.
37425 MIB.addReg(MO.getReg());
37426 else
37427 MIB.add(MO);
37428 }
37429 MIB.setMemRefs(MMOs);
37430
37431 // Subtract the current SSP from the previous SSP.
37432 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37433 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37434 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37435 .addReg(PrevSSPReg)
37436 .addReg(SSPCopyReg);
37437
37438 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37439 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37440 fallMBB->addSuccessor(sinkMBB);
37441 fallMBB->addSuccessor(fixShadowMBB);
37442
37443 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37444 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37445 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37446 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37447 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37448 .addReg(SspSubReg)
37449 .addImm(Offset);
37450
37451   // Increase the SSP, looking only at the lower 8 bits of the delta.
37452 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37453 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37454
37455 // Reset the lower 8 bits.
37456 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37457 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37458 .addReg(SspFirstShrReg)
37459 .addImm(8);
37460
37461 // Jump if the result of the shift is zero.
37462 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37463 fixShadowMBB->addSuccessor(sinkMBB);
37464 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37465
37466 // Do a single shift left.
37467 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37468 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37469 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37470 .addReg(SspSecondShrReg);
37471
37472 // Save the value 128 to a register (will be used next with incssp).
37473 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37474 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37475 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37476 .addImm(128);
37477 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37478
37479 // Since incssp only looks at the lower 8 bits, we might need to do several
37480 // iterations of incssp until we finish fixing the shadow stack.
37481 Register DecReg = MRI.createVirtualRegister(PtrRC);
37482 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37483 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37484 .addReg(SspAfterShlReg)
37485 .addMBB(fixShadowLoopPrepareMBB)
37486 .addReg(DecReg)
37487 .addMBB(fixShadowLoopMBB);
37488
37489 // Every iteration we increase the SSP by 128.
37490 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37491
37492 // Every iteration we decrement the counter by 1.
37493 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37494 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37495
37496 // Jump if the counter is not zero yet.
37497 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37498 fixShadowLoopMBB->addSuccessor(sinkMBB);
37499 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37500
37501 return sinkMBB;
37502}
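// Editor's sketch (assumption, not part of the original source): the scalar
// arithmetic that the MIR above implements for the 64-bit case, with the
// INCSSP instruction modeled by a hypothetical callable that advances the
// shadow stack by (n & 0xff) entries. Relies on <cstdint>, which this TU
// already pulls in transitively.
static inline void fixShadowStackSketch(uint64_t PrevSSP, uint64_t CurSSP,
                                        void (*IncSsp)(uint64_t)) {
  uint64_t Delta = (PrevSSP - CurSSP) >> 3; // 8 bytes per shadow-stack entry.
  IncSsp(Delta & 0xff);                     // Fix the low 8 bits first.
  // Each loop iteration advances by 128 entries; the counter is doubled so
  // that (Delta >> 8) * 256 entries are consumed in total.
  for (uint64_t N = (Delta >> 8) << 1; N != 0; --N)
    IncSsp(128);
}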
37503
37504MachineBasicBlock *
37505X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37506 MachineBasicBlock *MBB) const {
37507 const DebugLoc &DL = MI.getDebugLoc();
37508 MachineFunction *MF = MBB->getParent();
37509 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37510 MachineRegisterInfo &MRI = MF->getRegInfo();
37511
37512 // Memory Reference
37513 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37514 MI.memoperands_end());
37515
37516 MVT PVT = getPointerTy(MF->getDataLayout());
37517   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37518          "Invalid Pointer Size!");
37519
37520 const TargetRegisterClass *RC =
37521 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37522 Register Tmp = MRI.createVirtualRegister(RC);
37523 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37524 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37525 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37526 Register SP = RegInfo->getStackRegister();
37527
37528 MachineInstrBuilder MIB;
37529
37530 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37531 const int64_t SPOffset = 2 * PVT.getStoreSize();
37532
37533 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37534 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37535
37536 MachineBasicBlock *thisMBB = MBB;
37537
37538   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37539 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37540 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37541 }
37542
37543 // Reload FP
37544 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37545 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37546 const MachineOperand &MO = MI.getOperand(i);
37547 if (MO.isReg()) // Don't add the whole operand, we don't want to
37548 // preserve kill flags.
37549 MIB.addReg(MO.getReg());
37550 else
37551 MIB.add(MO);
37552 }
37553 MIB.setMemRefs(MMOs);
37554
37555 // Reload IP
37556 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37557 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37558 const MachineOperand &MO = MI.getOperand(i);
37559 if (i == X86::AddrDisp)
37560 MIB.addDisp(MO, LabelOffset);
37561 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37562 // preserve kill flags.
37563 MIB.addReg(MO.getReg());
37564 else
37565 MIB.add(MO);
37566 }
37567 MIB.setMemRefs(MMOs);
37568
37569 // Reload SP
37570 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37571 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37572 if (i == X86::AddrDisp)
37573 MIB.addDisp(MI.getOperand(i), SPOffset);
37574 else
37575 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37576 // the last instruction of the expansion.
37577 }
37578 MIB.setMemRefs(MMOs);
37579
37580 // Jump
37581 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37582
37583 MI.eraseFromParent();
37584 return thisMBB;
37585}
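// Editor's note (derived from the offsets used in this file; not part of the
// original source): the SjLj buffer consulted above is laid out in
// pointer-sized slots as
//   slot 0: frame pointer        (reloaded into FP)
//   slot 1: resume address       (LabelOffset; reloaded into Tmp, then jumped to)
//   slot 2: stack pointer        (SPOffset)
//   slot 3: previous shadow SSP  (SPPOffset, used by emitLongJmpShadowStackFix
//                                 when "cf-protection-return" is set)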
37586
37587void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37588 MachineBasicBlock *MBB,
37589 MachineBasicBlock *DispatchBB,
37590 int FI) const {
37591 const DebugLoc &DL = MI.getDebugLoc();
37592 MachineFunction *MF = MBB->getParent();
37593 MachineRegisterInfo *MRI = &MF->getRegInfo();
37594 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37595
37596 MVT PVT = getPointerTy(MF->getDataLayout());
37597   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37598
37599 unsigned Op = 0;
37600 unsigned VR = 0;
37601
37602 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37603 !isPositionIndependent();
37604
37605 if (UseImmLabel) {
37606 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37607 } else {
37608 const TargetRegisterClass *TRC =
37609 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37610 VR = MRI->createVirtualRegister(TRC);
37611 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37612
37613 if (Subtarget.is64Bit())
37614 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37615 .addReg(X86::RIP)
37616 .addImm(1)
37617 .addReg(0)
37618 .addMBB(DispatchBB)
37619 .addReg(0);
37620 else
37621 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37622 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37623 .addImm(1)
37624 .addReg(0)
37625 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37626 .addReg(0);
37627 }
37628
37629 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37630 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37631 if (UseImmLabel)
37632 MIB.addMBB(DispatchBB);
37633 else
37634 MIB.addReg(VR);
37635}
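// Editor's note (summary of the function above, not part of the original
// source): the dispatch block's address is written into the function context
// at offset 56 (64-bit) or 36 (32-bit) from FI, either as a direct immediate
// (small, non-PIC code model) or through a virtual register computed with LEA.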
37636
37637MachineBasicBlock *
37638X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37639 MachineBasicBlock *BB) const {
37640 const DebugLoc &DL = MI.getDebugLoc();
37641 MachineFunction *MF = BB->getParent();
37642 MachineRegisterInfo *MRI = &MF->getRegInfo();
37643 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37644 int FI = MF->getFrameInfo().getFunctionContextIndex();
37645
37646 // Get a mapping of the call site numbers to all of the landing pads they're
37647 // associated with.
37648 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37649 unsigned MaxCSNum = 0;
37650 for (auto &MBB : *MF) {
37651 if (!MBB.isEHPad())
37652 continue;
37653
37654 MCSymbol *Sym = nullptr;
37655 for (const auto &MI : MBB) {
37656 if (MI.isDebugInstr())
37657 continue;
37658
37659       assert(MI.isEHLabel() && "expected EH_LABEL");
37660 Sym = MI.getOperand(0).getMCSymbol();
37661 break;
37662 }
37663
37664 if (!MF->hasCallSiteLandingPad(Sym))
37665 continue;
37666
37667 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37668 CallSiteNumToLPad[CSI].push_back(&MBB);
37669 MaxCSNum = std::max(MaxCSNum, CSI);
37670 }
37671 }
37672
37673 // Get an ordered list of the machine basic blocks for the jump table.
37674 std::vector<MachineBasicBlock *> LPadList;
37675 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37676 LPadList.reserve(CallSiteNumToLPad.size());
37677
37678 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37679 for (auto &LP : CallSiteNumToLPad[CSI]) {
37680 LPadList.push_back(LP);
37681 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37682 }
37683 }
37684
37685   assert(!LPadList.empty() &&
37686          "No landing pad destinations for the dispatch jump table!");
37687
37688 // Create the MBBs for the dispatch code.
37689
37690 // Shove the dispatch's address into the return slot in the function context.
37691 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37692 DispatchBB->setIsEHPad(true);
37693
37694 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37695 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37696 DispatchBB->addSuccessor(TrapBB);
37697
37698 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37699 DispatchBB->addSuccessor(DispContBB);
37700
37701 // Insert MBBs.
37702 MF->push_back(DispatchBB);
37703 MF->push_back(DispContBB);
37704 MF->push_back(TrapBB);
37705
37706 // Insert code into the entry block that creates and registers the function
37707 // context.
37708 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37709
37710 // Create the jump table and associated information
37711 unsigned JTE = getJumpTableEncoding();
37712 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37713 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37714
37715 const X86RegisterInfo &RI = TII->getRegisterInfo();
37716 // Add a register mask with no preserved registers. This results in all
37717 // registers being marked as clobbered.
37718 if (RI.hasBasePointer(*MF)) {
37719 const bool FPIs64Bit =
37720 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37721 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37722 MFI->setRestoreBasePointer(MF);
37723
37724 Register FP = RI.getFrameRegister(*MF);
37725 Register BP = RI.getBaseRegister();
37726 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37727 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37728 MFI->getRestoreBasePointerOffset())
37729 .addRegMask(RI.getNoPreservedMask());
37730 } else {
37731 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37732 .addRegMask(RI.getNoPreservedMask());
37733 }
37734
37735 // IReg is used as an index in a memory operand and therefore can't be SP
37736 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37737 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37738 Subtarget.is64Bit() ? 8 : 4);
37739 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37740 .addReg(IReg)
37741 .addImm(LPadList.size());
37742 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37743
37744 if (Subtarget.is64Bit()) {
37745 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37746 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37747
37748 // leaq .LJTI0_0(%rip), BReg
37749 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37750 .addReg(X86::RIP)
37751 .addImm(1)
37752 .addReg(0)
37753 .addJumpTableIndex(MJTI)
37754 .addReg(0);
37755 // movzx IReg64, IReg
37756 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37757 .addImm(0)
37758 .addReg(IReg)
37759 .addImm(X86::sub_32bit);
37760
37761 switch (JTE) {
37762 case MachineJumpTableInfo::EK_BlockAddress:
37763 // jmpq *(BReg,IReg64,8)
37764 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37765 .addReg(BReg)
37766 .addImm(8)
37767 .addReg(IReg64)
37768 .addImm(0)
37769 .addReg(0);
37770 break;
37771 case MachineJumpTableInfo::EK_LabelDifference32: {
37772 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37773 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37774 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37775
37776 // movl (BReg,IReg64,4), OReg
37777 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37778 .addReg(BReg)
37779 .addImm(4)
37780 .addReg(IReg64)
37781 .addImm(0)
37782 .addReg(0);
37783 // movsx OReg64, OReg
37784 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37785 // addq BReg, OReg64, TReg
37786 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37787 .addReg(OReg64)
37788 .addReg(BReg);
37789 // jmpq *TReg
37790 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37791 break;
37792 }
37793 default:
37794       llvm_unreachable("Unexpected jump table encoding");
37795 }
37796 } else {
37797 // jmpl *.LJTI0_0(,IReg,4)
37798 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37799 .addReg(0)
37800 .addImm(4)
37801 .addReg(IReg)
37802 .addJumpTableIndex(MJTI)
37803 .addReg(0);
37804 }
37805
37806 // Add the jump table entries as successors to the MBB.
37807 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37808 for (auto &LP : LPadList)
37809 if (SeenMBBs.insert(LP).second)
37810 DispContBB->addSuccessor(LP);
37811
37812 // N.B. the order the invoke BBs are processed in doesn't matter here.
37813 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37814 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37815 for (MachineBasicBlock *MBB : InvokeBBs) {
37816 // Remove the landing pad successor from the invoke block and replace it
37817 // with the new dispatch block.
37818 // Keep a copy of Successors since it's modified inside the loop.
37819 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37820 MBB->succ_rend());
37821 // FIXME: Avoid quadratic complexity.
37822 for (auto *MBBS : Successors) {
37823 if (MBBS->isEHPad()) {
37824 MBB->removeSuccessor(MBBS);
37825 MBBLPads.push_back(MBBS);
37826 }
37827 }
37828
37829 MBB->addSuccessor(DispatchBB);
37830
37831     // Find the invoke call and mark all of the callee-saved registers as
37832     // 'implicitly defined' so that they're spilled. This prevents code from
37833     // moving instructions before the EH block, where they will never be
37834     // executed.
37835 for (auto &II : reverse(*MBB)) {
37836 if (!II.isCall())
37837 continue;
37838
37839 DenseMap<unsigned, bool> DefRegs;
37840 for (auto &MOp : II.operands())
37841 if (MOp.isReg())
37842 DefRegs[MOp.getReg()] = true;
37843
37844 MachineInstrBuilder MIB(*MF, &II);
37845 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37846 unsigned Reg = SavedRegs[RegIdx];
37847 if (!DefRegs[Reg])
37848 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37849 }
37850
37851 break;
37852 }
37853 }
37854
37855 // Mark all former landing pads as non-landing pads. The dispatch is the only
37856 // landing pad now.
37857 for (auto &LP : MBBLPads)
37858 LP->setIsEHPad(false);
37859
37860 // The instruction is gone now.
37861 MI.eraseFromParent();
37862 return BB;
37863}
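// Editor's sketch (illustrative only; 64-bit EK_BlockAddress flavor of the
// dispatch emitted above, with hypothetical label names):
//   movl  8(FuncCtx), %idx          ; call-site index stored by the runtime
//   cmpl  $NumLPads, %idx
//   jae   .Ltrap                    ; out-of-range selector -> ud2
//   leaq  .LJTI0_0(%rip), %base
//   jmpq  *(%base, %idx64, 8)       ; indirect branch to the landing pad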
37864
37865MachineBasicBlock *
37866X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37867 MachineBasicBlock *BB) const {
37868 MachineFunction *MF = BB->getParent();
37869 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37870 const DebugLoc &DL = MI.getDebugLoc();
37871
37872 auto TMMImmToTMMReg = [](unsigned Imm) {
37873     assert (Imm < 8 && "Illegal tmm index");
37874 return X86::TMM0 + Imm;
37875 };
37876 switch (MI.getOpcode()) {
37877   default: llvm_unreachable("Unexpected instr type to insert");
37878 case X86::TLS_addr32:
37879 case X86::TLS_addr64:
37880 case X86::TLS_addrX32:
37881 case X86::TLS_base_addr32:
37882 case X86::TLS_base_addr64:
37883 case X86::TLS_base_addrX32:
37884 return EmitLoweredTLSAddr(MI, BB);
37885 case X86::INDIRECT_THUNK_CALL32:
37886 case X86::INDIRECT_THUNK_CALL64:
37887 case X86::INDIRECT_THUNK_TCRETURN32:
37888 case X86::INDIRECT_THUNK_TCRETURN64:
37889 return EmitLoweredIndirectThunk(MI, BB);
37890 case X86::CATCHRET:
37891 return EmitLoweredCatchRet(MI, BB);
37892 case X86::SEG_ALLOCA_32:
37893 case X86::SEG_ALLOCA_64:
37894 return EmitLoweredSegAlloca(MI, BB);
37895 case X86::PROBED_ALLOCA_32:
37896 case X86::PROBED_ALLOCA_64:
37897 return EmitLoweredProbedAlloca(MI, BB);
37898 case X86::TLSCall_32:
37899 case X86::TLSCall_64:
37900 return EmitLoweredTLSCall(MI, BB);
37901 case X86::CMOV_FR16:
37902 case X86::CMOV_FR16X:
37903 case X86::CMOV_FR32:
37904 case X86::CMOV_FR32X:
37905 case X86::CMOV_FR64:
37906 case X86::CMOV_FR64X:
37907 case X86::CMOV_GR8:
37908 case X86::CMOV_GR16:
37909 case X86::CMOV_GR32:
37910 case X86::CMOV_RFP32:
37911 case X86::CMOV_RFP64:
37912 case X86::CMOV_RFP80:
37913 case X86::CMOV_VR64:
37914 case X86::CMOV_VR128:
37915 case X86::CMOV_VR128X:
37916 case X86::CMOV_VR256:
37917 case X86::CMOV_VR256X:
37918 case X86::CMOV_VR512:
37919 case X86::CMOV_VK1:
37920 case X86::CMOV_VK2:
37921 case X86::CMOV_VK4:
37922 case X86::CMOV_VK8:
37923 case X86::CMOV_VK16:
37924 case X86::CMOV_VK32:
37925 case X86::CMOV_VK64:
37926 return EmitLoweredSelect(MI, BB);
37927
37928 case X86::FP80_ADDr:
37929 case X86::FP80_ADDm32: {
37930 // Change the floating point control register to use double extended
37931 // precision when performing the addition.
37932 int OrigCWFrameIdx =
37933 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37934 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37935 OrigCWFrameIdx);
37936
37937 // Load the old value of the control word...
37938 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37939 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37940 OrigCWFrameIdx);
37941
37942     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double-extended
37943     // precision.
37944 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37945 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37946 .addReg(OldCW, RegState::Kill)
37947 .addImm(0x300);
37948
37949 // Extract to 16 bits.
37950 Register NewCW16 =
37951 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37952 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37953 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37954
37955 // Prepare memory for FLDCW.
37956 int NewCWFrameIdx =
37957 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37958 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37959 NewCWFrameIdx)
37960 .addReg(NewCW16, RegState::Kill);
37961
37962 // Reload the modified control word now...
37963 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37964 NewCWFrameIdx);
37965
37966 // Do the addition.
37967 if (MI.getOpcode() == X86::FP80_ADDr) {
37968 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37969 .add(MI.getOperand(0))
37970 .add(MI.getOperand(1))
37971 .add(MI.getOperand(2));
37972 } else {
37973 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37974 .add(MI.getOperand(0))
37975 .add(MI.getOperand(1))
37976 .add(MI.getOperand(2))
37977 .add(MI.getOperand(3))
37978 .add(MI.getOperand(4))
37979 .add(MI.getOperand(5))
37980 .add(MI.getOperand(6));
37981 }
37982
37983 // Reload the original control word now.
37984 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37985 OrigCWFrameIdx);
37986
37987 MI.eraseFromParent(); // The pseudo instruction is gone now.
37988 return BB;
37989 }
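  // Editor's note (x87 background, not part of the original source): in the
  // FPU control word, bits 8-9 form the precision-control field; OR-ing in
  // 0x300 sets it to 0b11, i.e. 64-bit (double-extended) significand
  // precision, so the 80-bit addition above is not rounded to double.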
37990
37991 case X86::FP32_TO_INT16_IN_MEM:
37992 case X86::FP32_TO_INT32_IN_MEM:
37993 case X86::FP32_TO_INT64_IN_MEM:
37994 case X86::FP64_TO_INT16_IN_MEM:
37995 case X86::FP64_TO_INT32_IN_MEM:
37996 case X86::FP64_TO_INT64_IN_MEM:
37997 case X86::FP80_TO_INT16_IN_MEM:
37998 case X86::FP80_TO_INT32_IN_MEM:
37999 case X86::FP80_TO_INT64_IN_MEM: {
38000 // Change the floating point control register to use "round towards zero"
38001 // mode when truncating to an integer value.
38002 int OrigCWFrameIdx =
38003 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38004 addFrameReference(BuildMI(*BB, MI, DL,
38005 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
38006
38007 // Load the old value of the control word...
38008 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38009 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38010 OrigCWFrameIdx);
38011
38012     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
38013 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38014 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38015 .addReg(OldCW, RegState::Kill).addImm(0xC00);
38016
38017 // Extract to 16 bits.
38018 Register NewCW16 =
38019 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38020 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38021 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38022
38023 // Prepare memory for FLDCW.
38024 int NewCWFrameIdx =
38025 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38026 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38027 NewCWFrameIdx)
38028 .addReg(NewCW16, RegState::Kill);
38029
38030 // Reload the modified control word now...
38031 addFrameReference(BuildMI(*BB, MI, DL,
38032 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38033
38034 // Get the X86 opcode to use.
38035 unsigned Opc;
38036 switch (MI.getOpcode()) {
38037     default: llvm_unreachable("illegal opcode!");
38038 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38039 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38040 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38041 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38042 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38043 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38044 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38045 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38046 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38047 }
38048
38049 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38050 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38051 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38052
38053 // Reload the original control word now.
38054 addFrameReference(BuildMI(*BB, MI, DL,
38055 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38056
38057 MI.eraseFromParent(); // The pseudo instruction is gone now.
38058 return BB;
38059 }
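  // Editor's note (x87 background, not part of the original source): bits
  // 10-11 of the FPU control word form the rounding-control field; OR-ing in
  // 0xC00 sets it to 0b11, "round toward zero", which matches C's truncating
  // float-to-integer conversion semantics for the FIST stores above.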
38060
38061 // xbegin
38062 case X86::XBEGIN:
38063 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38064
38065 case X86::VAARG_64:
38066 case X86::VAARG_X32:
38067 return EmitVAARGWithCustomInserter(MI, BB);
38068
38069 case X86::EH_SjLj_SetJmp32:
38070 case X86::EH_SjLj_SetJmp64:
38071 return emitEHSjLjSetJmp(MI, BB);
38072
38073 case X86::EH_SjLj_LongJmp32:
38074 case X86::EH_SjLj_LongJmp64:
38075 return emitEHSjLjLongJmp(MI, BB);
38076
38077 case X86::Int_eh_sjlj_setup_dispatch:
38078 return EmitSjLjDispatchBlock(MI, BB);
38079
38080 case TargetOpcode::STATEPOINT:
38081 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38082 // this point in the process. We diverge later.
38083 return emitPatchPoint(MI, BB);
38084
38085 case TargetOpcode::STACKMAP:
38086 case TargetOpcode::PATCHPOINT:
38087 return emitPatchPoint(MI, BB);
38088
38089 case TargetOpcode::PATCHABLE_EVENT_CALL:
38090 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38091 return BB;
38092
38093 case X86::LCMPXCHG8B: {
38094 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38095     // In addition to the four E[ABCD] registers implied by the encoding,
38096     // CMPXCHG8B requires a memory operand. If the current architecture is
38097     // i686 and the current function needs a base pointer - which is ESI on
38098     // i686 - the register allocator would not be able to allocate registers
38099     // for an address of the form X(%reg, %reg, Y): there would never be
38100     // enough unreserved registers during regalloc (without the base pointer
38101     // the only option would be X(%edi, %esi, Y)).
38102     // We give the register allocator a hand by precomputing the address in
38103     // a new vreg using LEA.
38104
38105 // If it is not i686 or there is no base pointer - nothing to do here.
38106 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38107 return BB;
38108
38109     // Even though this code does not necessarily need the base pointer to
38110     // be ESI, we check for that. The reason: if this assert fails, something
38111     // has changed in the compiler's base pointer handling, and that change
38112     // most probably has to be addressed here as well.
38113     assert(TRI->getBaseRegister() == X86::ESI &&
38114            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38115            "base pointer in mind");
38116
38117 MachineRegisterInfo &MRI = MF->getRegInfo();
38118 MVT SPTy = getPointerTy(MF->getDataLayout());
38119 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38120 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38121
38122 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38123     // Regalloc does not need any help when the memory operand of CMPXCHG8B
38124     // does not use an index register.
38125 if (AM.IndexReg == X86::NoRegister)
38126 return BB;
38127
38128 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38129 // four operand definitions that are E[ABCD] registers. We skip them and
38130 // then insert the LEA.
38131 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38132 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38133 RMBBI->definesRegister(X86::EBX) ||
38134 RMBBI->definesRegister(X86::ECX) ||
38135 RMBBI->definesRegister(X86::EDX))) {
38136 ++RMBBI;
38137 }
38138 MachineBasicBlock::iterator MBBI(RMBBI);
38139 addFullAddress(
38140 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38141
38142 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38143
38144 return BB;
38145 }
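  // Editor's sketch (illustrative, register names hypothetical): the rewrite
  // performed above turns
  //   lock cmpxchg8b X(%reg1, %reg2, Y)
  // into
  //   leal X(%reg1, %reg2, Y), %vreg
  //   lock cmpxchg8b (%vreg)
  // so that regalloc only has to find one free register for the address.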
38146 case X86::LCMPXCHG16B_NO_RBX: {
38147 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38148 Register BasePtr = TRI->getBaseRegister();
38149 if (TRI->hasBasePointer(*MF) &&
38150 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38151 if (!BB->isLiveIn(BasePtr))
38152 BB->addLiveIn(BasePtr);
38153 // Save RBX into a virtual register.
38154 Register SaveRBX =
38155 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38156 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38157 .addReg(X86::RBX);
38158 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38159 MachineInstrBuilder MIB =
38160 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38161 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38162 MIB.add(MI.getOperand(Idx));
38163 MIB.add(MI.getOperand(X86::AddrNumOperands));
38164 MIB.addReg(SaveRBX);
38165 } else {
38166 // Simple case, just copy the virtual register to RBX.
38167 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38168 .add(MI.getOperand(X86::AddrNumOperands));
38169 MachineInstrBuilder MIB =
38170 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38171 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38172 MIB.add(MI.getOperand(Idx));
38173 }
38174 MI.eraseFromParent();
38175 return BB;
38176 }
38177 case X86::MWAITX: {
38178 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38179 Register BasePtr = TRI->getBaseRegister();
38180 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38181     // If there is no need to save the base pointer, we generate MWAITXrrr;
38182     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38183 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38184 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38185 .addReg(MI.getOperand(0).getReg());
38186 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38187 .addReg(MI.getOperand(1).getReg());
38188 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38189 .addReg(MI.getOperand(2).getReg());
38190 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38191 MI.eraseFromParent();
38192 } else {
38193 if (!BB->isLiveIn(BasePtr)) {
38194 BB->addLiveIn(BasePtr);
38195 }
38196 // Parameters can be copied into ECX and EAX but not EBX yet.
38197 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38198 .addReg(MI.getOperand(0).getReg());
38199 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38200 .addReg(MI.getOperand(1).getReg());
38201       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38202 // Save RBX into a virtual register.
38203 Register SaveRBX =
38204 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38205 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38206 .addReg(X86::RBX);
38207 // Generate mwaitx pseudo.
38208 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38209 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38210 .addDef(Dst) // Destination tied in with SaveRBX.
38211 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38212 .addUse(SaveRBX); // Save of base pointer.
38213 MI.eraseFromParent();
38214 }
38215 return BB;
38216 }
38217 case TargetOpcode::PREALLOCATED_SETUP: {
38218     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38219 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38220 MFI->setHasPreallocatedCall(true);
38221 int64_t PreallocatedId = MI.getOperand(0).getImm();
38222 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38223     assert(StackAdjustment != 0 && "0 stack adjustment");
38224     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38225                       << StackAdjustment << "\n");
38226 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38227 .addReg(X86::ESP)
38228 .addImm(StackAdjustment);
38229 MI.eraseFromParent();
38230 return BB;
38231 }
38232 case TargetOpcode::PREALLOCATED_ARG: {
38233     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38234 int64_t PreallocatedId = MI.getOperand(1).getImm();
38235 int64_t ArgIdx = MI.getOperand(2).getImm();
38236 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38237 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38238     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38239                       << ", arg offset " << ArgOffset << "\n");
38240 // stack pointer + offset
38241 addRegOffset(
38242 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38243 X86::ESP, false, ArgOffset);
38244 MI.eraseFromParent();
38245 return BB;
38246 }
38247 case X86::PTDPBSSD:
38248 case X86::PTDPBSUD:
38249 case X86::PTDPBUSD:
38250 case X86::PTDPBUUD:
38251 case X86::PTDPBF16PS:
38252 case X86::PTDPFP16PS: {
38253 unsigned Opc;
38254 switch (MI.getOpcode()) {
38255     default: llvm_unreachable("illegal opcode!");
38256 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38257 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38258 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38259 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38260 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38261 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38262 }
38263
38264 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38265 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38266 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38267 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38268 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38269
38270 MI.eraseFromParent(); // The pseudo is gone now.
38271 return BB;
38272 }
38273 case X86::PTILEZERO: {
38274 unsigned Imm = MI.getOperand(0).getImm();
38275 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38276 MI.eraseFromParent(); // The pseudo is gone now.
38277 return BB;
38278 }
38279 case X86::PTILELOADD:
38280 case X86::PTILELOADDT1:
38281 case X86::PTILESTORED: {
38282 unsigned Opc;
38283 switch (MI.getOpcode()) {
38284     default: llvm_unreachable("illegal opcode!");
38285 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38286 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38287 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38288 }
38289
38290 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38291 unsigned CurOp = 0;
38292 if (Opc != X86::TILESTORED)
38293 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38294 RegState::Define);
38295
38296 MIB.add(MI.getOperand(CurOp++)); // base
38297 MIB.add(MI.getOperand(CurOp++)); // scale
38298 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38299 MIB.add(MI.getOperand(CurOp++)); // displacement
38300 MIB.add(MI.getOperand(CurOp++)); // segment
38301
38302 if (Opc == X86::TILESTORED)
38303 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38304 RegState::Undef);
38305
38306 MI.eraseFromParent(); // The pseudo is gone now.
38307 return BB;
38308 }
38309 case X86::PTCMMIMFP16PS:
38310 case X86::PTCMMRLFP16PS: {
38311 const DebugLoc &DL = MI.getDebugLoc();
38312 unsigned Opc;
38313 switch (MI.getOpcode()) {
38314     default: llvm_unreachable("Unexpected instruction!");
38315 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38316 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38317 }
38318 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38319 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38320 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38321 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38322 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38323 MI.eraseFromParent(); // The pseudo is gone now.
38324 return BB;
38325 }
38326 }
38327}
38328
38329//===----------------------------------------------------------------------===//
38330// X86 Optimization Hooks
38331//===----------------------------------------------------------------------===//
38332
38333bool
38334X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38335 const APInt &DemandedBits,
38336 const APInt &DemandedElts,
38337 TargetLoweringOpt &TLO) const {
38338 EVT VT = Op.getValueType();
38339 unsigned Opcode = Op.getOpcode();
38340 unsigned EltSize = VT.getScalarSizeInBits();
38341
38342 if (VT.isVector()) {
38343     // If the constant is all sign bits within the active bits, then we
38344     // should sign-extend it to the full element width so that it can act
38345     // as a boolean constant vector.
38346 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38347 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38348 return false;
38349 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38350 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38351 continue;
38352 const APInt &Val = V.getConstantOperandAPInt(i);
38353 if (Val.getBitWidth() > Val.getNumSignBits() &&
38354 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38355 return true;
38356 }
38357 return false;
38358 };
38359 // For vectors - if we have a constant, then try to sign extend.
38360 // TODO: Handle AND/ANDN cases.
38361 unsigned ActiveBits = DemandedBits.getActiveBits();
38362 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38363 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38364 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38365 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38366 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38367 VT.getVectorNumElements());
38368 SDValue NewC =
38369 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38370 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38371 SDValue NewOp =
38372 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38373 return TLO.CombineTo(Op, NewOp);
38374 }
38375 return false;
38376 }
38377
38378 // Only optimize Ands to prevent shrinking a constant that could be
38379 // matched by movzx.
38380 if (Opcode != ISD::AND)
38381 return false;
38382
38383 // Make sure the RHS really is a constant.
38384 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38385 if (!C)
38386 return false;
38387
38388 const APInt &Mask = C->getAPIntValue();
38389
38390 // Clear all non-demanded bits initially.
38391 APInt ShrunkMask = Mask & DemandedBits;
38392
38393 // Find the width of the shrunk mask.
38394 unsigned Width = ShrunkMask.getActiveBits();
38395
38396 // If the mask is all 0s there's nothing to do here.
38397 if (Width == 0)
38398 return false;
38399
38400 // Find the next power of 2 width, rounding up to a byte.
38401 Width = llvm::bit_ceil(std::max(Width, 8U));
38402 // Truncate the width to size to handle illegal types.
38403 Width = std::min(Width, EltSize);
38404
38405 // Calculate a possible zero extend mask for this constant.
38406 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38407
38408 // If we aren't changing the mask, just return true to keep it and prevent
38409 // the caller from optimizing.
38410 if (ZeroExtendMask == Mask)
38411 return true;
38412
38413 // Make sure the new mask can be represented by a combination of mask bits
38414 // and non-demanded bits.
38415 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38416 return false;
38417
38418 // Replace the constant with the zero extend mask.
38419 SDLoc DL(Op);
38420 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38421 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38422 return TLO.CombineTo(Op, NewOp);
38423}
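// Editor's sketch (not part of the original source): one concrete instance of
// the widening above for a 32-bit AND. With Mask = 0x3FFC and
// DemandedBits = 0x0FF0, the shrunk mask 0x0FF0 has 12 active bits, which
// rounds up to a 16-bit width, so the AND constant becomes 0xFFFF -- a mask
// that movzwl can match. The asserts below only restate that arithmetic.
static_assert((0x3FFCu & 0x0FF0u) == 0x0FF0u,
              "shrunk mask keeps exactly the demanded bits in this example");
static_assert((0xFFFFu & ~(0x3FFCu | ~0x0FF0u)) == 0u,
              "0xFFFF only adds bits that are undemanded or already masked");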
38424
38425void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38426 KnownBits &Known,
38427 const APInt &DemandedElts,
38428 const SelectionDAG &DAG,
38429 unsigned Depth) const {
38430 unsigned BitWidth = Known.getBitWidth();
38431 unsigned NumElts = DemandedElts.getBitWidth();
38432 unsigned Opc = Op.getOpcode();
38433 EVT VT = Op.getValueType();
38434   assert((Opc >= ISD::BUILTIN_OP_END ||
38435           Opc == ISD::INTRINSIC_WO_CHAIN ||
38436           Opc == ISD::INTRINSIC_W_CHAIN ||
38437           Opc == ISD::INTRINSIC_VOID) &&
38438          "Should use MaskedValueIsZero if you don't know whether Op"
38439          " is a target node!");
38440
38441 Known.resetAll();
38442 switch (Opc) {
38443 default: break;
38444 case X86ISD::MUL_IMM: {
38445 KnownBits Known2;
38446 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38447 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38448 Known = KnownBits::mul(Known, Known2);
38449 break;
38450 }
38451 case X86ISD::SETCC:
38452 Known.Zero.setBitsFrom(1);
38453 break;
38454 case X86ISD::MOVMSK: {
38455 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38456 Known.Zero.setBitsFrom(NumLoBits);
38457 break;
38458 }
38459 case X86ISD::PEXTRB:
38460 case X86ISD::PEXTRW: {
38461 SDValue Src = Op.getOperand(0);
38462 EVT SrcVT = Src.getValueType();
38463 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38464 Op.getConstantOperandVal(1));
38465 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38466 Known = Known.anyextOrTrunc(BitWidth);
38467 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38468 break;
38469 }
38470 case X86ISD::VSRAI:
38471 case X86ISD::VSHLI:
38472 case X86ISD::VSRLI: {
38473 unsigned ShAmt = Op.getConstantOperandVal(1);
38474 if (ShAmt >= VT.getScalarSizeInBits()) {
38475 // Out of range logical bit shifts are guaranteed to be zero.
38476 // Out of range arithmetic bit shifts splat the sign bit.
38477 if (Opc != X86ISD::VSRAI) {
38478 Known.setAllZero();
38479 break;
38480 }
38481
38482 ShAmt = VT.getScalarSizeInBits() - 1;
38483 }
38484
38485 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38486 if (Opc == X86ISD::VSHLI) {
38487 Known.Zero <<= ShAmt;
38488 Known.One <<= ShAmt;
38489 // Low bits are known zero.
38490 Known.Zero.setLowBits(ShAmt);
38491 } else if (Opc == X86ISD::VSRLI) {
38492 Known.Zero.lshrInPlace(ShAmt);
38493 Known.One.lshrInPlace(ShAmt);
38494 // High bits are known zero.
38495 Known.Zero.setHighBits(ShAmt);
38496 } else {
38497 Known.Zero.ashrInPlace(ShAmt);
38498 Known.One.ashrInPlace(ShAmt);
38499 }
38500 break;
38501 }
38502 case X86ISD::PACKUS: {
38503 // PACKUS is just a truncation if the upper half is zero.
38504 APInt DemandedLHS, DemandedRHS;
38505 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38506
38507 Known.One = APInt::getAllOnes(BitWidth * 2);
38508 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38509
38510 KnownBits Known2;
38511 if (!!DemandedLHS) {
38512 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38513 Known = KnownBits::commonBits(Known, Known2);
38514 }
38515 if (!!DemandedRHS) {
38516 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38517 Known = KnownBits::commonBits(Known, Known2);
38518 }
38519
38520 if (Known.countMinLeadingZeros() < BitWidth)
38521 Known.resetAll();
38522 Known = Known.trunc(BitWidth);
38523 break;
38524 }
38525 case X86ISD::VBROADCAST: {
38526 SDValue Src = Op.getOperand(0);
38527 if (!Src.getSimpleValueType().isVector()) {
38528 Known = DAG.computeKnownBits(Src, Depth + 1);
38529 return;
38530 }
38531 break;
38532 }
38533 case X86ISD::AND: {
38534 if (Op.getResNo() == 0) {
38535 KnownBits Known2;
38536 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38537 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38538 Known &= Known2;
38539 }
38540 break;
38541 }
38542 case X86ISD::ANDNP: {
38543 KnownBits Known2;
38544 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38545 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38546
38547 // ANDNP = (~X & Y);
38548 Known.One &= Known2.Zero;
38549 Known.Zero |= Known2.One;
38550 break;
38551 }
38552 case X86ISD::FOR: {
38553 KnownBits Known2;
38554 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38555 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38556
38557 Known |= Known2;
38558 break;
38559 }
38560 case X86ISD::PSADBW: {
38561 assert(VT.getScalarType() == MVT::i64 &&
38562 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38563 "Unexpected PSADBW types");
38564
38565 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
38566 Known.Zero.setBitsFrom(16);
38567 break;
38568 }
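// [Editor's note] Illustrative sketch, not part of the analyzed file: the
// "low 16 bits" bound above follows because each i64 PSADBW lane holds the
// sum of eight absolute differences of unsigned bytes, so it can never
// exceed 8 * 255 = 2040.
static_assert(8 * 255 == 2040, "maximum value of one PSADBW i64 lane");
static_assert(2040 < (1 << 16), "so bits 16..63 of each lane are always zero");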
38569 case X86ISD::PMULUDQ: {
38570 KnownBits Known2;
38571 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38572 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38573
38574 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38575 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38576 Known = KnownBits::mul(Known, Known2);
38577 break;
38578 }
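// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. PMULUDQ multiplies only the low 32 bits of each
// 64-bit lane, zero-extended, which is what the trunc(BitWidth / 2) and
// zext(BitWidth) steps above model.
static unsigned long long pmuludqLane(unsigned long long A,
                                      unsigned long long B) {
  // Each factor is at most 2^32 - 1, so the 64-bit product cannot overflow.
  return (A & 0xffffffffULL) * (B & 0xffffffffULL);
}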
38579 case X86ISD::CMOV: {
38580 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38581 // If we don't know any bits, early out.
38582 if (Known.isUnknown())
38583 break;
38584 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38585
38586 // Only known if known in both the LHS and RHS.
38587 Known = KnownBits::commonBits(Known, Known2);
38588 break;
38589 }
38590 case X86ISD::BEXTR:
38591 case X86ISD::BEXTRI: {
38592 SDValue Op0 = Op.getOperand(0);
38593 SDValue Op1 = Op.getOperand(1);
38594
38595 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38596 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38597 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38598
38599 // If the length is 0, the result is 0.
38600 if (Length == 0) {
38601 Known.setAllZero();
38602 break;
38603 }
38604
38605 if ((Shift + Length) <= BitWidth) {
38606 Known = DAG.computeKnownBits(Op0, Depth + 1);
38607 Known = Known.extractBits(Length, Shift);
38608 Known = Known.zextOrTrunc(BitWidth);
38609 }
38610 }
38611 break;
38612 }
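// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. It shows the BEXTR control layout the code above
// decodes: bits [7:0] hold the start (shift) and bits [15:8] the length, and
// a zero length always produces a zero result.
static unsigned long long bextr64(unsigned long long Src,
                                  unsigned long long Control) {
  unsigned Shift = Control & 0xff;
  unsigned Length = (Control >> 8) & 0xff;
  if (Length == 0 || Shift >= 64)
    return 0; // nothing selected, or the field starts past the operand
  unsigned long long Field = Src >> Shift;
  return Length >= 64 ? Field : Field & ((1ULL << Length) - 1);
}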
38613 case X86ISD::PDEP: {
38614 KnownBits Known2;
38615 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38616 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38617 // Zeros are retained from the mask operand. But not ones.
38618 Known.One.clearAllBits();
38619 // The result will have at least as many trailing zeros as the non-mask
38620 // operand since bits can only map to the same or higher bit position.
38621 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38622 break;
38623 }
38624 case X86ISD::PEXT: {
38625 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38626 // The result has as many leading zeros as the number of zeroes in the mask.
38627 unsigned Count = Known.Zero.popcount();
38628 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38629 Known.One.clearAllBits();
38630 break;
38631 }
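// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. PEXT packs the mask-selected bits of Src into the
// low-order result bits, so at most popcount(Mask) bits can ever be set and
// everything above them is zero; that is the bound the code above derives
// from the mask operand's known-zero bits.
static unsigned long long pext64(unsigned long long Src,
                                 unsigned long long Mask) {
  unsigned long long Result = 0;
  for (unsigned DstBit = 0; Mask != 0; Mask &= Mask - 1, ++DstBit) {
    unsigned long long LowestMaskBit = Mask & -Mask;
    if (Src & LowestMaskBit)
      Result |= 1ULL << DstBit;
  }
  return Result;
}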
38632 case X86ISD::VTRUNC:
38633 case X86ISD::VTRUNCS:
38634 case X86ISD::VTRUNCUS:
38635 case X86ISD::CVTSI2P:
38636 case X86ISD::CVTUI2P:
38637 case X86ISD::CVTP2SI:
38638 case X86ISD::CVTP2UI:
38639 case X86ISD::MCVTP2SI:
38640 case X86ISD::MCVTP2UI:
38641 case X86ISD::CVTTP2SI:
38642 case X86ISD::CVTTP2UI:
38643 case X86ISD::MCVTTP2SI:
38644 case X86ISD::MCVTTP2UI:
38645 case X86ISD::MCVTSI2P:
38646 case X86ISD::MCVTUI2P:
38647 case X86ISD::VFPROUND:
38648 case X86ISD::VMFPROUND:
38649 case X86ISD::CVTPS2PH:
38650 case X86ISD::MCVTPS2PH: {
38651 // Truncations/Conversions - upper elements are known zero.
38652 EVT SrcVT = Op.getOperand(0).getValueType();
38653 if (SrcVT.isVector()) {
38654 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38655 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38656 Known.setAllZero();
38657 }
38658 break;
38659 }
38660 case X86ISD::STRICT_CVTTP2SI:
38661 case X86ISD::STRICT_CVTTP2UI:
38662 case X86ISD::STRICT_CVTSI2P:
38663 case X86ISD::STRICT_CVTUI2P:
38664 case X86ISD::STRICT_VFPROUND:
38665 case X86ISD::STRICT_CVTPS2PH: {
38666 // Strict Conversions - upper elements are known zero.
38667 EVT SrcVT = Op.getOperand(1).getValueType();
38668 if (SrcVT.isVector()) {
38669 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38670 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38671 Known.setAllZero();
38672 }
38673 break;
38674 }
38675 case X86ISD::MOVQ2DQ: {
38676 // Move from MMX to XMM. Upper half of XMM should be 0.
38677 if (DemandedElts.countr_zero() >= (NumElts / 2))
38678 Known.setAllZero();
38679 break;
38680 }
38681 case X86ISD::VBROADCAST_LOAD: {
38682 APInt UndefElts;
38683 SmallVector<APInt, 16> EltBits;
38684 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38685 /*AllowWholeUndefs*/ false,
38686 /*AllowPartialUndefs*/ false)) {
38687 Known.Zero.setAllBits();
38688 Known.One.setAllBits();
38689 for (unsigned I = 0; I != NumElts; ++I) {
38690 if (!DemandedElts[I])
38691 continue;
38692 if (UndefElts[I]) {
38693 Known.resetAll();
38694 break;
38695 }
38696 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38697 Known = KnownBits::commonBits(Known, Known2);
38698 }
38699 return;
38700 }
38701 break;
38702 }
38703 }
38704
38705 // Handle target shuffles.
38706 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38707 if (isTargetShuffle(Opc)) {
38708 SmallVector<int, 64> Mask;
38709 SmallVector<SDValue, 2> Ops;
38710 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38711 unsigned NumOps = Ops.size();
38712 unsigned NumElts = VT.getVectorNumElements();
38713 if (Mask.size() == NumElts) {
38714 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38715 Known.Zero.setAllBits(); Known.One.setAllBits();
38716 for (unsigned i = 0; i != NumElts; ++i) {
38717 if (!DemandedElts[i])
38718 continue;
38719 int M = Mask[i];
38720 if (M == SM_SentinelUndef) {
38721 // For UNDEF elements, we don't know anything about the common state
38722 // of the shuffle result.
38723 Known.resetAll();
38724 break;
38725 }
38726 if (M == SM_SentinelZero) {
38727 Known.One.clearAllBits();
38728 continue;
38729 }
38730 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38731 "Shuffle index out of range");
38732
38733 unsigned OpIdx = (unsigned)M / NumElts;
38734 unsigned EltIdx = (unsigned)M % NumElts;
38735 if (Ops[OpIdx].getValueType() != VT) {
38736 // TODO - handle target shuffle ops with different value types.
38737 Known.resetAll();
38738 break;
38739 }
38740 DemandedOps[OpIdx].setBit(EltIdx);
38741 }
38742 // Known bits are the values that are shared by every demanded element.
38743 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38744 if (!DemandedOps[i])
38745 continue;
38746 KnownBits Known2 =
38747 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38748 Known = KnownBits::commonBits(Known, Known2);
38749 }
38750 }
38751 }
38752 }
38753}
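// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// struct and helper names are invented. The target-shuffle handling above
// keeps a bit "known" only when every contributing operand agrees on it,
// which is what KnownBits::commonBits computes: a per-bit intersection.
struct KnownBits64 {
  unsigned long long Zero = 0; // bits proven to be 0
  unsigned long long One = 0;  // bits proven to be 1
};
static KnownBits64 commonBits64(KnownBits64 A, KnownBits64 B) {
  return {A.Zero & B.Zero, A.One & B.One};
}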
38754
38755unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38756 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38757 unsigned Depth) const {
38758 EVT VT = Op.getValueType();
38759 unsigned VTBits = VT.getScalarSizeInBits();
38760 unsigned Opcode = Op.getOpcode();
38761 switch (Opcode) {
38762 case X86ISD::SETCC_CARRY:
38763 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38764 return VTBits;
38765
38766 case X86ISD::VTRUNC: {
38767 SDValue Src = Op.getOperand(0);
38768 MVT SrcVT = Src.getSimpleValueType();
38769 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38770 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38771 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38772 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38773 if (Tmp > (NumSrcBits - VTBits))
38774 return Tmp - (NumSrcBits - VTBits);
38775 return 1;
38776 }
38777
38778 case X86ISD::PACKSS: {
38779 // PACKSS is just a truncation if the sign bits extend to the packed size.
38780 APInt DemandedLHS, DemandedRHS;
38781 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38782 DemandedRHS);
38783
38784 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38785 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38786 if (!!DemandedLHS)
38787 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38788 if (!!DemandedRHS)
38789 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38790 unsigned Tmp = std::min(Tmp0, Tmp1);
38791 if (Tmp > (SrcBits - VTBits))
38792 return Tmp - (SrcBits - VTBits);
38793 return 1;
38794 }
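// [Editor's note] Illustrative sketch, not part of the analyzed file: a
// worked instance of the arithmetic above. Packing 32-bit source elements
// that have at least 20 known sign bits down to 16-bit results leaves
// 20 - (32 - 16) = 4 known sign bits in each packed element.
static_assert(20 - (32 - 16) == 4, "sign bits surviving the PACKSS pack");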
38795
38796 case X86ISD::VBROADCAST: {
38797 SDValue Src = Op.getOperand(0);
38798 if (!Src.getSimpleValueType().isVector())
38799 return DAG.ComputeNumSignBits(Src, Depth + 1);
38800 break;
38801 }
38802
38803 case X86ISD::VSHLI: {
38804 SDValue Src = Op.getOperand(0);
38805 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38806 if (ShiftVal.uge(VTBits))
38807 return VTBits; // Shifted all bits out --> zero.
38808 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38809 if (ShiftVal.uge(Tmp))
38810 return 1; // Shifted all sign bits out --> unknown.
38811 return Tmp - ShiftVal.getZExtValue();
38812 }
38813
38814 case X86ISD::VSRAI: {
38815 SDValue Src = Op.getOperand(0);
38816 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38817 if (ShiftVal.uge(VTBits - 1))
38818 return VTBits; // Sign splat.
38819 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38820 ShiftVal += Tmp;
38821 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38822 }
38823
38824 case X86ISD::FSETCC:
38825 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38826 if (VT == MVT::f32 || VT == MVT::f64 ||
38827 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38828 return VTBits;
38829 break;
38830
38831 case X86ISD::PCMPGT:
38832 case X86ISD::PCMPEQ:
38833 case X86ISD::CMPP:
38834 case X86ISD::VPCOM:
38835 case X86ISD::VPCOMU:
38836 // Vector compares return zero/all-bits result values.
38837 return VTBits;
38838
38839 case X86ISD::ANDNP: {
38840 unsigned Tmp0 =
38841 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38842 if (Tmp0 == 1) return 1; // Early out.
38843 unsigned Tmp1 =
38844 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38845 return std::min(Tmp0, Tmp1);
38846 }
38847
38848 case X86ISD::CMOV: {
38849 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38850 if (Tmp0 == 1) return 1; // Early out.
38851 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38852 return std::min(Tmp0, Tmp1);
38853 }
38854 }
38855
38856 // Handle target shuffles.
38857 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38858 if (isTargetShuffle(Opcode)) {
38859 SmallVector<int, 64> Mask;
38860 SmallVector<SDValue, 2> Ops;
38861 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38862 unsigned NumOps = Ops.size();
38863 unsigned NumElts = VT.getVectorNumElements();
38864 if (Mask.size() == NumElts) {
38865 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38866 for (unsigned i = 0; i != NumElts; ++i) {
38867 if (!DemandedElts[i])
38868 continue;
38869 int M = Mask[i];
38870 if (M == SM_SentinelUndef) {
38871 // For UNDEF elements, we don't know anything about the common state
38872 // of the shuffle result.
38873 return 1;
38874 } else if (M == SM_SentinelZero) {
38875 // Zero = all sign bits.
38876 continue;
38877 }
38878 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38879 "Shuffle index out of range");
38880
38881 unsigned OpIdx = (unsigned)M / NumElts;
38882 unsigned EltIdx = (unsigned)M % NumElts;
38883 if (Ops[OpIdx].getValueType() != VT) {
38884 // TODO - handle target shuffle ops with different value types.
38885 return 1;
38886 }
38887 DemandedOps[OpIdx].setBit(EltIdx);
38888 }
38889 unsigned Tmp0 = VTBits;
38890 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38891 if (!DemandedOps[i])
38892 continue;
38893 unsigned Tmp1 =
38894 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38895 Tmp0 = std::min(Tmp0, Tmp1);
38896 }
38897 return Tmp0;
38898 }
38899 }
38900 }
38901
38902 // Fallback case.
38903 return 1;
38904}
38905
38906SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38907 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38908 return N->getOperand(0);
38909 return N;
38910}
38911
38912// Helper to look for a normal load that can be narrowed into a vzload with the
38913// specified VT and memory VT. Returns SDValue() on failure.
38914static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38915 SelectionDAG &DAG) {
38916 // Can't if the load is volatile or atomic.
38917 if (!LN->isSimple())
38918 return SDValue();
38919
38920 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38921 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38922 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38923 LN->getPointerInfo(), LN->getOriginalAlign(),
38924 LN->getMemOperand()->getFlags());
38925}
38926
38927// Attempt to match a combined shuffle mask against supported unary shuffle
38928// instructions.
38929// TODO: Investigate sharing more of this with shuffle lowering.
38930static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38931 bool AllowFloatDomain, bool AllowIntDomain,
38932 SDValue V1, const SelectionDAG &DAG,
38933 const X86Subtarget &Subtarget, unsigned &Shuffle,
38934 MVT &SrcVT, MVT &DstVT) {
38935 unsigned NumMaskElts = Mask.size();
38936 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38937
38938 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38939 if (Mask[0] == 0 &&
38940 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38941 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38942 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38943 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38944 Shuffle = X86ISD::VZEXT_MOVL;
38945 if (MaskEltSize == 16)
38946 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38947 else
38948 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38949 return true;
38950 }
38951 }
38952
38953 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38954 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38955 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38956 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38957 unsigned MaxScale = 64 / MaskEltSize;
38958 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38959 bool MatchAny = true;
38960 bool MatchZero = true;
38961 unsigned NumDstElts = NumMaskElts / Scale;
38962 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38963 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38964 MatchAny = MatchZero = false;
38965 break;
38966 }
38967 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38968 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38969 }
38970 if (MatchAny || MatchZero) {
38971 assert(MatchZero && "Failed to match zext but matched aext?");
38972 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38973 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38974 MVT::getIntegerVT(MaskEltSize);
38975 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38976
38977 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38978 if (SrcVT.getVectorNumElements() != NumDstElts)
38979 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38980
38981 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38982 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38983 return true;
38984 }
38985 }
38986 }
38987
38988 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
38989 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38990 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38991 isUndefOrEqual(Mask[0], 0) &&
38992 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38993 Shuffle = X86ISD::VZEXT_MOVL;
38994 if (MaskEltSize == 16)
38995 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38996 else
38997 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38998 return true;
38999 }
39000
39001 // Check if we have SSE3, which will let us use MOVDDUP etc. These
39002 // instructions are no slower than UNPCKLPD but have the option to
39003 // fold the input operand into even an unaligned memory load.
39004 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39005 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39006 Shuffle = X86ISD::MOVDDUP;
39007 SrcVT = DstVT = MVT::v2f64;
39008 return true;
39009 }
39010 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39011 Shuffle = X86ISD::MOVSLDUP;
39012 SrcVT = DstVT = MVT::v4f32;
39013 return true;
39014 }
39015 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39016 Shuffle = X86ISD::MOVSHDUP;
39017 SrcVT = DstVT = MVT::v4f32;
39018 return true;
39019 }
39020 }
39021
39022 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39023 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39024 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39025 Shuffle = X86ISD::MOVDDUP;
39026 SrcVT = DstVT = MVT::v4f64;
39027 return true;
39028 }
39029 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39030 V1)) {
39031 Shuffle = X86ISD::MOVSLDUP;
39032 SrcVT = DstVT = MVT::v8f32;
39033 return true;
39034 }
39035 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39036 V1)) {
39037 Shuffle = X86ISD::MOVSHDUP;
39038 SrcVT = DstVT = MVT::v8f32;
39039 return true;
39040 }
39041 }
39042
39043 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39044 assert(Subtarget.hasAVX512() &&
39045 "AVX512 required for 512-bit vector shuffles");
39046 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39047 V1)) {
39048 Shuffle = X86ISD::MOVDDUP;
39049 SrcVT = DstVT = MVT::v8f64;
39050 return true;
39051 }
39052 if (isTargetShuffleEquivalent(
39053 MaskVT, Mask,
39054 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39055 Shuffle = X86ISD::MOVSLDUP;
39056 SrcVT = DstVT = MVT::v16f32;
39057 return true;
39058 }
39059 if (isTargetShuffleEquivalent(
39060 MaskVT, Mask,
39061 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39062 Shuffle = X86ISD::MOVSHDUP;
39063 SrcVT = DstVT = MVT::v16f32;
39064 return true;
39065 }
39066 }
39067
39068 return false;
39069}
39070
39071// Attempt to match a combined shuffle mask against supported unary immediate
39072// permute instructions.
39073// TODO: Investigate sharing more of this with shuffle lowering.
39074static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39075 const APInt &Zeroable,
39076 bool AllowFloatDomain, bool AllowIntDomain,
39077 const SelectionDAG &DAG,
39078 const X86Subtarget &Subtarget,
39079 unsigned &Shuffle, MVT &ShuffleVT,
39080 unsigned &PermuteImm) {
39081 unsigned NumMaskElts = Mask.size();
39082 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39083 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39084 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39085 bool ContainsZeros = isAnyZero(Mask);
39086
39087 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39088 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39089 // Check for lane crossing permutes.
39090 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39091 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39092 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39093 Shuffle = X86ISD::VPERMI;
39094 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39095 PermuteImm = getV4X86ShuffleImm(Mask);
39096 return true;
39097 }
39098 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39099 SmallVector<int, 4> RepeatedMask;
39100 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39101 Shuffle = X86ISD::VPERMI;
39102 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39103 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39104 return true;
39105 }
39106 }
39107 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39108 // VPERMILPD can permute with a non-repeating shuffle.
39109 Shuffle = X86ISD::VPERMILPI;
39110 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39111 PermuteImm = 0;
39112 for (int i = 0, e = Mask.size(); i != e; ++i) {
39113 int M = Mask[i];
39114 if (M == SM_SentinelUndef)
39115 continue;
39116 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39117 PermuteImm |= (M & 1) << i;
39118 }
39119 return true;
39120 }
39121 }
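// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. It mirrors how the loop above packs the VPERMILPD
// immediate: element i contributes the low bit of its mask index at bit i,
// so the mask {1, 0, 3, 2} encodes to 0b0101 = 0x5.
static unsigned encodeVPermilPdImm(const int *Mask, unsigned NumElts) {
  unsigned Imm = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Mask[I] >= 0)            // undef (negative) entries contribute nothing
      Imm |= (Mask[I] & 1u) << I;
  return Imm;
}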
39122
39123 // We are checking for shuffle match or shift match. Loop twice so we can
39124 // order which we try to match first depending on target preference.
39125 for (unsigned Order = 0; Order < 2; ++Order) {
39126 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39127 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39128 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39129 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39130 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39131 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39132 SmallVector<int, 4> RepeatedMask;
39133 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39134 // Narrow the repeated mask to create 32-bit element permutes.
39135 SmallVector<int, 4> WordMask = RepeatedMask;
39136 if (MaskScalarSizeInBits == 64)
39137 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39138
39139 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39140 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39141 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39142 PermuteImm = getV4X86ShuffleImm(WordMask);
39143 return true;
39144 }
39145 }
39146
39147 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39148 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39149 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39150 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39151 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39152 SmallVector<int, 4> RepeatedMask;
39153 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39154 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39155 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39156
39157 // PSHUFLW: permute lower 4 elements only.
39158 if (isUndefOrInRange(LoMask, 0, 4) &&
39159 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39160 Shuffle = X86ISD::PSHUFLW;
39161 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39162 PermuteImm = getV4X86ShuffleImm(LoMask);
39163 return true;
39164 }
39165
39166 // PSHUFHW: permute upper 4 elements only.
39167 if (isUndefOrInRange(HiMask, 4, 8) &&
39168 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39169 // Offset the HiMask so that we can create the shuffle immediate.
39170 int OffsetHiMask[4];
39171 for (int i = 0; i != 4; ++i)
39172 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39173
39174 Shuffle = X86ISD::PSHUFHW;
39175 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39176 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39177 return true;
39178 }
39179 }
39180 }
39181 } else {
39182 // Attempt to match against bit rotates.
39183 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39184 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39185 Subtarget.hasAVX512())) {
39186 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39187 Subtarget, Mask);
39188 if (0 < RotateAmt) {
39189 Shuffle = X86ISD::VROTLI;
39190 PermuteImm = (unsigned)RotateAmt;
39191 return true;
39192 }
39193 }
39194 }
39195 // Attempt to match against byte/bit shifts.
39196 if (AllowIntDomain &&
39197 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39198 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39199 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39200 int ShiftAmt =
39201 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39202 Zeroable, Subtarget);
39203 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39204 32 <= ShuffleVT.getScalarSizeInBits())) {
39205 // Byte shifts can be slower so only match them on second attempt.
39206 if (Order == 0 &&
39207 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39208 continue;
39209
39210 PermuteImm = (unsigned)ShiftAmt;
39211 return true;
39212 }
39213
39214 }
39215 }
39216
39217 return false;
39218}
39219
39220// Attempt to match a combined unary shuffle mask against supported binary
39221// shuffle instructions.
39222// TODO: Investigate sharing more of this with shuffle lowering.
39223static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39224 bool AllowFloatDomain, bool AllowIntDomain,
39225 SDValue &V1, SDValue &V2, const SDLoc &DL,
39226 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39227 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39228 bool IsUnary) {
39229 unsigned NumMaskElts = Mask.size();
39230 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39231 unsigned SizeInBits = MaskVT.getSizeInBits();
39232
39233 if (MaskVT.is128BitVector()) {
39234 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39235 AllowFloatDomain) {
39236 V2 = V1;
39237 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39238 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39239 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39240 return true;
39241 }
39242 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39243 AllowFloatDomain) {
39244 V2 = V1;
39245 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39246 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39247 return true;
39248 }
39249 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39250 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39251 std::swap(V1, V2);
39252 Shuffle = X86ISD::MOVSD;
39253 SrcVT = DstVT = MVT::v2f64;
39254 return true;
39255 }
39256 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39257 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39258 Shuffle = X86ISD::MOVSS;
39259 SrcVT = DstVT = MVT::v4f32;
39260 return true;
39261 }
39262 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39263 DAG) &&
39264 Subtarget.hasFP16()) {
39265 Shuffle = X86ISD::MOVSH;
39266 SrcVT = DstVT = MVT::v8f16;
39267 return true;
39268 }
39269 }
39270
39271 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39272 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39273 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39274 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39275 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39276 Subtarget)) {
39277 DstVT = MaskVT;
39278 return true;
39279 }
39280 }
39281
39282 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39283 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39284 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39285 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39286 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39287 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39288 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39289 Subtarget)) {
39290 SrcVT = DstVT = MaskVT;
39291 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39292 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39293 return true;
39294 }
39295 }
39296
39297 // Attempt to match against an OR if we're performing a blend shuffle and the
39298 // non-blended source element is zero in each case.
39299 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
39300 if (SizeInBits == V1.getValueSizeInBits() &&
39301 SizeInBits == V2.getValueSizeInBits() &&
39302 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39303 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39304 bool IsBlend = true;
39305 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39306 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39307 unsigned Scale1 = NumV1Elts / NumMaskElts;
39308 unsigned Scale2 = NumV2Elts / NumMaskElts;
39309 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39310 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39311 for (unsigned i = 0; i != NumMaskElts; ++i) {
39312 int M = Mask[i];
39313 if (M == SM_SentinelUndef)
39314 continue;
39315 if (M == SM_SentinelZero) {
39316 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39317 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39318 continue;
39319 }
39320 if (M == (int)i) {
39321 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39322 continue;
39323 }
39324 if (M == (int)(i + NumMaskElts)) {
39325 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39326 continue;
39327 }
39328 IsBlend = false;
39329 break;
39330 }
39331 if (IsBlend) {
39332 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39333 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39334 Shuffle = ISD::OR;
39335 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39336 return true;
39337 }
39338 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39339 // FIXME: handle mismatched sizes?
39340 // TODO: investigate if `ISD::OR` handling in
39341 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39342 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39343 unsigned NumElts = V.getValueType().getVectorNumElements();
39344 KnownBits Known(NumElts);
39345 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39346 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39347 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39348 if (PeepholeKnown.isZero())
39349 Known.Zero.setBit(EltIdx);
39350 if (PeepholeKnown.isAllOnes())
39351 Known.One.setBit(EltIdx);
39352 }
39353 return Known;
39354 };
39355
39356 KnownBits V1Known = computeKnownBitsElementWise(V1);
39357 KnownBits V2Known = computeKnownBitsElementWise(V2);
39358
39359 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39360 int M = Mask[i];
39361 if (M == SM_SentinelUndef)
39362 continue;
39363 if (M == SM_SentinelZero) {
39364 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39365 continue;
39366 }
39367 if (M == (int)i) {
39368 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39369 continue;
39370 }
39371 if (M == (int)(i + NumMaskElts)) {
39372 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39373 continue;
39374 }
39375 llvm_unreachable("will not get here.");
39376 }
39377 if (IsBlend) {
39378 Shuffle = ISD::OR;
39379 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39380 return true;
39381 }
39382 }
39383 }
39384 }
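// [Editor's note] Illustrative sketch, not part of the analyzed file: the
// blend-to-OR rewrite above relies on OR with a lane that is known zero
// being the identity, element by element.
static_assert((0x1234u | 0x0u) == 0x1234u,
              "lane taken from V1 while V2 is zero there");
static_assert((0x0u | 0xabcdu) == 0xabcdu,
              "lane taken from V2 while V1 is zero there");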
39385
39386 return false;
39387}
39388
39389static bool matchBinaryPermuteShuffle(
39390 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39391 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39392 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39393 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39394 unsigned NumMaskElts = Mask.size();
39395 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39396
39397 // Attempt to match against VALIGND/VALIGNQ rotate.
39398 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39399 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39400 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39401 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39402 if (!isAnyZero(Mask)) {
39403 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39404 if (0 < Rotation) {
39405 Shuffle = X86ISD::VALIGN;
39406 if (EltSizeInBits == 64)
39407 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39408 else
39409 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39410 PermuteImm = Rotation;
39411 return true;
39412 }
39413 }
39414 }
39415
39416 // Attempt to match against PALIGNR byte rotate.
39417 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39418 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39419 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39420 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39421 if (0 < ByteRotation) {
39422 Shuffle = X86ISD::PALIGNR;
39423 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39424 PermuteImm = ByteRotation;
39425 return true;
39426 }
39427 }
39428
39429 // Attempt to combine to X86ISD::BLENDI.
39430 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39431 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39432 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39433 uint64_t BlendMask = 0;
39434 bool ForceV1Zero = false, ForceV2Zero = false;
39435 SmallVector<int, 8> TargetMask(Mask);
39436 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39437 ForceV2Zero, BlendMask)) {
39438 if (MaskVT == MVT::v16i16) {
39439 // We can only use v16i16 PBLENDW if the lanes are repeated.
39440 SmallVector<int, 8> RepeatedMask;
39441 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39442 RepeatedMask)) {
39443 assert(RepeatedMask.size() == 8 &&
39444 "Repeated mask size doesn't match!");
39445 PermuteImm = 0;
39446 for (int i = 0; i < 8; ++i)
39447 if (RepeatedMask[i] >= 8)
39448 PermuteImm |= 1 << i;
39449 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39450 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39451 Shuffle = X86ISD::BLENDI;
39452 ShuffleVT = MaskVT;
39453 return true;
39454 }
39455 } else {
39456 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39457 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39458 PermuteImm = (unsigned)BlendMask;
39459 Shuffle = X86ISD::BLENDI;
39460 ShuffleVT = MaskVT;
39461 return true;
39462 }
39463 }
39464 }
39465
39466 // Attempt to combine to INSERTPS, but only if it has elements that need to
39467 // be set to zero.
39468 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39469 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39470 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39471 Shuffle = X86ISD::INSERTPS;
39472 ShuffleVT = MVT::v4f32;
39473 return true;
39474 }
39475
39476 // Attempt to combine to SHUFPD.
39477 if (AllowFloatDomain && EltSizeInBits == 64 &&
39478 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39479 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39480 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39481 bool ForceV1Zero = false, ForceV2Zero = false;
39482 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39483 PermuteImm, Mask, Zeroable)) {
39484 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39485 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39486 Shuffle = X86ISD::SHUFP;
39487 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39488 return true;
39489 }
39490 }
39491
39492 // Attempt to combine to SHUFPS.
39493 if (AllowFloatDomain && EltSizeInBits == 32 &&
39494 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39495 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39496 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39497 SmallVector<int, 4> RepeatedMask;
39498 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39499 // Match each half of the repeated mask to determine if it's just
39500 // referencing one of the vectors, is zeroable, or is entirely undef.
39501 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39502 int M0 = RepeatedMask[Offset];
39503 int M1 = RepeatedMask[Offset + 1];
39504
39505 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39506 return DAG.getUNDEF(MaskVT);
39507 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39508 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39509 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39510 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39511 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39512 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39513 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39514 return V1;
39515 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39516 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39517 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39518 return V2;
39519 }
39520
39521 return SDValue();
39522 };
39523
39524 int ShufMask[4] = {-1, -1, -1, -1};
39525 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39526 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39527
39528 if (Lo && Hi) {
39529 V1 = Lo;
39530 V2 = Hi;
39531 Shuffle = X86ISD::SHUFP;
39532 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39533 PermuteImm = getV4X86ShuffleImm(ShufMask);
39534 return true;
39535 }
39536 }
39537 }
39538
39539 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39540 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39541 MaskVT.is128BitVector() &&
39542 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39543 Shuffle = X86ISD::INSERTPS;
39544 ShuffleVT = MVT::v4f32;
39545 return true;
39546 }
39547
39548 return false;
39549}
39550
39551static SDValue combineX86ShuffleChainWithExtract(
39552 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39553 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39554 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39555 const X86Subtarget &Subtarget);
39556
39557/// Combine an arbitrary chain of shuffles into a single instruction if
39558/// possible.
39559///
39560/// This is the leaf of the recursive combine below. When we have found some
39561/// chain of single-use x86 shuffle instructions and accumulated the combined
39562/// shuffle mask represented by them, this will try to pattern match that mask
39563/// into either a single instruction if there is a special purpose instruction
39564/// for this operation, or into a PSHUFB instruction which is a fully general
39565/// instruction but should only be used to replace chains over a certain depth.
39566static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39567 ArrayRef<int> BaseMask, int Depth,
39568 bool HasVariableMask,
39569 bool AllowVariableCrossLaneMask,
39570 bool AllowVariablePerLaneMask,
39571 SelectionDAG &DAG,
39572 const X86Subtarget &Subtarget) {
39573 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39574 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39575 "Unexpected number of shuffle inputs!");
39576
39577 SDLoc DL(Root);
39578 MVT RootVT = Root.getSimpleValueType();
39579 unsigned RootSizeInBits = RootVT.getSizeInBits();
39580 unsigned NumRootElts = RootVT.getVectorNumElements();
39581
39582 // Canonicalize shuffle input op to the requested type.
39583 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39584 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39585 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39586 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39587 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39588 return DAG.getBitcast(VT, Op);
39589 };
39590
39591 // Find the inputs that enter the chain. Note that multiple uses are OK
39592 // here; we're not going to remove the operands we find.
39593 bool UnaryShuffle = (Inputs.size() == 1);
39594 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39595 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39596 : peekThroughBitcasts(Inputs[1]));
39597
39598 MVT VT1 = V1.getSimpleValueType();
39599 MVT VT2 = V2.getSimpleValueType();
39600 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39601 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39602
39603 SDValue Res;
39604
39605 unsigned NumBaseMaskElts = BaseMask.size();
39606 if (NumBaseMaskElts == 1) {
39607 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39608 return CanonicalizeShuffleInput(RootVT, V1);
39609 }
39610
39611 bool OptForSize = DAG.shouldOptForSize();
39612 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39613 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39614 (RootVT.isFloatingPoint() && Depth >= 1) ||
39615 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39616
39617 // Don't combine if we are an AVX512/EVEX target and the mask element size
39618 // is different from the root element size - this would prevent writemasks
39619 // from being reused.
39620 bool IsMaskedShuffle = false;
39621 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39622 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39623 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39624 IsMaskedShuffle = true;
39625 }
39626 }
39627
39628 // If we are shuffling a splat (and not introducing zeros) then we can just
39629 // use it directly. This works for smaller elements as well as they already
39630 // repeat across each mask element.
39631 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39632 V1.getValueSizeInBits() >= RootSizeInBits &&
39633 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39634 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39635 return CanonicalizeShuffleInput(RootVT, V1);
39636 }
39637
39638 SmallVector<int, 64> Mask(BaseMask);
39639
39640 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39641 // etc. can be simplified.
39642 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39643 SmallVector<int> ScaledMask, IdentityMask;
39644 unsigned NumElts = VT1.getVectorNumElements();
39645 if (Mask.size() <= NumElts &&
39646 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39647 for (unsigned i = 0; i != NumElts; ++i)
39648 IdentityMask.push_back(i);
39649 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39650 V2))
39651 return CanonicalizeShuffleInput(RootVT, V1);
39652 }
39653 }
39654
39655 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39656 if (RootVT.is512BitVector() &&
39657 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39658 // If the upper subvectors are zeroable, then an extract+insert is more
39659 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39660 // to zero the upper subvectors.
39661 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39662 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39663 return SDValue(); // Nothing to do!
39664 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39665 "Unexpected lane shuffle");
39666 Res = CanonicalizeShuffleInput(RootVT, V1);
39667 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39668 bool UseZero = isAnyZero(Mask);
39669 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39670 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39671 }
39672
39673 // Narrow shuffle mask to v4x128.
39674 SmallVector<int, 4> ScaledMask;
39675 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39676 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39677
39678 // Try to lower to vshuf64x2/vshuf32x4.
39679 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39680 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39681 SelectionDAG &DAG) {
39682 unsigned PermMask = 0;
39683 // Ensure elements came from the same Op.
39684 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39685 for (int i = 0; i < 4; ++i) {
39686 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39687 if (ScaledMask[i] < 0)
39688 continue;
39689
39690 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39691 unsigned OpIndex = i / 2;
39692 if (Ops[OpIndex].isUndef())
39693 Ops[OpIndex] = Op;
39694 else if (Ops[OpIndex] != Op)
39695 return SDValue();
39696
39697 // Convert the 128-bit shuffle mask selection values into 128-bit
39698 // selection bits defined by a vshuf64x2 instruction's immediate control
39699 // byte.
39700 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39701 }
39702
39703 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39704 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39705 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39706 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39707 };
39708
39709 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39710 // doesn't work because our mask is for 128 bits and we don't have an MVT
39711 // to match that.
39712 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39713 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39714 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39715 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39716 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39717 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39718 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39719 ScaledMask[1] == (ScaledMask[3] % 2));
39720
39721 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39722 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39723 return SDValue(); // Nothing to do!
39724 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39725 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39726 return DAG.getBitcast(RootVT, V);
39727 }
39728 }
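// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. MatchSHUF128 above builds the vshuf64x2/vshuf32x4
// immediate by giving each of the four destination 128-bit lanes a 2-bit
// selector at bit position i * 2; the identity lane mask {0, 1, 2, 3}
// therefore encodes to 0b11100100 = 0xE4.
static unsigned encodeShuf128Imm(const int (&LaneMask)[4]) {
  unsigned Imm = 0;
  for (int I = 0; I != 4; ++I)
    if (LaneMask[I] >= 0)        // negative (undef) entries are skipped
      Imm |= (LaneMask[I] % 4) << (I * 2);
  return Imm;
}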
39729
39730 // Handle 128-bit lane shuffles of 256-bit vectors.
39731 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39732 // If the upper half is zeroable, then an extract+insert is more optimal
39733 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39734 // zero the upper half.
39735 if (isUndefOrZero(Mask[1])) {
39736 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39737 return SDValue(); // Nothing to do!
39738 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39739 Res = CanonicalizeShuffleInput(RootVT, V1);
39740 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39741 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39742 256);
39743 }
39744
39745 // If we're inserting the low subvector, an insert-subvector 'concat'
39746 // pattern is quicker than VPERM2X128.
39747 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39748 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39749 !Subtarget.hasAVX2()) {
39750 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39751 return SDValue(); // Nothing to do!
39752 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39753 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39754 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39755 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39756 }
39757
39758 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39759 return SDValue(); // Nothing to do!
39760
39761 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39762 // we need to use the zeroing feature.
39763 // Prefer blends for sequential shuffles unless we are optimizing for size.
39764 if (UnaryShuffle &&
39765 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39766 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39767 unsigned PermMask = 0;
39768 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39769 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39770 return DAG.getNode(
39771 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39772 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39773 }
39774
39775 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39776 return SDValue(); // Nothing to do!
39777
39778 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39779 if (!UnaryShuffle && !IsMaskedShuffle) {
39780 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39781 "Unexpected shuffle sentinel value");
39782 // Prefer blends to X86ISD::VPERM2X128.
39783 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39784 unsigned PermMask = 0;
39785 PermMask |= ((Mask[0] & 3) << 0);
39786 PermMask |= ((Mask[1] & 3) << 4);
39787 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39788 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39789 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39790 CanonicalizeShuffleInput(RootVT, LHS),
39791 CanonicalizeShuffleInput(RootVT, RHS),
39792 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39793 }
39794 }
39795 }
39796
39797 // For masks that have been widened to 128-bit elements or more,
39798 // narrow back down to 64-bit elements.
39799 if (BaseMaskEltSizeInBits > 64) {
39800 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39801 int MaskScale = BaseMaskEltSizeInBits / 64;
39802 SmallVector<int, 64> ScaledMask;
39803 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39804 Mask = std::move(ScaledMask);
39805 }
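// Editor's illustrative sketch, not part of X86ISelLowering.cpp: the narrowing
// above replaces each wide mask element with MaskScale sequential narrow
// elements, e.g. a 128-bit-element mask {1,0} scaled by 2 becomes the
// 64-bit-element mask {2,3,0,1}. A hypothetical standalone approximation of
// that behaviour (negative undef/zero sentinels are simply replicated):
static void narrowMaskSketch(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &Scaled) {
  Scaled.clear();
  for (int M : Mask)
    for (int J = 0; J != Scale; ++J)
      Scaled.push_back(M < 0 ? M : M * Scale + J);
}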
39806
39807 // For masked shuffles, we're trying to match the root width for better
39808 // writemask folding; attempt to scale the mask.
39809 // TODO - variable shuffles might need this to be widened again.
39810 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39811 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39812 int MaskScale = NumRootElts / Mask.size();
39813 SmallVector<int, 64> ScaledMask;
39814 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39815 Mask = std::move(ScaledMask);
39816 }
39817
39818 unsigned NumMaskElts = Mask.size();
39819 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39820
39821 // Determine the effective mask value type.
39822 FloatDomain &= (32 <= MaskEltSizeInBits);
39823 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39824 : MVT::getIntegerVT(MaskEltSizeInBits);
39825 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39826
39827 // Only allow legal mask types.
39828 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39829 return SDValue();
39830
39831 // Attempt to match the mask against known shuffle patterns.
39832 MVT ShuffleSrcVT, ShuffleVT;
39833 unsigned Shuffle, PermuteImm;
39834
39835 // Which shuffle domains are permitted?
39836 // Permit domain crossing at higher combine depths.
39837 // TODO: Should we indicate which domain is preferred if both are allowed?
39838 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39839 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39840 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39841
39842 // Determine zeroable mask elements.
39843 APInt KnownUndef, KnownZero;
39844 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39845 APInt Zeroable = KnownUndef | KnownZero;
39846
39847 if (UnaryShuffle) {
39848 // Attempt to match against broadcast-from-vector.
39849 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39850 if ((Subtarget.hasAVX2() ||
39851 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39852 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39853 if (isUndefOrEqual(Mask, 0)) {
39854 if (V1.getValueType() == MaskVT &&
39855 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39856 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39857 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39858 return SDValue(); // Nothing to do!
39859 Res = V1.getOperand(0);
39860 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39861 return DAG.getBitcast(RootVT, Res);
39862 }
39863 if (Subtarget.hasAVX2()) {
39864 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39865 return SDValue(); // Nothing to do!
39866 Res = CanonicalizeShuffleInput(MaskVT, V1);
39867 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39868 return DAG.getBitcast(RootVT, Res);
39869 }
39870 }
39871 }
39872
39873 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39874 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39875 (!IsMaskedShuffle ||
39876 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39877 if (Depth == 0 && Root.getOpcode() == Shuffle)
39878 return SDValue(); // Nothing to do!
39879 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39880 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39881 return DAG.getBitcast(RootVT, Res);
39882 }
39883
39884 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39885 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39886 PermuteImm) &&
39887 (!IsMaskedShuffle ||
39888 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39889 if (Depth == 0 && Root.getOpcode() == Shuffle)
39890 return SDValue(); // Nothing to do!
39891 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39892 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39893 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39894 return DAG.getBitcast(RootVT, Res);
39895 }
39896 }
39897
39898 // Attempt to combine to INSERTPS, but only if the inserted element has come
39899 // from a scalar.
39900 // TODO: Handle other insertions here as well?
39901 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39902 Subtarget.hasSSE41() &&
39903 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39904 if (MaskEltSizeInBits == 32) {
39905 SDValue SrcV1 = V1, SrcV2 = V2;
39906 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39907 DAG) &&
39908 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39909 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39910 return SDValue(); // Nothing to do!
39911 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39912 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39913 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39914 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39915 return DAG.getBitcast(RootVT, Res);
39916 }
39917 }
39918 if (MaskEltSizeInBits == 64 &&
39919 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39920 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39921 V2.getScalarValueSizeInBits() <= 32) {
39922 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39923 return SDValue(); // Nothing to do!
39924 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39925 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39926 CanonicalizeShuffleInput(MVT::v4f32, V1),
39927 CanonicalizeShuffleInput(MVT::v4f32, V2),
39928 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39929 return DAG.getBitcast(RootVT, Res);
39930 }
39931 }
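// Editor's illustrative sketch, not part of X86ISelLowering.cpp: for the
// register form of INSERTPS, imm[7:6] selects the source f32 element,
// imm[5:4] the destination element to overwrite, and imm[3:0] is a zero mask
// applied to the result. A hypothetical helper showing the encoding:
static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZeroMask) {
  return ((SrcIdx & 0x3) << 6) | ((DstIdx & 0x3) << 4) | (ZeroMask & 0xF);
}
// e.g. encodeInsertPSImm(0, 2, 0) == 0x20, matching the immediate built above.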
39932
39933 SDValue NewV1 = V1; // Save operands in case early exit happens.
39934 SDValue NewV2 = V2;
39935 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39936 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39937 ShuffleVT, UnaryShuffle) &&
39938 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39939 if (Depth == 0 && Root.getOpcode() == Shuffle)
39940 return SDValue(); // Nothing to do!
39941 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39942 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39943 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39944 return DAG.getBitcast(RootVT, Res);
39945 }
39946
39947 NewV1 = V1; // Save operands in case early exit happens.
39948 NewV2 = V2;
39949 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39950 AllowIntDomain, NewV1, NewV2, DL, DAG,
39951 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39952 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39953 if (Depth == 0 && Root.getOpcode() == Shuffle)
39954 return SDValue(); // Nothing to do!
39955 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39956 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39957 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39958 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39959 return DAG.getBitcast(RootVT, Res);
39960 }
39961
39962 // Typically from here on, we need an integer version of MaskVT.
39963 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39964 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39965
39966 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39967 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39968 uint64_t BitLen, BitIdx;
39969 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39970 Zeroable)) {
39971 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39972 return SDValue(); // Nothing to do!
39973 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39974 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39975 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39976 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39977 return DAG.getBitcast(RootVT, Res);
39978 }
39979
39980 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39981 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39982 return SDValue(); // Nothing to do!
39983 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39984 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39985 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39986 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39987 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39988 return DAG.getBitcast(RootVT, Res);
39989 }
39990 }
39991
39992 // Match shuffle against TRUNCATE patterns.
39993 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39994 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39995 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39996 Subtarget)) {
39997 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39998 ShuffleSrcVT.getVectorNumElements();
39999 unsigned Opc =
40000 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40001 if (Depth == 0 && Root.getOpcode() == Opc)
40002 return SDValue(); // Nothing to do!
40003 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40004 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40005 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40006 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40007 return DAG.getBitcast(RootVT, Res);
40008 }
40009
40010 // Do we need a more general binary truncation pattern?
40011 if (RootSizeInBits < 512 &&
40012 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40013 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40014 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40015 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40016 // Bail if this was already a truncation or PACK node.
40017 // We sometimes fail to match PACK if we demand known undef elements.
40018 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40019 Root.getOpcode() == X86ISD::PACKSS ||
40020 Root.getOpcode() == X86ISD::PACKUS))
40021 return SDValue(); // Nothing to do!
40022 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40023 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40024 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40025 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40026 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40027 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40028 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40029 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40030 return DAG.getBitcast(RootVT, Res);
40031 }
40032 }
40033
40034 // Don't try to re-form single instruction chains under any circumstances now
40035 // that we've done encoding canonicalization for them.
40036 if (Depth < 1)
40037 return SDValue();
40038
40039 // Depth threshold above which we can efficiently use variable mask shuffles.
40040 int VariableCrossLaneShuffleDepth =
40041 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40042 int VariablePerLaneShuffleDepth =
40043 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40044 AllowVariableCrossLaneMask &=
40045 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40046 AllowVariablePerLaneMask &=
40047 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40048 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40049 // higher depth before combining them.
40050 bool AllowBWIVPERMV3 =
40051 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40052
40053 bool MaskContainsZeros = isAnyZero(Mask);
40054
40055 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40056 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40057 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40058 if (Subtarget.hasAVX2() &&
40059 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40060 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40061 Res = CanonicalizeShuffleInput(MaskVT, V1);
40062 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40063 return DAG.getBitcast(RootVT, Res);
40064 }
40065 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40066 if ((Subtarget.hasAVX512() &&
40067 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40068 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40069 (Subtarget.hasBWI() &&
40070 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40071 (Subtarget.hasVBMI() &&
40072 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40073 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40074 V2 = DAG.getUNDEF(MaskVT);
40075 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40076 return DAG.getBitcast(RootVT, Res);
40077 }
40078 }
40079
40080 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40081 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40082 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40083 ((Subtarget.hasAVX512() &&
40084 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40085 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40086 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40087 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40088 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40089 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40090 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40091 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40092 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40093 for (unsigned i = 0; i != NumMaskElts; ++i)
40094 if (Mask[i] == SM_SentinelZero)
40095 Mask[i] = NumMaskElts + i;
40096 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40097 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40098 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40099 return DAG.getBitcast(RootVT, Res);
40100 }
40101
40102 // If that failed and either input is extracted then try to combine as a
40103 // shuffle with the larger type.
40104 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40105 Inputs, Root, BaseMask, Depth, HasVariableMask,
40106 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40107 Subtarget))
40108 return WideShuffle;
40109
40110 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40111 // (non-VLX will pad to 512-bit shuffles).
40112 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40113 ((Subtarget.hasAVX512() &&
40114 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40115 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40116 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40117 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40118 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40119 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40120 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40121 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40122 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40123 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40124 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40125 return DAG.getBitcast(RootVT, Res);
40126 }
40127 return SDValue();
40128 }
40129
40130 // See if we can combine a single input shuffle with zeros to a bit-mask,
40131 // which is much simpler than any shuffle.
40132 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40133 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40134 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40135 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40136 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40137 APInt UndefElts(NumMaskElts, 0);
40138 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40139 for (unsigned i = 0; i != NumMaskElts; ++i) {
40140 int M = Mask[i];
40141 if (M == SM_SentinelUndef) {
40142 UndefElts.setBit(i);
40143 continue;
40144 }
40145 if (M == SM_SentinelZero)
40146 continue;
40147 EltBits[i] = AllOnes;
40148 }
40149 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40150 Res = CanonicalizeShuffleInput(MaskVT, V1);
40151 unsigned AndOpcode =
40152 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40153 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40154 return DAG.getBitcast(RootVT, Res);
40155 }
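// Editor's illustrative sketch, not part of X86ISelLowering.cpp: when the mask
// is an identity shuffle that only zeroes some elements, the whole shuffle is
// equivalent to an AND with a constant - all-ones where an element is kept,
// zero where it is cleared (undef entries are treated as zero here for
// simplicity). A scalar model of the fold performed above:
static void bitMaskFoldSketch(ArrayRef<int> Mask, ArrayRef<uint64_t> Src,
                              SmallVectorImpl<uint64_t> &Out) {
  Out.clear();
  for (size_t I = 0, E = Mask.size(); I != E; ++I)
    Out.push_back(Src[I] & (Mask[I] == int(I) ? ~0ULL : 0ULL));
}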
40156
40157 // If we have a single input shuffle with different shuffle patterns in
40158 // the 128-bit lanes, use a variable mask with VPERMILPS.
40159 // TODO: Combine other mask types at higher depths.
40160 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40161 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40162 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40163 SmallVector<SDValue, 16> VPermIdx;
40164 for (int M : Mask) {
40165 SDValue Idx =
40166 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40167 VPermIdx.push_back(Idx);
40168 }
40169 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40170 Res = CanonicalizeShuffleInput(MaskVT, V1);
40171 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40172 return DAG.getBitcast(RootVT, Res);
40173 }
40174
40175 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40176 // to VPERMIL2PD/VPERMIL2PS.
40177 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40178 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40179 MaskVT == MVT::v8f32)) {
40180 // VPERMIL2 Operation.
40181 // Bits[3] - Match Bit.
40182 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40183 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40184 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40185 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40186 SmallVector<int, 8> VPerm2Idx;
40187 unsigned M2ZImm = 0;
40188 for (int M : Mask) {
40189 if (M == SM_SentinelUndef) {
40190 VPerm2Idx.push_back(-1);
40191 continue;
40192 }
40193 if (M == SM_SentinelZero) {
40194 M2ZImm = 2;
40195 VPerm2Idx.push_back(8);
40196 continue;
40197 }
40198 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40199 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40200 VPerm2Idx.push_back(Index);
40201 }
40202 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40203 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40204 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40205 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40206 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40207 return DAG.getBitcast(RootVT, Res);
40208 }
40209
40210 // If we have 3 or more shuffle instructions or a chain involving a variable
40211 // mask, we can replace them with a single PSHUFB instruction profitably.
40212 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40213 // instructions, but in practice PSHUFB tends to be *very* fast, so we're
40214 // more aggressive.
40215 if (UnaryShuffle && AllowVariablePerLaneMask &&
40216 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40217 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40218 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40219 SmallVector<SDValue, 16> PSHUFBMask;
40220 int NumBytes = RootVT.getSizeInBits() / 8;
40221 int Ratio = NumBytes / NumMaskElts;
40222 for (int i = 0; i < NumBytes; ++i) {
40223 int M = Mask[i / Ratio];
40224 if (M == SM_SentinelUndef) {
40225 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40226 continue;
40227 }
40228 if (M == SM_SentinelZero) {
40229 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40230 continue;
40231 }
40232 M = Ratio * M + i % Ratio;
40233 assert((M / 16) == (i / 16) && "Lane crossing detected");
40234 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40235 }
40236 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40237 Res = CanonicalizeShuffleInput(ByteVT, V1);
40238 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40239 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40240 return DAG.getBitcast(RootVT, Res);
40241 }
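// Editor's illustrative sketch, not part of X86ISelLowering.cpp: PSHUFB's
// control bytes behave as modelled below - if bit 7 of the control byte is
// set (hence the 0x80 constants above) the result byte is zero, otherwise the
// low four bits index a byte within the same 16-byte lane.
static uint8_t pshufbByteSketch(const uint8_t Lane[16], uint8_t Control) {
  return (Control & 0x80) ? 0 : Lane[Control & 0x0F];
}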
40242
40243 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40244 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40245 // slower than PSHUFB on targets that support both.
40246 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40247 Subtarget.hasXOP()) {
40248 // VPPERM Mask Operation
40249 // Bits[4:0] - Byte Index (0 - 31)
40250 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40251 SmallVector<SDValue, 16> VPPERMMask;
40252 int NumBytes = 16;
40253 int Ratio = NumBytes / NumMaskElts;
40254 for (int i = 0; i < NumBytes; ++i) {
40255 int M = Mask[i / Ratio];
40256 if (M == SM_SentinelUndef) {
40257 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40258 continue;
40259 }
40260 if (M == SM_SentinelZero) {
40261 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40262 continue;
40263 }
40264 M = Ratio * M + i % Ratio;
40265 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40266 }
40267 MVT ByteVT = MVT::v16i8;
40268 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40269 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40270 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40271 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40272 return DAG.getBitcast(RootVT, Res);
40273 }
40274
40275 // If that failed and either input is extracted then try to combine as a
40276 // shuffle with the larger type.
40277 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40278 Inputs, Root, BaseMask, Depth, HasVariableMask,
40279 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40280 return WideShuffle;
40281
40282 // If we have a dual input shuffle then lower to VPERMV3,
40283 // (non-VLX will pad to 512-bit shuffles)
40284 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40285 ((Subtarget.hasAVX512() &&
40286 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40287 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40288 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40289 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40290 MaskVT == MVT::v16i32)) ||
40291 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40292 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40293 MaskVT == MVT::v32i16)) ||
40294 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40295 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40296 MaskVT == MVT::v64i8)))) {
40297 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40298 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40299 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40300 return DAG.getBitcast(RootVT, Res);
40301 }
40302
40303 // Failed to find any combines.
40304 return SDValue();
40305}
40306
40307// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40308// instruction if possible.
40309//
40310// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40311// type size to attempt to combine:
40312// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40313// -->
40314// extract_subvector(shuffle(x,y,m2),0)
40315static SDValue combineX86ShuffleChainWithExtract(
40316 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40317 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40318 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40319 const X86Subtarget &Subtarget) {
40320 unsigned NumMaskElts = BaseMask.size();
40321 unsigned NumInputs = Inputs.size();
40322 if (NumInputs == 0)
40323 return SDValue();
40324
40325 EVT RootVT = Root.getValueType();
40326 unsigned RootSizeInBits = RootVT.getSizeInBits();
40327 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40328 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40329
40330 // Peek through extract_subvector to find widest legal vector.
40331 // TODO: Handle ISD::TRUNCATE
40332 unsigned WideSizeInBits = RootSizeInBits;
40333 for (unsigned I = 0; I != NumInputs; ++I) {
40334 SDValue Input = peekThroughBitcasts(Inputs[I]);
40335 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40336 Input = peekThroughBitcasts(Input.getOperand(0));
40337 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40338 WideSizeInBits < Input.getValueSizeInBits())
40339 WideSizeInBits = Input.getValueSizeInBits();
40340 }
40341
40342 // Bail if we fail to find a source larger than the existing root.
40343 unsigned Scale = WideSizeInBits / RootSizeInBits;
40344 if (WideSizeInBits <= RootSizeInBits ||
40345 (WideSizeInBits % RootSizeInBits) != 0)
40346 return SDValue();
40347
40348 // Create new mask for larger type.
40349 SmallVector<int, 64> WideMask(BaseMask);
40350 for (int &M : WideMask) {
40351 if (M < 0)
40352 continue;
40353 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40354 }
40355 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
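  // Editor's note (illustrative, not part of X86ISelLowering.cpp): with
  // NumMaskElts == 4 and Scale == 2, a base mask {0,5,2,7} is remapped into
  // the widened index space as {0,9,2,11}, and four SM_SentinelUndef entries
  // are then appended for the not-yet-referenced upper half.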
40356
40357 // Attempt to peek through inputs and adjust mask when we extract from an
40358 // upper subvector.
40359 int AdjustedMasks = 0;
40360 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40361 for (unsigned I = 0; I != NumInputs; ++I) {
40362 SDValue &Input = WideInputs[I];
40363 Input = peekThroughBitcasts(Input);
40364 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40365 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40366 uint64_t Idx = Input.getConstantOperandVal(1);
40367 if (Idx != 0) {
40368 ++AdjustedMasks;
40369 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40370 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40371
40372 int lo = I * WideMask.size();
40373 int hi = (I + 1) * WideMask.size();
40374 for (int &M : WideMask)
40375 if (lo <= M && M < hi)
40376 M += Idx;
40377 }
40378 Input = peekThroughBitcasts(Input.getOperand(0));
40379 }
40380 }
40381
40382 // Remove unused/repeated shuffle source ops.
40383 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40384 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40385
40386 // Bail if we're always extracting from the lowest subvectors
40387 // (combineX86ShuffleChain should match this for the current width), or if
40388 // the shuffle still references too many inputs.
40389 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40390 return SDValue();
40391
40392 // Minor canonicalization of the accumulated shuffle mask to make it easier
40393 // to match below. All this does is detect masks with sequential pairs of
40394 // elements, and shrink them to the half-width mask. It does this in a loop
40395 // so it will reduce the size of the mask to the minimal width mask which
40396 // performs an equivalent shuffle.
40397 while (WideMask.size() > 1) {
40398 SmallVector<int, 64> WidenedMask;
40399 if (!canWidenShuffleElements(WideMask, WidenedMask))
40400 break;
40401 WideMask = std::move(WidenedMask);
40402 }
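// Editor's illustrative sketch, not part of X86ISelLowering.cpp: one widening
// step succeeds when every adjacent pair of mask elements references an
// even-aligned, adjacent pair of source elements, e.g. {0,1,6,7} widens to
// {0,3}. A simplified standalone check (the real canWidenShuffleElements also
// handles undef/zero sentinels):
static bool widenMaskPairsSketch(ArrayRef<int> Mask,
                                 SmallVectorImpl<int> &Widened) {
  Widened.clear();
  for (size_t I = 0, E = Mask.size(); I + 1 < E; I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 < 0 || (M0 % 2) != 0 || M1 != M0 + 1)
      return false;
    Widened.push_back(M0 / 2);
  }
  return true;
}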
40403
40404 // Canonicalization of binary shuffle masks to improve pattern matching by
40405 // commuting the inputs.
40406 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40407 ShuffleVectorSDNode::commuteMask(WideMask);
40408 std::swap(WideInputs[0], WideInputs[1]);
40409 }
40410
40411 // Increase depth for every upper subvector we've peeked through.
40412 Depth += AdjustedMasks;
40413
40414 // Attempt to combine wider chain.
40415 // TODO: Can we use a better Root?
40416 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40417 WideInputs.back().getValueSizeInBits()
40418 ? WideInputs.front()
40419 : WideInputs.back();
40420 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40421        "WideRootSize mismatch");
40422
40423 if (SDValue WideShuffle =
40424 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40425 HasVariableMask, AllowVariableCrossLaneMask,
40426 AllowVariablePerLaneMask, DAG, Subtarget)) {
40427 WideShuffle =
40428 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40429 return DAG.getBitcast(RootVT, WideShuffle);
40430 }
40431
40432 return SDValue();
40433}
40434
40435// Canonicalize the combined shuffle mask chain with horizontal ops.
40436// NOTE: This may update the Ops and Mask.
40437static SDValue canonicalizeShuffleMaskWithHorizOp(
40438 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40439 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40440 const X86Subtarget &Subtarget) {
40441 if (Mask.empty() || Ops.empty())
40442 return SDValue();
40443
40444 SmallVector<SDValue> BC;
40445 for (SDValue Op : Ops)
40446 BC.push_back(peekThroughBitcasts(Op));
40447
40448 // All ops must be the same horizop + type.
40449 SDValue BC0 = BC[0];
40450 EVT VT0 = BC0.getValueType();
40451 unsigned Opcode0 = BC0.getOpcode();
40452 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40453 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40454 }))
40455 return SDValue();
40456
40457 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40458 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40459 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40460 if (!isHoriz && !isPack)
40461 return SDValue();
40462
40463 // Do all ops have a single use?
40464 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40465 return Op.hasOneUse() &&
40466 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40467 });
40468
40469 int NumElts = VT0.getVectorNumElements();
40470 int NumLanes = VT0.getSizeInBits() / 128;
40471 int NumEltsPerLane = NumElts / NumLanes;
40472 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40473 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40474 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40475
40476 if (NumEltsPerLane >= 4 &&
40477 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40478 SmallVector<int> LaneMask, ScaledMask;
40479 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40480 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40481 // See if we can remove the shuffle by re-sorting the HOP chain so that
40482 // the HOP args are pre-shuffled.
40483 // TODO: Generalize to any sized/depth chain.
40484 // TODO: Add support for PACKSS/PACKUS.
40485 if (isHoriz) {
40486 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40487 auto GetHOpSrc = [&](int M) {
40488 if (M == SM_SentinelUndef)
40489 return DAG.getUNDEF(VT0);
40490 if (M == SM_SentinelZero)
40491 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40492 SDValue Src0 = BC[M / 4];
40493 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40494 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40495 return Src1.getOperand(M % 2);
40496 return SDValue();
40497 };
40498 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40499 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40500 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40501 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40502 if (M0 && M1 && M2 && M3) {
40503 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40504 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40505 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40506 }
40507 }
40508 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40509 if (Ops.size() >= 2) {
40510 SDValue LHS, RHS;
40511 auto GetHOpSrc = [&](int M, int &OutM) {
40512 // TODO: Support SM_SentinelZero
40513 if (M < 0)
40514 return M == SM_SentinelUndef;
40515 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40516 if (!LHS || LHS == Src) {
40517 LHS = Src;
40518 OutM = (M % 2);
40519 return true;
40520 }
40521 if (!RHS || RHS == Src) {
40522 RHS = Src;
40523 OutM = (M % 2) + 2;
40524 return true;
40525 }
40526 return false;
40527 };
40528 int PostMask[4] = {-1, -1, -1, -1};
40529 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40530 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40531 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40532 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40533 LHS = DAG.getBitcast(SrcVT, LHS);
40534 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40535 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40536 // Use SHUFPS for the permute so this will work on SSE3 targets,
40537 // shuffle combining and domain handling will simplify this later on.
40538 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40539 Res = DAG.getBitcast(ShuffleVT, Res);
40540 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40541 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40542 }
40543 }
40544 }
40545 }
40546
40547 if (2 < Ops.size())
40548 return SDValue();
40549
40550 SDValue BC1 = BC[BC.size() - 1];
40551 if (Mask.size() == VT0.getVectorNumElements()) {
40552 // Canonicalize binary shuffles of horizontal ops that use the
40553 // same sources to a unary shuffle.
40554 // TODO: Try to perform this fold even if the shuffle remains.
40555 if (Ops.size() == 2) {
40556 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40557 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40558 };
40559 // Commute if all BC0's ops are contained in BC1.
40560 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40561 ContainsOps(BC1, BC0.getOperand(1))) {
40562 ShuffleVectorSDNode::commuteMask(Mask);
40563 std::swap(Ops[0], Ops[1]);
40564 std::swap(BC0, BC1);
40565 }
40566
40567 // If BC1 can be represented by BC0, then convert to unary shuffle.
40568 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40569 ContainsOps(BC0, BC1.getOperand(1))) {
40570 for (int &M : Mask) {
40571 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40572 continue;
40573 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40574 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40575 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40576 M += NumHalfEltsPerLane;
40577 }
40578 }
40579 }
40580
40581 // Canonicalize unary horizontal ops to only refer to lower halves.
40582 for (int i = 0; i != NumElts; ++i) {
40583 int &M = Mask[i];
40584 if (isUndefOrZero(M))
40585 continue;
40586 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40587 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40588 M -= NumHalfEltsPerLane;
40589 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40590 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40591 M -= NumHalfEltsPerLane;
40592 }
40593 }
40594
40595 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40596 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40597 // represents the LHS/RHS inputs for the lower/upper halves.
40598 SmallVector<int, 16> TargetMask128, WideMask128;
40599 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40600 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40601 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40602 bool SingleOp = (Ops.size() == 1);
40603 if (isPack || OneUseOps ||
40604 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40605 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40606 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40607 Lo = Lo.getOperand(WideMask128[0] & 1);
40608 Hi = Hi.getOperand(WideMask128[1] & 1);
40609 if (SingleOp) {
40610 SDValue Undef = DAG.getUNDEF(SrcVT);
40611 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40612 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40613 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40614 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40615 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40616 }
40617 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40618 }
40619 }
40620
40621 return SDValue();
40622}
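// Editor's illustrative sketch, not part of X86ISelLowering.cpp: the folds
// above rely on the horizontal-op element layout modelled here for a 4 x f32
// HADD - the low half of the result holds pairwise sums of the first operand,
// the high half pairwise sums of the second. A shuffle of two such results can
// therefore often be re-expressed as a single HADD of re-picked operands.
static void haddV4F32Sketch(const float L[4], const float R[4], float Out[4]) {
  Out[0] = L[0] + L[1];
  Out[1] = L[2] + L[3];
  Out[2] = R[0] + R[1];
  Out[3] = R[2] + R[3];
}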
40623
40624// Attempt to constant fold all of the constant source ops.
40625// Returns true if the entire shuffle is folded to a constant.
40626// TODO: Extend this to merge multiple constant Ops and update the mask.
40627static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40628 ArrayRef<int> Mask, SDValue Root,
40629 bool HasVariableMask,
40630 SelectionDAG &DAG,
40631 const X86Subtarget &Subtarget) {
40632 MVT VT = Root.getSimpleValueType();
40633
40634 unsigned SizeInBits = VT.getSizeInBits();
40635 unsigned NumMaskElts = Mask.size();
40636 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40637 unsigned NumOps = Ops.size();
40638
40639 // Extract constant bits from each source op.
40640 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40641 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40642 for (unsigned I = 0; I != NumOps; ++I)
40643 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40644 RawBitsOps[I]))
40645 return SDValue();
40646
40647 // If we're optimizing for size, only fold if at least one of the constants is
40648 // only used once or the combined shuffle has included a variable mask
40649 // shuffle; this is to avoid constant pool bloat.
40650 bool IsOptimizingSize = DAG.shouldOptForSize();
40651 if (IsOptimizingSize && !HasVariableMask &&
40652 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40653 return SDValue();
40654
40655 // Shuffle the constant bits according to the mask.
40656 SDLoc DL(Root);
40657 APInt UndefElts(NumMaskElts, 0);
40658 APInt ZeroElts(NumMaskElts, 0);
40659 APInt ConstantElts(NumMaskElts, 0);
40660 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40661 APInt::getZero(MaskSizeInBits));
40662 for (unsigned i = 0; i != NumMaskElts; ++i) {
40663 int M = Mask[i];
40664 if (M == SM_SentinelUndef) {
40665 UndefElts.setBit(i);
40666 continue;
40667 } else if (M == SM_SentinelZero) {
40668 ZeroElts.setBit(i);
40669 continue;
40670 }
40671 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40672
40673 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40674 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40675
40676 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40677 if (SrcUndefElts[SrcMaskIdx]) {
40678 UndefElts.setBit(i);
40679 continue;
40680 }
40681
40682 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40683 APInt &Bits = SrcEltBits[SrcMaskIdx];
40684 if (!Bits) {
40685 ZeroElts.setBit(i);
40686 continue;
40687 }
40688
40689 ConstantElts.setBit(i);
40690 ConstantBitData[i] = Bits;
40691 }
40692 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40693
40694 // Attempt to create a zero vector.
40695 if ((UndefElts | ZeroElts).isAllOnes())
40696 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40697
40698 // Create the constant data.
40699 MVT MaskSVT;
40700 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40701 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40702 else
40703 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40704
40705 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40706 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40707 return SDValue();
40708
40709 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40710 return DAG.getBitcast(VT, CstOp);
40711}
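// Editor's illustrative sketch, not part of X86ISelLowering.cpp: once every
// source operand is constant, the shuffle can be evaluated element by element
// as modelled below - an undef sentinel stays undef, a zero sentinel becomes
// zero, and anything else is looked up as element (M % NumElts) of source op
// (M / NumElts). Hypothetical helper:
static APInt foldConstShuffleEltSketch(int M, unsigned NumElts,
                                       ArrayRef<SmallVector<APInt, 16>> SrcBits,
                                       unsigned EltSizeInBits, bool &IsUndef) {
  IsUndef = (M == SM_SentinelUndef);
  if (M < 0) // SM_SentinelUndef or SM_SentinelZero - no constant data to copy.
    return APInt::getZero(EltSizeInBits);
  return SrcBits[unsigned(M) / NumElts][unsigned(M) % NumElts];
}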
40712
40713namespace llvm {
40714 namespace X86 {
40715 enum {
40716 MaxShuffleCombineDepth = 8
40717 };
40718 }
40719} // namespace llvm
40720
40721/// Fully generic combining of x86 shuffle instructions.
40722///
40723/// This should be the last combine run over the x86 shuffle instructions. Once
40724/// they have been fully optimized, this will recursively consider all chains
40725/// of single-use shuffle instructions, build a generic model of the cumulative
40726/// shuffle operation, and check for simpler instructions which implement this
40727/// operation. We use this primarily for two purposes:
40728///
40729/// 1) Collapse generic shuffles to specialized single instructions when
40730/// equivalent. In most cases, this is just an encoding size win, but
40731/// sometimes we will collapse multiple generic shuffles into a single
40732/// special-purpose shuffle.
40733/// 2) Look for sequences of shuffle instructions with 3 or more total
40734/// instructions, and replace them with the slightly more expensive SSSE3
40735/// PSHUFB instruction if available. We do this as the last combining step
40736/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40737/// a suitable short sequence of other instructions. The PSHUFB will either
40738/// use a register or have to read from memory and so is slightly (but only
40739/// slightly) more expensive than the other shuffle instructions.
40740///
40741/// Because this is inherently a quadratic operation (for each shuffle in
40742/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40743/// This should never be an issue in practice as the shuffle lowering doesn't
40744/// produce sequences of more than 8 instructions.
40745///
40746/// FIXME: We will currently miss some cases where the redundant shuffling
40747/// would simplify under the threshold for PSHUFB formation because of
40748/// combine-ordering. To fix this, we should do the redundant instruction
40749/// combining in this recursive walk.
40750static SDValue combineX86ShufflesRecursively(
40751 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40752 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40753 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40754 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40755 const X86Subtarget &Subtarget) {
40756 assert(!RootMask.empty() &&
40757        (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40758        "Illegal shuffle root mask");
40759 MVT RootVT = Root.getSimpleValueType();
40760 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40761 unsigned RootSizeInBits = RootVT.getSizeInBits();
40762
40763 // Bound the depth of our recursive combine because this is ultimately
40764 // quadratic in nature.
40765 if (Depth >= MaxDepth)
40766 return SDValue();
40767
40768 // Directly rip through bitcasts to find the underlying operand.
40769 SDValue Op = SrcOps[SrcOpIndex];
40770 Op = peekThroughOneUseBitcasts(Op);
40771
40772 EVT VT = Op.getValueType();
40773 if (!VT.isVector() || !VT.isSimple())
40774 return SDValue(); // Bail if we hit a non-simple non-vector.
40775
40776 // FIXME: Just bail on f16 for now.
40777 if (VT.getVectorElementType() == MVT::f16)
40778 return SDValue();
40779
40780 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40781        "Can only combine shuffles upto size of the root op.");
40782
40783 // Create a demanded elts mask from the referenced elements of Op.
40784 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40785 for (int M : RootMask) {
40786 int BaseIdx = RootMask.size() * SrcOpIndex;
40787 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40788 OpDemandedElts.setBit(M - BaseIdx);
40789 }
40790 if (RootSizeInBits != VT.getSizeInBits()) {
40791 // Op is smaller than Root - extract the demanded elts for the subvector.
40792 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40793 unsigned NumOpMaskElts = RootMask.size() / Scale;
40794 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40795 assert(OpDemandedElts
40796            .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40797            .isZero() &&
40798        "Out of range elements referenced in root mask");
40799 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40800 }
40801 OpDemandedElts =
40802 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
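  // Editor's note (illustrative, not part of X86ISelLowering.cpp): scaling the
  // demanded-elements mask up replicates each bit across the corresponding
  // narrower elements, e.g. demanding elements {0,2} of a 4-element op becomes
  // demanding elements {0,1,4,5} once the op is viewed as 8 elements.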
40803
40804 // Extract target shuffle mask and resolve sentinels and inputs.
40805 SmallVector<int, 64> OpMask;
40806 SmallVector<SDValue, 2> OpInputs;
40807 APInt OpUndef, OpZero;
40808 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40809 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40810 OpZero, DAG, Depth, false)) {
40811 // Shuffle inputs must not be larger than the shuffle result.
40812 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40813 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40814 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40815 }))
40816 return SDValue();
40817 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40818 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40819 !isNullConstant(Op.getOperand(1))) {
40820 SDValue SrcVec = Op.getOperand(0);
40821 int ExtractIdx = Op.getConstantOperandVal(1);
40822 unsigned NumElts = VT.getVectorNumElements();
40823 OpInputs.assign({SrcVec});
40824 OpMask.assign(NumElts, SM_SentinelUndef);
40825 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40826 OpZero = OpUndef = APInt::getZero(NumElts);
40827 } else {
40828 return SDValue();
40829 }
40830
40831 // If the shuffle result was smaller than the root, we need to adjust the
40832 // mask indices and pad the mask with undefs.
40833 if (RootSizeInBits > VT.getSizeInBits()) {
40834 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40835 unsigned OpMaskSize = OpMask.size();
40836 if (OpInputs.size() > 1) {
40837 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40838 for (int &M : OpMask) {
40839 if (M < 0)
40840 continue;
40841 int EltIdx = M % OpMaskSize;
40842 int OpIdx = M / OpMaskSize;
40843 M = (PaddedMaskSize * OpIdx) + EltIdx;
40844 }
40845 }
40846 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40847 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40848 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40849 }
40850
40851 SmallVector<int, 64> Mask;
40852 SmallVector<SDValue, 16> Ops;
40853
40854 // We don't need to merge masks if the root is empty.
40855 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40856 if (EmptyRoot) {
40857 // Only resolve zeros if it will remove an input, otherwise we might end
40858 // up in an infinite loop.
40859 bool ResolveKnownZeros = true;
40860 if (!OpZero.isZero()) {
40861 APInt UsedInputs = APInt::getZero(OpInputs.size());
40862 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40863 int M = OpMask[i];
40864 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40865 continue;
40866 UsedInputs.setBit(M / OpMask.size());
40867 if (UsedInputs.isAllOnes()) {
40868 ResolveKnownZeros = false;
40869 break;
40870 }
40871 }
40872 }
40873 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40874 ResolveKnownZeros);
40875
40876 Mask = OpMask;
40877 Ops.append(OpInputs.begin(), OpInputs.end());
40878 } else {
40879 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40880
40881 // Add the inputs to the Ops list, avoiding duplicates.
40882 Ops.append(SrcOps.begin(), SrcOps.end());
40883
40884 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40885 // Attempt to find an existing match.
40886 SDValue InputBC = peekThroughBitcasts(Input);
40887 for (int i = 0, e = Ops.size(); i < e; ++i)
40888 if (InputBC == peekThroughBitcasts(Ops[i]))
40889 return i;
40890 // Match failed - should we replace an existing Op?
40891 if (InsertionPoint >= 0) {
40892 Ops[InsertionPoint] = Input;
40893 return InsertionPoint;
40894 }
40895 // Add to the end of the Ops list.
40896 Ops.push_back(Input);
40897 return Ops.size() - 1;
40898 };
40899
40900 SmallVector<int, 2> OpInputIdx;
40901 for (SDValue OpInput : OpInputs)
40902 OpInputIdx.push_back(
40903 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40904
40905 assert(((RootMask.size() > OpMask.size() &&
40906          RootMask.size() % OpMask.size() == 0) ||
40907         (OpMask.size() > RootMask.size() &&
40908          OpMask.size() % RootMask.size() == 0) ||
40909         OpMask.size() == RootMask.size()) &&
40910        "The smaller number of elements must divide the larger.");
40911
40912 // This function can be performance-critical, so we rely on the power-of-2
40913 // knowledge that we have about the mask sizes to replace div/rem ops with
40914 // bit-masks and shifts.
40915 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40916 "Non-power-of-2 shuffle mask sizes");
40917 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40918 "Non-power-of-2 shuffle mask sizes");
40919 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40920 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40921
40922 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40923 unsigned RootRatio =
40924 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40925 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40926 assert((RootRatio == 1 || OpRatio == 1) &&
40927 "Must not have a ratio for both incoming and op masks!");
40928
40929 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40930 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40931 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40932 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40933 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40934
40935 Mask.resize(MaskWidth, SM_SentinelUndef);
40936
40937 // Merge this shuffle operation's mask into our accumulated mask. Note that
40938 // this shuffle's mask will be the first applied to the input, followed by
40939 // the root mask to get us all the way to the root value arrangement. The
40940 // reason for this order is that we are recursing up the operation chain.
40941 for (unsigned i = 0; i < MaskWidth; ++i) {
40942 unsigned RootIdx = i >> RootRatioLog2;
40943 if (RootMask[RootIdx] < 0) {
40944 // This is a zero or undef lane; we're done.
40945 Mask[i] = RootMask[RootIdx];
40946 continue;
40947 }
40948
40949 unsigned RootMaskedIdx =
40950 RootRatio == 1
40951 ? RootMask[RootIdx]
40952 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40953
40954 // Just insert the scaled root mask value if it references an input other
40955 // than the SrcOp we're currently inserting.
40956 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40957 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40958 Mask[i] = RootMaskedIdx;
40959 continue;
40960 }
40961
40962 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40963 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40964 if (OpMask[OpIdx] < 0) {
40965 // The incoming lanes are zero or undef, so it doesn't matter which ones
40966 // we are using.
40967 Mask[i] = OpMask[OpIdx];
40968 continue;
40969 }
40970
40971 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40972 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40973 : (OpMask[OpIdx] << OpRatioLog2) +
40974 (RootMaskedIdx & (OpRatio - 1));
40975
40976 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40977 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40978 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40979 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40980
40981 Mask[i] = OpMaskedIdx;
40982 }
40983 }
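// Illustrative sketch (not part of the original source): a simplified model of
// the merge loop above, assuming both masks are power-of-2 sized and reference
// a single input, so the multi-input bookkeeping can be dropped. The divisions
// and remainders written out here are exactly what the shifts and AND-masks
// above compute.
//
//   static std::vector<int> composeMasks(const std::vector<int> &RootMask,
//                                        const std::vector<int> &OpMask) {
//     unsigned RootSize = RootMask.size(), OpSize = OpMask.size();
//     unsigned Width = std::max(RootSize, OpSize);
//     unsigned RootRatio = Width / RootSize; // == 1u << RootRatioLog2
//     unsigned OpRatio = Width / OpSize;     // == 1u << OpRatioLog2
//     std::vector<int> Mask(Width);
//     for (unsigned i = 0; i != Width; ++i) {
//       int R = RootMask[i / RootRatio];
//       if (R < 0) { Mask[i] = R; continue; }           // undef/zero sentinel
//       unsigned Scaled = R * RootRatio + (i % RootRatio);
//       int O = OpMask[Scaled / OpRatio];
//       Mask[i] = O < 0 ? O : int(O * OpRatio + (Scaled % OpRatio));
//     }
//     return Mask;
//   }
//
// For example, RootMask = {1,0} (a v2i64 half swap) composed over
// OpMask = {2,3,0,1} (a v4i32 rotate by two) yields {0,1,2,3}: the two
// shuffles cancel out once expressed at the common 4-element width.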
40984
40985 // Peek through vector widenings and set out of bounds mask indices to undef.
40986 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40987 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40988 SDValue &Op = Ops[I];
40989 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40990 isNullConstant(Op.getOperand(2))) {
40991 Op = Op.getOperand(1);
40992 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40993 int Lo = I * Mask.size();
40994 int Hi = (I + 1) * Mask.size();
40995 int NewHi = Lo + (Mask.size() / Scale);
40996 for (int &M : Mask) {
40997 if (Lo <= M && NewHi <= M && M < Hi)
40998 M = SM_SentinelUndef;
40999 }
41000 }
41001 }
41002
41003 // Peek through any free extract_subvector nodes back to root size.
41004 for (SDValue &Op : Ops)
41005 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41006 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41007 isNullConstant(Op.getOperand(1)))
41008 Op = Op.getOperand(0);
41009
41010 // Remove unused/repeated shuffle source ops.
41011 resolveTargetShuffleInputsAndMask(Ops, Mask);
41012
41013 // Handle the all undef/zero/ones cases early.
41014 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41015 return DAG.getUNDEF(RootVT);
41016 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41017 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
41018 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41019 !llvm::is_contained(Mask, SM_SentinelZero))
41020 return getOnesVector(RootVT, DAG, SDLoc(Root));
41021
41022 assert(!Ops.empty() && "Shuffle with no inputs detected");
41023 HasVariableMask |= IsOpVariableMask;
41024
41025 // Update the list of shuffle nodes that have been combined so far.
41026 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
41027 SrcNodes.end());
41028 CombinedNodes.push_back(Op.getNode());
41029
41030 // See if we can recurse into each shuffle source op (if it's a target
41031 // shuffle). The source op should generally only be combined if it either has
41032 // a single use (i.e. current Op) or all its users have already been combined;
41033 // if not, we can still combine but should prevent generation of variable
41034 // shuffles to avoid constant pool bloat.
41035 // Don't recurse if we already have more source ops than we can combine in
41036 // the remaining recursion depth.
41037 if (Ops.size() < (MaxDepth - Depth)) {
41038 for (int i = 0, e = Ops.size(); i < e; ++i) {
41039 // For empty roots, we need to resolve zeroable elements before combining
41040 // them with other shuffles.
41041 SmallVector<int, 64> ResolvedMask = Mask;
41042 if (EmptyRoot)
41043 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41044 bool AllowCrossLaneVar = false;
41045 bool AllowPerLaneVar = false;
41046 if (Ops[i].getNode()->hasOneUse() ||
41047 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41048 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41049 AllowPerLaneVar = AllowVariablePerLaneMask;
41050 }
41051 if (SDValue Res = combineX86ShufflesRecursively(
41052 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41053 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41054 Subtarget))
41055 return Res;
41056 }
41057 }
41058
41059 // Attempt to constant fold all of the constant source ops.
41060 if (SDValue Cst = combineX86ShufflesConstants(
41061 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41062 return Cst;
41063
41064 // If constant folding failed and we only have constants, then we have
41065 // multiple uses by a single non-variable shuffle - just bail.
41066 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41067 APInt UndefElts;
41068 SmallVector<APInt> RawBits;
41069 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41070 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41071 RawBits);
41072 })) {
41073 return SDValue();
41074 }
41075
41076 // Canonicalize the combined shuffle mask chain with horizontal ops.
41077 // NOTE: This will update the Ops and Mask.
41078 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41079 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41080 return DAG.getBitcast(RootVT, HOp);
41081
41082 // Try to refine our inputs given our knowledge of target shuffle mask.
41083 for (auto I : enumerate(Ops)) {
41084 int OpIdx = I.index();
41085 SDValue &Op = I.value();
41086
41087 // What range of shuffle mask element values results in picking from Op?
41088 int Lo = OpIdx * Mask.size();
41089 int Hi = Lo + Mask.size();
41090
41091 // Which elements of Op do we demand, given the mask's granularity?
41092 APInt OpDemandedElts(Mask.size(), 0);
41093 for (int MaskElt : Mask) {
41094 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41095 int OpEltIdx = MaskElt - Lo;
41096 OpDemandedElts.setBit(OpEltIdx);
41097 }
41098 }
41099
41100 // Is the shuffle result smaller than the root?
41101 if (Op.getValueSizeInBits() < RootSizeInBits) {
41102 // We padded the mask with undefs. But we now need to undo that.
41103 unsigned NumExpectedVectorElts = Mask.size();
41104 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41105 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41106 assert(!OpDemandedElts.extractBits(
41107 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41108 "Demanding the virtual undef widening padding?");
41109 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41110 }
41111
41112 // The Op itself may be of different VT, so we need to scale the mask.
41113 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41114 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41115
41116 // Can this operand be simplified any further, given its demanded elements?
41117 if (SDValue NewOp =
41118 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41119 Op, OpScaledDemandedElts, DAG))
41120 Op = NewOp;
41121 }
41122 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41123
41124 // Widen any subvector shuffle inputs we've collected.
41125 // TODO: Remove this to avoid generating temporary nodes, we should only
41126 // widen once combineX86ShuffleChain has found a match.
41127 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41128 return Op.getValueSizeInBits() < RootSizeInBits;
41129 })) {
41130 for (SDValue &Op : Ops)
41131 if (Op.getValueSizeInBits() < RootSizeInBits)
41132 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41133 RootSizeInBits);
41134 // Reresolve - we might have repeated subvector sources.
41135 resolveTargetShuffleInputsAndMask(Ops, Mask);
41136 }
41137
41138 // We can only combine unary and binary shuffle mask cases.
41139 if (Ops.size() <= 2) {
41140 // Minor canonicalization of the accumulated shuffle mask to make it easier
41141 // to match below. All this does is detect masks with sequential pairs of
41142 // elements, and shrink them to the half-width mask. It does this in a loop
41143 // so it will reduce the size of the mask to the minimal width mask which
41144 // performs an equivalent shuffle.
41145 while (Mask.size() > 1) {
41146 SmallVector<int, 64> WidenedMask;
41147 if (!canWidenShuffleElements(Mask, WidenedMask))
41148 break;
41149 Mask = std::move(WidenedMask);
41150 }
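// Illustrative example (not from the original source): a v8i16 mask
// {2,3, 0,1, 6,7, 4,5} widens once to the v4i32 mask {1, 0, 3, 2} and then
// stops, because {1,0} is not a sequential pair; {1,0,3,2} is the
// minimal-width mask performing the equivalent shuffle.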
41151
41152 // Canonicalization of binary shuffle masks to improve pattern matching by
41153 // commuting the inputs.
41154 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41155 ShuffleVectorSDNode::commuteMask(Mask);
41156 std::swap(Ops[0], Ops[1]);
41157 }
41158
41159 // Try to combine into a single shuffle instruction.
41160 if (SDValue Shuffle = combineX86ShuffleChain(
41161 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41162 AllowVariablePerLaneMask, DAG, Subtarget))
41163 return Shuffle;
41164
41165 // If all the operands come from the same larger vector, fallthrough and try
41166 // to use combineX86ShuffleChainWithExtract.
41167 SDValue LHS = peekThroughBitcasts(Ops.front());
41168 SDValue RHS = peekThroughBitcasts(Ops.back());
41169 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41170 (RootSizeInBits / Mask.size()) != 64 ||
41171 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41172 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41173 LHS.getOperand(0) != RHS.getOperand(0))
41174 return SDValue();
41175 }
41176
41177 // If that failed and any input is extracted then try to combine as a
41178 // shuffle with the larger type.
41179 return combineX86ShuffleChainWithExtract(
41180 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41181 AllowVariablePerLaneMask, DAG, Subtarget);
41182}
41183
41184/// Helper entry wrapper to combineX86ShufflesRecursively.
41185static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41186 const X86Subtarget &Subtarget) {
41187 return combineX86ShufflesRecursively(
41188 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41189 /*HasVarMask*/ false,
41190 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41191 Subtarget);
41192}
41193
41194/// Get the PSHUF-style mask from PSHUF node.
41195///
41196 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41197/// PSHUF-style masks that can be reused with such instructions.
41198static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41199 MVT VT = N.getSimpleValueType();
41200 SmallVector<int, 4> Mask;
41201 SmallVector<SDValue, 2> Ops;
41202 bool HaveMask =
41203 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41204 (void)HaveMask;
41205 assert(HaveMask);
41206
41207 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
41208 // matter. Check that the upper masks are repeats and remove them.
41209 if (VT.getSizeInBits() > 128) {
41210 int LaneElts = 128 / VT.getScalarSizeInBits();
41211#ifndef NDEBUG
41212 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41213 for (int j = 0; j < LaneElts; ++j)
41214 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41215 "Mask doesn't repeat in high 128-bit lanes!");
41216#endif
41217 Mask.resize(LaneElts);
41218 }
41219
41220 switch (N.getOpcode()) {
41221 case X86ISD::PSHUFD:
41222 return Mask;
41223 case X86ISD::PSHUFLW:
41224 Mask.resize(4);
41225 return Mask;
41226 case X86ISD::PSHUFHW:
41227 Mask.erase(Mask.begin(), Mask.begin() + 4);
41228 for (int &M : Mask)
41229 M -= 4;
41230 return Mask;
41231 default:
41232 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41232)
;
41233 }
41234}
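// Illustrative example (not from the original source): for a v8i16
// X86ISD::PSHUFHW with immediate 0xB1 (fields {1,0,3,2}), the decoded target
// shuffle mask is {0,1,2,3, 5,4,7,6}; getPSHUFShuffleMask drops the identity
// low half and rebases the rest, returning the reusable 4-element mask
// {1,0,3,2}.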
41235
41236/// Search for a combinable shuffle across a chain ending in pshufd.
41237///
41238/// We walk up the chain and look for a combinable shuffle, skipping over
41239/// shuffles that we could hoist this shuffle's transformation past without
41240/// altering anything.
41241static SDValue
41242combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41243 SelectionDAG &DAG) {
41244 assert(N.getOpcode() == X86ISD::PSHUFD &&
41245 "Called with something other than an x86 128-bit half shuffle!");
41246 SDLoc DL(N);
41247
41248 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41249 // of the shuffles in the chain so that we can form a fresh chain to replace
41250 // this one.
41251 SmallVector<SDValue, 8> Chain;
41252 SDValue V = N.getOperand(0);
41253 for (; V.hasOneUse(); V = V.getOperand(0)) {
41254 switch (V.getOpcode()) {
41255 default:
41256 return SDValue(); // Nothing combined!
41257
41258 case ISD::BITCAST:
41259 // Skip bitcasts as we always know the type for the target specific
41260 // instructions.
41261 continue;
41262
41263 case X86ISD::PSHUFD:
41264 // Found another dword shuffle.
41265 break;
41266
41267 case X86ISD::PSHUFLW:
41268 // Check that the low words (being shuffled) are the identity in the
41269 // dword shuffle, and the high words are self-contained.
41270 if (Mask[0] != 0 || Mask[1] != 1 ||
41271 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41272 return SDValue();
41273
41274 Chain.push_back(V);
41275 continue;
41276
41277 case X86ISD::PSHUFHW:
41278 // Check that the high words (being shuffled) are the identity in the
41279 // dword shuffle, and the low words are self-contained.
41280 if (Mask[2] != 2 || Mask[3] != 3 ||
41281 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41282 return SDValue();
41283
41284 Chain.push_back(V);
41285 continue;
41286
41287 case X86ISD::UNPCKL:
41288 case X86ISD::UNPCKH:
41289 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41290 // shuffle into a preceding word shuffle.
41291 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41292 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41293 return SDValue();
41294
41295 // Search for a half-shuffle which we can combine with.
41296 unsigned CombineOp =
41297 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41298 if (V.getOperand(0) != V.getOperand(1) ||
41299 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41300 return SDValue();
41301 Chain.push_back(V);
41302 V = V.getOperand(0);
41303 do {
41304 switch (V.getOpcode()) {
41305 default:
41306 return SDValue(); // Nothing to combine.
41307
41308 case X86ISD::PSHUFLW:
41309 case X86ISD::PSHUFHW:
41310 if (V.getOpcode() == CombineOp)
41311 break;
41312
41313 Chain.push_back(V);
41314
41315 [[fallthrough]];
41316 case ISD::BITCAST:
41317 V = V.getOperand(0);
41318 continue;
41319 }
41320 break;
41321 } while (V.hasOneUse());
41322 break;
41323 }
41324 // Break out of the loop if we break out of the switch.
41325 break;
41326 }
41327
41328 if (!V.hasOneUse())
41329 // We fell out of the loop without finding a viable combining instruction.
41330 return SDValue();
41331
41332 // Merge this node's mask and our incoming mask.
41333 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41334 for (int &M : Mask)
41335 M = VMask[M];
41336 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41337 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41338
41339 // Rebuild the chain around this new shuffle.
41340 while (!Chain.empty()) {
41341 SDValue W = Chain.pop_back_val();
41342
41343 if (V.getValueType() != W.getOperand(0).getValueType())
41344 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41345
41346 switch (W.getOpcode()) {
41347 default:
41348 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41348)
;
41349
41350 case X86ISD::UNPCKL:
41351 case X86ISD::UNPCKH:
41352 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41353 break;
41354
41355 case X86ISD::PSHUFD:
41356 case X86ISD::PSHUFLW:
41357 case X86ISD::PSHUFHW:
41358 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41359 break;
41360 }
41361 }
41362 if (V.getValueType() != N.getValueType())
41363 V = DAG.getBitcast(N.getValueType(), V);
41364
41365 // Return the new chain to replace N.
41366 return V;
41367}
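// Illustrative example (not from the original source): the "merge this node's
// mask" step above is plain mask composition. If the found PSHUFD uses
// VMask = {2,3,0,1} and the incoming Mask is {1,0,3,2}, rewriting each element
// as Mask[i] = VMask[Mask[i]] gives {3,2,1,0}, and a single dword shuffle with
// that mask replaces the pair before the skipped chain is rebuilt on top of it.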
41368
41369// Attempt to commute shufps LHS loads:
41370// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41371static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41372 SelectionDAG &DAG) {
41373 // TODO: Add vXf64 support.
41374 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41375 return SDValue();
41376
41377 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41378 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41379 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41380 return SDValue();
41381 SDValue N0 = V.getOperand(0);
41382 SDValue N1 = V.getOperand(1);
41383 unsigned Imm = V.getConstantOperandVal(2);
41384 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41385 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41386 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41387 return SDValue();
41388 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41389 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41390 DAG.getTargetConstant(Imm, DL, MVT::i8));
41391 };
41392
41393 switch (N.getOpcode()) {
41394 case X86ISD::VPERMILPI:
41395 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41396 unsigned Imm = N.getConstantOperandVal(1);
41397 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41398 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41399 }
41400 break;
41401 case X86ISD::SHUFP: {
41402 SDValue N0 = N.getOperand(0);
41403 SDValue N1 = N.getOperand(1);
41404 unsigned Imm = N.getConstantOperandVal(2);
41405 if (N0 == N1) {
41406 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41407 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41408 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41409 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41410 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41411 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41412 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41413 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41414 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41415 }
41416 break;
41417 }
41418 }
41419
41420 return SDValue();
41421}
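// Illustrative sketch (not part of the original source; a standalone program,
// compiled separately) showing why the nibble swap plus the 0xAA / 0x0A / 0xA0
// immediate fix-ups above are sound: SHUFP(A,B,Imm) takes its low half from A
// and its high half from B, so SHUFP(B,A,nibble-swapped Imm) produces the same
// four elements with the two 64-bit halves exchanged, and the callers above
// compensate by flipping the high bit of the affected 2-bit selectors in the
// outer shuffle's immediate.
#include <array>
#include <cassert>
static std::array<float, 4> shufps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B,
                                   unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}
int main() {
  std::array<float, 4> A{0, 1, 2, 3}, B{4, 5, 6, 7};
  unsigned Imm = 0x1B; // arbitrary selector
  unsigned Swapped = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
  std::array<float, 4> R0 = shufps(A, B, Imm);
  std::array<float, 4> R1 = shufps(B, A, Swapped);
  // R1 equals R0 with its two 64-bit halves exchanged.
  assert(R1[0] == R0[2] && R1[1] == R0[3] && R1[2] == R0[0] && R1[3] == R0[1]);
  return 0;
}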
41422
41423// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41424static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41425 const SDLoc &DL) {
41426 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41427 EVT ShuffleVT = N.getValueType();
41428
41429 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41430 // AllZeros/AllOnes constants are freely shuffled and will peek through
41431 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41432 // merge with target shuffles if the op has one use so shuffle combining is
41433 // likely to kick in. Shuffles of splats are expected to be removed.
41434 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41435 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41436 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41437 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41438 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41439 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41440 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41441 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41442 };
41443 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41444 // Ensure we only shuffle whole vector src elements, unless it's a logical
41445 // binop, where we can more aggressively move shuffles from dst to src.
41446 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41447 BinOp == X86ISD::ANDNP ||
41448 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41449 };
41450
41451 unsigned Opc = N.getOpcode();
41452 switch (Opc) {
41453 // Unary and Unary+Permute Shuffles.
41454 case X86ISD::PSHUFB: {
41455 // Don't merge PSHUFB if it contains zero'd elements.
41456 SmallVector<int> Mask;
41457 SmallVector<SDValue> Ops;
41458 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41459 Mask))
41460 break;
41461 [[fallthrough]];
41462 }
41463 case X86ISD::VBROADCAST:
41464 case X86ISD::MOVDDUP:
41465 case X86ISD::PSHUFD:
41466 case X86ISD::PSHUFHW:
41467 case X86ISD::PSHUFLW:
41468 case X86ISD::VPERMI:
41469 case X86ISD::VPERMILPI: {
41470 if (N.getOperand(0).getValueType() == ShuffleVT &&
41471 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41472 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41473 unsigned SrcOpcode = N0.getOpcode();
41474 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41475 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41476 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41477 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41478 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41479 SDValue LHS, RHS;
41480 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41481 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41482 if (N.getNumOperands() == 2) {
41483 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41484 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41485 } else {
41486 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41487 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41488 }
41489 EVT OpVT = N0.getValueType();
41490 return DAG.getBitcast(ShuffleVT,
41491 DAG.getNode(SrcOpcode, DL, OpVT,
41492 DAG.getBitcast(OpVT, LHS),
41493 DAG.getBitcast(OpVT, RHS)));
41494 }
41495 }
41496 }
41497 break;
41498 }
41499 // Binary and Binary+Permute Shuffles.
41500 case X86ISD::INSERTPS: {
41501 // Don't merge INSERTPS if it contains zero'd elements.
41502 unsigned InsertPSMask = N.getConstantOperandVal(2);
41503 unsigned ZeroMask = InsertPSMask & 0xF;
41504 if (ZeroMask != 0)
41505 break;
41506 [[fallthrough]];
41507 }
41508 case X86ISD::MOVSD:
41509 case X86ISD::MOVSS:
41510 case X86ISD::BLENDI:
41511 case X86ISD::SHUFP:
41512 case X86ISD::UNPCKH:
41513 case X86ISD::UNPCKL: {
41514 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41515 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41516 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41517 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41518 unsigned SrcOpcode = N0.getOpcode();
41519 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41520 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41521 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41522 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41523 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41524 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41525 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41526 // Ensure the total number of shuffles doesn't increase by folding this
41527 // shuffle through to the source ops.
41528 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41529 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41530 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41531 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41532 SDValue LHS, RHS;
41533 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41534 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41535 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41536 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41537 if (N.getNumOperands() == 3) {
41538 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41539 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41540 } else {
41541 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41542 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41543 }
41544 EVT OpVT = N0.getValueType();
41545 return DAG.getBitcast(ShuffleVT,
41546 DAG.getNode(SrcOpcode, DL, OpVT,
41547 DAG.getBitcast(OpVT, LHS),
41548 DAG.getBitcast(OpVT, RHS)));
41549 }
41550 }
41551 }
41552 break;
41553 }
41554 }
41555 return SDValue();
41556}
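// Illustrative example (not from the original source): a typical payoff of
// this canonicalization, with C a build_vector constant, is
//   PSHUFD(AND(X, C), imm) -> AND(PSHUFD(X, imm), PSHUFD(C, imm))
// which costs nothing because the shuffle of the constant folds away, while
// the shuffle of X is exposed to the shuffle combines above.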
41557
41558/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41559static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41560 SelectionDAG &DAG,
41561 const SDLoc &DL) {
41562 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41563
41564 MVT VT = V.getSimpleValueType();
41565 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41566 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41567 unsigned SrcOpc0 = Src0.getOpcode();
41568 unsigned SrcOpc1 = Src1.getOpcode();
41569 EVT SrcVT0 = Src0.getValueType();
41570 EVT SrcVT1 = Src1.getValueType();
41571
41572 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41573 return SDValue();
41574
41575 switch (SrcOpc0) {
41576 case X86ISD::MOVDDUP: {
41577 SDValue LHS = Src0.getOperand(0);
41578 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41579 SDValue Res =
41580 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41581 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41582 return DAG.getBitcast(VT, Res);
41583 }
41584 case X86ISD::VPERMILPI:
41585 // TODO: Handle v4f64 permutes with different low/high lane masks.
41586 if (SrcVT0 == MVT::v4f64) {
41587 uint64_t Mask = Src0.getConstantOperandVal(1);
41588 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41589 break;
41590 }
41591 [[fallthrough]];
41592 case X86ISD::VSHLI:
41593 case X86ISD::VSRLI:
41594 case X86ISD::VSRAI:
41595 case X86ISD::PSHUFD:
41596 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41597 SDValue LHS = Src0.getOperand(0);
41598 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41599 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41600 V.getOperand(2));
41601 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41602 return DAG.getBitcast(VT, Res);
41603 }
41604 break;
41605 }
41606
41607 return SDValue();
41608}
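// Illustrative example (not from the original source): when both 128-bit
// sources apply the same per-lane shuffle,
//   VPERM2X128(VPERMILPI(x, m), VPERMILPI(y, m), sel)
//     -> VPERMILPI(VPERM2X128(x, y, sel), m)
// which leaves a single per-lane shuffle for later combines to fold further.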
41609
41610/// Try to combine x86 target specific shuffles.
41611static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41612 TargetLowering::DAGCombinerInfo &DCI,
41613 const X86Subtarget &Subtarget) {
41614 SDLoc DL(N);
41615 MVT VT = N.getSimpleValueType();
41616 SmallVector<int, 4> Mask;
41617 unsigned Opcode = N.getOpcode();
41618
41619 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41620 return R;
41621
41622 // Handle specific target shuffles.
41623 switch (Opcode) {
41624 case X86ISD::MOVDDUP: {
41625 SDValue Src = N.getOperand(0);
41626 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41627 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41628 ISD::isNormalLoad(Src.getNode())) {
41629 LoadSDNode *LN = cast<LoadSDNode>(Src);
41630 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41631 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41632 DCI.CombineTo(N.getNode(), Movddup);
41633 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41634 DCI.recursivelyDeleteUnusedNodes(LN);
41635 return N; // Return N so it doesn't get rechecked!
41636 }
41637 }
41638
41639 return SDValue();
41640 }
41641 case X86ISD::VBROADCAST: {
41642 SDValue Src = N.getOperand(0);
41643 SDValue BC = peekThroughBitcasts(Src);
41644 EVT SrcVT = Src.getValueType();
41645 EVT BCVT = BC.getValueType();
41646
41647 // If broadcasting from another shuffle, attempt to simplify it.
41648 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41649 if (isTargetShuffle(BC.getOpcode()) &&
41650 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41651 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41652 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41653 SM_SentinelUndef);
41654 for (unsigned i = 0; i != Scale; ++i)
41655 DemandedMask[i] = i;
41656 if (SDValue Res = combineX86ShufflesRecursively(
41657 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41658 X86::MaxShuffleCombineDepth,
41659 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41660 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41661 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41662 DAG.getBitcast(SrcVT, Res));
41663 }
41664
41665 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41666 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41667 if (Src.getOpcode() == ISD::BITCAST &&
41668 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41669 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41670 FixedVectorType::isValidElementType(
41671 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41672 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41673 VT.getVectorNumElements());
41674 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41675 }
41676
41677 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41678 // If we're re-broadcasting a smaller type then broadcast with that type and
41679 // bitcast.
41680 // TODO: Do this for any splat?
41681 if (Src.getOpcode() == ISD::BITCAST &&
41682 (BC.getOpcode() == X86ISD::VBROADCAST ||
41683 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41684 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41685 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41686 MVT NewVT =
41687 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41688 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41689 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41690 }
41691
41692 // Reduce broadcast source vector to lowest 128-bits.
41693 if (SrcVT.getSizeInBits() > 128)
41694 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41695 extract128BitVector(Src, 0, DAG, DL));
41696
41697 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41698 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41699 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41700
41701 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41702 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41703 isNullConstant(Src.getOperand(1)) &&
41704 DAG.getTargetLoweringInfo().isTypeLegal(
41705 Src.getOperand(0).getValueType()))
41706 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41707
41708 // Share broadcast with the longest vector and extract low subvector (free).
41709 // Ensure the same SDValue from the SDNode use is being used.
41710 for (SDNode *User : Src->uses())
41711 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41712 Src == User->getOperand(0) &&
41713 User->getValueSizeInBits(0).getFixedValue() >
41714 VT.getFixedSizeInBits()) {
41715 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41716 VT.getSizeInBits());
41717 }
41718
41719 // vbroadcast(scalarload X) -> vbroadcast_load X
41720 // For float loads, extract other uses of the scalar from the broadcast.
41721 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41722 ISD::isNormalLoad(Src.getNode())) {
41723 LoadSDNode *LN = cast<LoadSDNode>(Src);
41724 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41725 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41726 SDValue BcastLd =
41727 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41728 LN->getMemoryVT(), LN->getMemOperand());
41729 // If the load value is used only by N, replace it via CombineTo N.
41730 bool NoReplaceExtract = Src.hasOneUse();
41731 DCI.CombineTo(N.getNode(), BcastLd);
41732 if (NoReplaceExtract) {
41733 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41734 DCI.recursivelyDeleteUnusedNodes(LN);
41735 } else {
41736 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41737 DAG.getIntPtrConstant(0, DL));
41738 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41739 }
41740 return N; // Return N so it doesn't get rechecked!
41741 }
41742
41743 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41744 // i16. So shrink it ourselves if we can make a broadcast_load.
41745 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41746 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41747 assert(Subtarget.hasAVX2() && "Expected AVX2");
41748 SDValue TruncIn = Src.getOperand(0);
41749
41750 // If this is a truncate of a non extending load we can just narrow it to
41751 // use a broadcast_load.
41752 if (ISD::isNormalLoad(TruncIn.getNode())) {
41753 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41754 // Unless its volatile or atomic.
41755 if (LN->isSimple()) {
41756 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41757 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41758 SDValue BcastLd = DAG.getMemIntrinsicNode(
41759 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41760 LN->getPointerInfo(), LN->getOriginalAlign(),
41761 LN->getMemOperand()->getFlags());
41762 DCI.CombineTo(N.getNode(), BcastLd);
41763 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41764 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41765 return N; // Return N so it doesn't get rechecked!
41766 }
41767 }
41768
41769 // If this is a truncate of an i16 extload, we can directly replace it.
41770 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41771 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41772 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41773 if (LN->getMemoryVT().getSizeInBits() == 16) {
41774 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41775 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41776 SDValue BcastLd =
41777 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41778 LN->getMemoryVT(), LN->getMemOperand());
41779 DCI.CombineTo(N.getNode(), BcastLd);
41780 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41781 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41782 return N; // Return N so it doesn't get rechecked!
41783 }
41784 }
41785
41786 // If this is a truncate of a load that has been shifted right, we can
41787 // offset the pointer and use a narrower load.
41788 if (TruncIn.getOpcode() == ISD::SRL &&
41789 TruncIn.getOperand(0).hasOneUse() &&
41790 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41791 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41792 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41793 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41794 // Make sure the shift amount and the load size are divisible by 16.
41795 // Don't do this if the load is volatile or atomic.
41796 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41797 LN->isSimple()) {
41798 unsigned Offset = ShiftAmt / 8;
41799 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41800 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41801 TypeSize::Fixed(Offset), DL);
41802 SDValue Ops[] = { LN->getChain(), Ptr };
41803 SDValue BcastLd = DAG.getMemIntrinsicNode(
41804 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41805 LN->getPointerInfo().getWithOffset(Offset),
41806 LN->getOriginalAlign(),
41807 LN->getMemOperand()->getFlags());
41808 DCI.CombineTo(N.getNode(), BcastLd);
41809 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41810 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41811 return N; // Return N so it doesn't get rechecked!
41812 }
41813 }
41814 }
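// Illustrative example (not from the original source) for the shifted-load
// case above: on little-endian x86, vbroadcast (i16 (trunc (srl (i32 load %p),
// 16))) reads bits [31:16] of the i32 at %p, i.e. the i16 stored at byte
// offset ShiftAmt / 8 = 2, so it becomes an i16 VBROADCAST_LOAD from %p + 2.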
41815
41816 // vbroadcast(vzload X) -> vbroadcast_load X
41817 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41818 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41819 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41820 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41821 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41822 SDValue BcastLd =
41823 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41824 LN->getMemoryVT(), LN->getMemOperand());
41825 DCI.CombineTo(N.getNode(), BcastLd);
41826 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41827 DCI.recursivelyDeleteUnusedNodes(LN);
41828 return N; // Return N so it doesn't get rechecked!
41829 }
41830 }
41831
41832 // vbroadcast(vector load X) -> vbroadcast_load
41833 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41834 SrcVT == MVT::v4i32) &&
41835 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41836 LoadSDNode *LN = cast<LoadSDNode>(Src);
41837 // Unless the load is volatile or atomic.
41838 if (LN->isSimple()) {
41839 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41840 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41841 SDValue BcastLd = DAG.getMemIntrinsicNode(
41842 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41843 LN->getPointerInfo(), LN->getOriginalAlign(),
41844 LN->getMemOperand()->getFlags());
41845 DCI.CombineTo(N.getNode(), BcastLd);
41846 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41847 DCI.recursivelyDeleteUnusedNodes(LN);
41848 return N; // Return N so it doesn't get rechecked!
41849 }
41850 }
41851
41852 return SDValue();
41853 }
41854 case X86ISD::VZEXT_MOVL: {
41855 SDValue N0 = N.getOperand(0);
41856
41857 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41858 // the load is volatile.
41859 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41860 auto *LN = cast<LoadSDNode>(N0);
41861 if (SDValue VZLoad =
41862 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41863 DCI.CombineTo(N.getNode(), VZLoad);
41864 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41865 DCI.recursivelyDeleteUnusedNodes(LN);
41866 return N;
41867 }
41868 }
41869
41870 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41871 // and can just use a VZEXT_LOAD.
41872 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41873 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41874 auto *LN = cast<MemSDNode>(N0);
41875 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41876 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41877 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41878 SDValue VZLoad =
41879 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41880 LN->getMemoryVT(), LN->getMemOperand());
41881 DCI.CombineTo(N.getNode(), VZLoad);
41882 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41883 DCI.recursivelyDeleteUnusedNodes(LN);
41884 return N;
41885 }
41886 }
41887
41888 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41889 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41890 // if the upper bits of the i64 are zero.
41891 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41892 N0.getOperand(0).hasOneUse() &&
41893 N0.getOperand(0).getValueType() == MVT::i64) {
41894 SDValue In = N0.getOperand(0);
41895 APInt Mask = APInt::getHighBitsSet(64, 32);
41896 if (DAG.MaskedValueIsZero(In, Mask)) {
41897 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41898 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41899 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41900 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41901 return DAG.getBitcast(VT, Movl);
41902 }
41903 }
41904
41905 // Load a scalar integer constant directly to XMM instead of transferring an
41906 // immediate value from GPR.
41907 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41908 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41909 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41910 // Create a vector constant - scalar constant followed by zeros.
41911 EVT ScalarVT = N0.getOperand(0).getValueType();
41912 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41913 unsigned NumElts = VT.getVectorNumElements();
41914 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41915 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41916 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41917
41918 // Load the vector constant from constant pool.
41919 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41920 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41921 MachinePointerInfo MPI =
41922 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41923 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41924 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41925 MachineMemOperand::MOLoad);
41926 }
41927 }
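// Illustrative example (not from the original source): for v4i32,
// vzext_movl (scalar_to_vector (i32 42)) becomes a 16-byte load of the
// constant-pool vector {42, 0, 0, 0}, avoiding a GPR-to-XMM transfer of the
// immediate.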
41928
41929 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41930 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41931 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41932 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41933 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41934 SDValue V = peekThroughOneUseBitcasts(N0);
41935
41936 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41937 isNullConstant(V.getOperand(2))) {
41938 SDValue In = V.getOperand(1);
41939 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41940 In.getValueSizeInBits() /
41941 VT.getScalarSizeInBits());
41942 In = DAG.getBitcast(SubVT, In);
41943 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41944 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41945 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41946 V.getOperand(2));
41947 }
41948 }
41949
41950 return SDValue();
41951 }
41952 case X86ISD::BLENDI: {
41953 SDValue N0 = N.getOperand(0);
41954 SDValue N1 = N.getOperand(1);
41955
41956 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41957 // TODO: Handle MVT::v16i16 repeated blend mask.
41958 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41959 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41960 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41961 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41962 SrcVT.getScalarSizeInBits() >= 32) {
41963 unsigned BlendMask = N.getConstantOperandVal(2);
41964 unsigned Size = VT.getVectorNumElements();
41965 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41966 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41967 return DAG.getBitcast(
41968 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41969 N1.getOperand(0),
41970 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41971 }
41972 }
41973 return SDValue();
41974 }
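// Illustrative example (not from the original source): a v4f64 BLENDI with
// mask 0b0101 whose operands are bitcasts of v8f32 values becomes a v8f32
// BLENDI with each mask bit duplicated per 32-bit half, i.e. mask 0b00110011,
// followed by a bitcast back to v4f64.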
41975 case X86ISD::SHUFP: {
41976 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41977 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41978 // TODO: Support types other than v4f32.
41979 if (VT == MVT::v4f32) {
41980 bool Updated = false;
41981 SmallVector<int> Mask;
41982 SmallVector<SDValue> Ops;
41983 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41984 Ops.size() == 2) {
41985 for (int i = 0; i != 2; ++i) {
41986 SmallVector<SDValue> SubOps;
41987 SmallVector<int> SubMask, SubScaledMask;
41988 SDValue Sub = peekThroughBitcasts(Ops[i]);
41989 // TODO: Scaling might be easier if we specify the demanded elts.
41990 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41991 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41992 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41993 int Ofs = i * 2;
41994 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41995 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41996 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41997 Updated = true;
41998 }
41999 }
42000 }
42001 if (Updated) {
42002 for (int &M : Mask)
42003 M %= 4;
42004 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42005 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42006 }
42007 }
42008 return SDValue();
42009 }
42010 case X86ISD::VPERMI: {
42011 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42012 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42013 SDValue N0 = N.getOperand(0);
42014 SDValue N1 = N.getOperand(1);
42015 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42016 if (N0.getOpcode() == ISD::BITCAST &&
42017 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42018 SDValue Src = N0.getOperand(0);
42019 EVT SrcVT = Src.getValueType();
42020 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42021 return DAG.getBitcast(VT, Res);
42022 }
42023 return SDValue();
42024 }
42025 case X86ISD::VPERM2X128: {
42026 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42027 SDValue LHS = N->getOperand(0);
42028 SDValue RHS = N->getOperand(1);
42029 if (LHS.getOpcode() == ISD::BITCAST &&
42030 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42031 EVT SrcVT = LHS.getOperand(0).getValueType();
42032 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42033 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42034 DAG.getBitcast(SrcVT, LHS),
42035 DAG.getBitcast(SrcVT, RHS),
42036 N->getOperand(2)));
42037 }
42038 }
42039
42040 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42041 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42042 return Res;
42043
42044 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42045 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42046 auto FindSubVector128 = [&](unsigned Idx) {
42047 if (Idx > 3)
42048 return SDValue();
42049 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42050 SmallVector<SDValue> SubOps;
42051 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42052 return SubOps[Idx & 1];
42053 unsigned NumElts = Src.getValueType().getVectorNumElements();
42054 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42055 Src.getOperand(1).getValueSizeInBits() == 128 &&
42056 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42057 return Src.getOperand(1);
42058 }
42059 return SDValue();
42060 };
42061 unsigned Imm = N.getConstantOperandVal(2);
42062 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42063 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42064 MVT SubVT = VT.getHalfNumVectorElementsVT();
42065 SubLo = DAG.getBitcast(SubVT, SubLo);
42066 SubHi = DAG.getBitcast(SubVT, SubHi);
42067 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42068 }
42069 }
42070 return SDValue();
42071 }
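For readers tracing the FindSubVector128 lambda above: each nibble of the vperm2x128 immediate selects one 128-bit half of the two inputs (values 0-1 pick a half of the first operand, 2-3 a half of the second), and bit 3 of a nibble zeroes that output half, which is why indices above 3 make the fold bail out. A minimal standalone model of that selection, with plain ints standing in for 128-bit halves (Vec256, selectVPerm2X128Half and main are illustrative names, not LLVM or intrinsic APIs):

#include <cassert>
#include <cstdint>

// Model each 256-bit input as two tagged 128-bit halves so the selection
// logic is visible. Bits [1:0] of a nibble pick one of {A.Lo, A.Hi, B.Lo,
// B.Hi}; bit 3 forces that output half to zero.
struct Vec256 {
  int Lo, Hi; // stand-ins for the two 128-bit halves
};

static int selectVPerm2X128Half(uint8_t Field, const Vec256 &A, const Vec256 &B) {
  if (Field & 0x8)
    return 0; // zeroed output half
  switch (Field & 0x3) {
  case 0: return A.Lo;
  case 1: return A.Hi;
  case 2: return B.Lo;
  default: return B.Hi;
  }
}

int main() {
  Vec256 A = {1, 2}, B = {3, 4};
  uint8_t Imm = 0x31; // low result = A's high half, high result = B's high half
  assert(selectVPerm2X128Half(Imm & 0x0F, A, B) == 2);
  assert(selectVPerm2X128Half((Imm & 0xF0) >> 4, A, B) == 4);
  return 0;
}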
42072 case X86ISD::PSHUFD:
42073 case X86ISD::PSHUFLW:
42074 case X86ISD::PSHUFHW:
42075 Mask = getPSHUFShuffleMask(N);
42076 assert(Mask.size() == 4);
42077 break;
42078 case X86ISD::MOVSD:
42079 case X86ISD::MOVSH:
42080 case X86ISD::MOVSS: {
42081 SDValue N0 = N.getOperand(0);
42082 SDValue N1 = N.getOperand(1);
42083
42084 // Canonicalize scalar FPOps:
42085 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42086 // If commutable, allow OP(N1[0], N0[0]).
42087 unsigned Opcode1 = N1.getOpcode();
42088 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42089 Opcode1 == ISD::FDIV) {
42090 SDValue N10 = N1.getOperand(0);
42091 SDValue N11 = N1.getOperand(1);
42092 if (N10 == N0 ||
42093 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42094 if (N10 != N0)
42095 std::swap(N10, N11);
42096 MVT SVT = VT.getVectorElementType();
42097 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42098 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42099 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42100 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42101 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42102 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42103 }
42104 }
42105
42106 return SDValue();
42107 }
42108 case X86ISD::INSERTPS: {
42109 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42110 SDValue Op0 = N.getOperand(0);
42111 SDValue Op1 = N.getOperand(1);
42112 unsigned InsertPSMask = N.getConstantOperandVal(2);
42113 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42114 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42115 unsigned ZeroMask = InsertPSMask & 0xF;
42116
42117 // If we zero out all elements from Op0 then we don't need to reference it.
42118 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42119 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42120 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42121
42122 // If we zero out the element from Op1 then we don't need to reference it.
42123 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42124 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42125 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42126
42127 // Attempt to merge insertps Op1 with an inner target shuffle node.
42128 SmallVector<int, 8> TargetMask1;
42129 SmallVector<SDValue, 2> Ops1;
42130 APInt KnownUndef1, KnownZero1;
42131 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42132 KnownZero1)) {
42133 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42134 // Zero/UNDEF insertion - zero out element and remove dependency.
42135 InsertPSMask |= (1u << DstIdx);
42136 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42137 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42138 }
42139 // Update insertps mask srcidx and reference the source input directly.
42140 int M = TargetMask1[SrcIdx];
42141 assert(0 <= M && M < 8 && "Shuffle index out of range");
42142 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42143 Op1 = Ops1[M < 4 ? 0 : 1];
42144 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42145 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42146 }
42147
42148 // Attempt to merge insertps Op0 with an inner target shuffle node.
42149 SmallVector<int, 8> TargetMask0;
42150 SmallVector<SDValue, 2> Ops0;
42151 APInt KnownUndef0, KnownZero0;
42152 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42153 KnownZero0)) {
42154 bool Updated = false;
42155 bool UseInput00 = false;
42156 bool UseInput01 = false;
42157 for (int i = 0; i != 4; ++i) {
42158 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42159 // No change if element is already zero or the inserted element.
42160 continue;
42161 }
42162
42163 if (KnownUndef0[i] || KnownZero0[i]) {
42164 // If the target mask is undef/zero then we must zero the element.
42165 InsertPSMask |= (1u << i);
42166 Updated = true;
42167 continue;
42168 }
42169
42170 // The input vector element must be inline.
42171 int M = TargetMask0[i];
42172 if (M != i && M != (i + 4))
42173 return SDValue();
42174
42175 // Determine which inputs of the target shuffle we're using.
42176 UseInput00 |= (0 <= M && M < 4);
42177 UseInput01 |= (4 <= M);
42178 }
42179
42180 // If we're not using both inputs of the target shuffle then use the
42181 // referenced input directly.
42182 if (UseInput00 && !UseInput01) {
42183 Updated = true;
42184 Op0 = Ops0[0];
42185 } else if (!UseInput00 && UseInput01) {
42186 Updated = true;
42187 Op0 = Ops0[1];
42188 }
42189
42190 if (Updated)
42191 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42192 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42193 }
42194
42195 // If we're inserting an element from a vbroadcast load, fold the
42196 // load into the X86insertps instruction. We need to convert the scalar
42197 // load to a vector and clear the source lane of the INSERTPS control.
42198 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42199 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42200 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42201 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42202 MemIntr->getBasePtr(),
42203 MemIntr->getMemOperand());
42204 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42205 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42206 Load),
42207 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42208 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42209 return Insert;
42210 }
42211 }
42212
42213 return SDValue();
42214 }
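The INSERTPS case above repeatedly decodes and re-packs the 8-bit control byte: bits [7:6] give the source lane, bits [5:4] the destination lane, and bits [3:0] the destination lanes forced to zero, exactly as the SrcIdx/DstIdx/ZeroMask extraction shows. A small standalone encode/decode sketch of just that byte (InsertPSImm and the two helpers are illustrative names, not LLVM APIs):

#include <cassert>
#include <cstdint>

// [7:6] = source element index, [5:4] = destination element index,
// [3:0] = mask of destination lanes zeroed after the insertion.
struct InsertPSImm {
  unsigned SrcIdx, DstIdx, ZeroMask;
};

static InsertPSImm decodeInsertPS(uint8_t Imm) {
  return {(unsigned)((Imm >> 6) & 0x3), (unsigned)((Imm >> 4) & 0x3),
          (unsigned)(Imm & 0xF)};
}

static uint8_t encodeInsertPS(const InsertPSImm &I) {
  return (uint8_t)((I.SrcIdx << 6) | (I.DstIdx << 4) | I.ZeroMask);
}

int main() {
  InsertPSImm I = decodeInsertPS(0x9C); // SrcIdx = 2, DstIdx = 1, ZeroMask = 0xC
  assert(I.SrcIdx == 2 && I.DstIdx == 1 && I.ZeroMask == 0xC);
  assert(encodeInsertPS(I) == 0x9C);    // round-trips
  return 0;
}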
42215 default:
42216 return SDValue();
42217 }
42218
42219 // Nuke no-op shuffles that show up after combining.
42220 if (isNoopShuffleMask(Mask))
42221 return N.getOperand(0);
42222
42223 // Look for simplifications involving one or two shuffle instructions.
42224 SDValue V = N.getOperand(0);
42225 switch (N.getOpcode()) {
42226 default:
42227 break;
42228 case X86ISD::PSHUFLW:
42229 case X86ISD::PSHUFHW:
42230 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42231
42232 // See if this reduces to a PSHUFD which is no more expensive and can
42233 // combine with more operations. Note that it has to at least flip the
42234 // dwords as otherwise it would have been removed as a no-op.
42235 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42236 int DMask[] = {0, 1, 2, 3};
42237 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42238 DMask[DOffset + 0] = DOffset + 1;
42239 DMask[DOffset + 1] = DOffset + 0;
42240 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42241 V = DAG.getBitcast(DVT, V);
42242 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42243 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42244 return DAG.getBitcast(VT, V);
42245 }
42246
42247 // Look for shuffle patterns which can be implemented as a single unpack.
42248 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42249 // only works when we have a PSHUFD followed by two half-shuffles.
42250 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42251 (V.getOpcode() == X86ISD::PSHUFLW ||
42252 V.getOpcode() == X86ISD::PSHUFHW) &&
42253 V.getOpcode() != N.getOpcode() &&
42254 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42255 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42256 if (D.getOpcode() == X86ISD::PSHUFD) {
42257 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42258 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42259 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42260 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42261 int WordMask[8];
42262 for (int i = 0; i < 4; ++i) {
42263 WordMask[i + NOffset] = Mask[i] + NOffset;
42264 WordMask[i + VOffset] = VMask[i] + VOffset;
42265 }
42266 // Map the word mask through the DWord mask.
42267 int MappedMask[8];
42268 for (int i = 0; i < 8; ++i)
42269 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42270 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42271 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42272 // We can replace all three shuffles with an unpack.
42273 V = DAG.getBitcast(VT, D.getOperand(0));
42274 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42275 : X86ISD::UNPCKH,
42276 DL, VT, V, V);
42277 }
42278 }
42279 }
42280
42281 break;
42282
42283 case X86ISD::PSHUFD:
42284 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42285 return NewN;
42286
42287 break;
42288 }
42289
42290 return SDValue();
42291}
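The PSHUFLW/PSHUFHW fold near the end of this function composes an 8-entry word mask with the 4-entry dword mask of an inner PSHUFD via MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; when the composition lands on {0,0,1,1,2,2,3,3} or {4,4,5,5,6,6,7,7}, the whole chain is a single unpack of the PSHUFD input with itself. A standalone sketch of just that composition step (mapWordsThroughDWords and the example masks are illustrative, not taken from an LLVM test case):

#include <array>
#include <cassert>

// Word w of the dword shuffle's output is word 2*DMask[w/2] + (w%2) of its
// input, so composing the outer word mask with DMask yields word indices
// relative to the PSHUFD input.
static std::array<int, 8> mapWordsThroughDWords(const std::array<int, 8> &WordMask,
                                                const std::array<int, 4> &DMask) {
  std::array<int, 8> Mapped;
  for (int i = 0; i != 8; ++i)
    Mapped[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
  return Mapped;
}

int main() {
  // PSHUFLW {0,0,1,1} and PSHUFHW {0,0,1,1} over a PSHUFD {0,2,1,3} compose
  // to {0,0,1,1,2,2,3,3}, i.e. UNPCKLWD of the PSHUFD input with itself.
  std::array<int, 8> WordMask = {0, 0, 1, 1, 4, 4, 5, 5};
  std::array<int, 4> DMask = {0, 2, 1, 3};
  std::array<int, 8> Expected = {0, 0, 1, 1, 2, 2, 3, 3};
  assert(mapWordsThroughDWords(WordMask, DMask) == Expected);
  return 0;
}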
42292
42293/// Checks if the shuffle mask takes subsequent elements
42294/// alternately from two vectors.
42295/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42296static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42297
42298 int ParitySrc[2] = {-1, -1};
42299 unsigned Size = Mask.size();
42300 for (unsigned i = 0; i != Size; ++i) {
42301 int M = Mask[i];
42302 if (M < 0)
42303 continue;
42304
42305 // Make sure we are using the matching element from the input.
42306 if ((M % Size) != i)
42307 return false;
42308
42309 // Make sure we use the same input for all elements of the same parity.
42310 int Src = M / Size;
42311 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42312 return false;
42313 ParitySrc[i % 2] = Src;
42314 }
42315
42316 // Make sure each input is used.
42317 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42318 return false;
42319
42320 Op0Even = ParitySrc[0] == 0;
42321 return true;
42322}
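As a compact standalone restatement of the rule just defined, checked against the two masks named in the comment: <0, 5, 2, 7> takes even lanes from the first input and odd lanes from the second, so Op0Even is true, while <8, 1, 10, 3, 12, 5, 14, 7> swaps the parities and Op0Even is false. The helper below mirrors the logic above outside of LLVM's types; isAddSubLikeMask and main are illustrative names.

#include <array>
#include <cassert>
#include <vector>

// Each defined mask element must select the matching lane of one input
// (M % Size == i), and all even lanes must come from one input, all odd
// lanes from the other.
static bool isAddSubLikeMask(const std::vector<int> &Mask, bool &Op0Even) {
  std::array<int, 2> ParitySrc = {-1, -1};
  int Size = (int)Mask.size();
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef lane: no constraint
    if ((M % Size) != i)
      return false;             // must use the matching element
    int Src = M / Size;         // 0 = first input, 1 = second input
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even = false;
  std::vector<int> M4 = {0, 5, 2, 7};
  std::vector<int> M8 = {8, 1, 10, 3, 12, 5, 14, 7};
  assert(isAddSubLikeMask(M4, Op0Even) && Op0Even);
  assert(isAddSubLikeMask(M8, Op0Even) && !Op0Even);
  return 0;
}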
42323
42324/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
42325/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
42326/// are written to the parameters \p Opnd0 and \p Opnd1.
42327///
42328/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42329/// so it is easier to generically match. We also insert dummy vector shuffle
42330/// nodes for the operands which explicitly discard the lanes which are unused
42331/// by this operation to try to flow through the rest of the combiner
42332/// the fact that they're unused.
42333static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42334 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42335 bool &IsSubAdd) {
42336
42337 EVT VT = N->getValueType(0);
42338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42339 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42340 !VT.getSimpleVT().isFloatingPoint())
42341 return false;
42342
42343 // We only handle target-independent shuffles.
42344 // FIXME: It would be easy and harmless to use the target shuffle mask
42345 // extraction tool to support more.
42346 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42347 return false;
42348
42349 SDValue V1 = N->getOperand(0);
42350 SDValue V2 = N->getOperand(1);
42351
42352 // Make sure we have an FADD and an FSUB.
42353 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42354 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42355 V1.getOpcode() == V2.getOpcode())
42356 return false;
42357
42358 // If there are other uses of these operations we can't fold them.
42359 if (!V1->hasOneUse() || !V2->hasOneUse())
42360 return false;
42361
42362 // Ensure that both operations have the same operands. Note that we can
42363 // commute the FADD operands.
42364 SDValue LHS, RHS;
42365 if (V1.getOpcode() == ISD::FSUB) {
42366 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42367 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42368 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42369 return false;
42370 } else {
42371 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42372 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42373 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42374 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42375 return false;
42376 }
42377
42378 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42379 bool Op0Even;
42380 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42381 return false;
42382
42383 // It's a subadd if the vector in the even parity is an FADD.
42384 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42385 : V2->getOpcode() == ISD::FADD;
42386
42387 Opnd0 = LHS;
42388 Opnd1 = RHS;
42389 return true;
42390}
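For reference on what the recognizer above feeds: ADDSUB subtracts in the even lanes and adds in the odd lanes (the ADDSUBPS/PD behaviour), and SUBADD is the reverse, which is why a shuffle whose even parity comes from the FADD marks the SUBADD form. A scalar reference sketch under that reading (addsub and main are illustrative names, not LLVM code):

#include <cassert>
#include <cstddef>
#include <vector>

// ADDSUB: a[i] - b[i] in even lanes, a[i] + b[i] in odd lanes.
// SUBADD: the two parities are flipped.
static std::vector<double> addsub(const std::vector<double> &A,
                                  const std::vector<double> &B, bool IsSubAdd) {
  std::vector<double> R(A.size());
  for (size_t i = 0; i != A.size(); ++i) {
    bool DoAdd = IsSubAdd ? (i % 2 == 0) : (i % 2 != 0);
    R[i] = DoAdd ? A[i] + B[i] : A[i] - B[i];
  }
  return R;
}

int main() {
  std::vector<double> A = {10, 10, 10, 10}, B = {1, 2, 3, 4};
  std::vector<double> AS = addsub(A, B, /*IsSubAdd=*/false);
  assert(AS[0] == 9 && AS[1] == 12 && AS[2] == 7 && AS[3] == 14);
  return 0;
}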
42391
42392/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42393static SDValue combineShuffleToFMAddSub(SDNode *N,
42394 const X86Subtarget &Subtarget,
42395 SelectionDAG &DAG) {
42396 // We only handle target-independent shuffles.
42397 // FIXME: It would be easy and harmless to use the target shuffle mask
42398 // extraction tool to support more.
42399 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42400 return SDValue();
42401
42402 MVT VT = N->getSimpleValueType(0);
42403 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42404 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42405 return SDValue();
42406
42407 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42408 SDValue Op0 = N->getOperand(0);
42409 SDValue Op1 = N->getOperand(1);
42410 SDValue FMAdd = Op0, FMSub = Op1;
42411 if (FMSub.getOpcode() != X86ISD::FMSUB)
42412 std::swap(FMAdd, FMSub);
42413
42414 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42415 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42416 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42417 FMAdd.getOperand(2) != FMSub.getOperand(2))
42418 return SDValue();
42419
42420 // Check for correct shuffle mask.
42421 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42422 bool Op0Even;
42423 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42424 return SDValue();
42425
42426 // FMAddSub takes zeroth operand from FMSub node.
42427 SDLoc DL(N);
42428 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42429 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42430 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42431 FMAdd.getOperand(2));
42432}
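The fused forms built here follow the same parity convention with a multiply folded in: FMADDSUB computes a*b - c in the even lanes and a*b + c in the odd lanes, and FMSUBADD swaps the two, which is what the IsSubAdd selection above distinguishes. A scalar reference sketch (fmaddsub and main are illustrative names, not an intrinsic API):

#include <cassert>
#include <cstddef>
#include <vector>

// FMADDSUB: a*b - c in even lanes, a*b + c in odd lanes; FMSUBADD flips them.
static std::vector<double> fmaddsub(const std::vector<double> &A,
                                    const std::vector<double> &B,
                                    const std::vector<double> &C,
                                    bool IsSubAdd) {
  std::vector<double> R(A.size());
  for (size_t i = 0; i != A.size(); ++i) {
    bool AddLane = IsSubAdd ? (i % 2 == 0) : (i % 2 != 0);
    R[i] = AddLane ? A[i] * B[i] + C[i] : A[i] * B[i] - C[i];
  }
  return R;
}

int main() {
  std::vector<double> A = {2, 2, 2, 2}, B = {3, 3, 3, 3}, C = {1, 1, 1, 1};
  std::vector<double> R = fmaddsub(A, B, C, /*IsSubAdd=*/false); // FMADDSUB
  assert(R[0] == 5 && R[1] == 7 && R[2] == 5 && R[3] == 7);
  return 0;
}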
42433
42434/// Try to combine a shuffle into a target-specific add-sub or
42435/// mul-add-sub node.
42436static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42437 const X86Subtarget &Subtarget,
42438 SelectionDAG &DAG) {
42439 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42440 return V;
42441
42442 SDValue Opnd0, Opnd1;
42443 bool IsSubAdd;
42444 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42445 return SDValue();
42446
42447 MVT VT = N->getSimpleValueType(0);
42448 SDLoc DL(N);
42449
42450 // Try to generate X86ISD::FMADDSUB node here.
42451 SDValue Opnd2;
42452 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42453 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42454 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42455 }
42456
42457 if (IsSubAdd)
42458 return SDValue();
42459
42460 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42461 // the ADDSUB idiom has been successfully recognized. There are no known
42462 // X86 targets with 512-bit ADDSUB instructions!
42463 if (VT.is512BitVector())
42464 return SDValue();
42465
42466 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42467 // the ADDSUB idiom has been successfully recognized. There are no known
42468 // X86 targets with FP16 ADDSUB instructions!
42469 if (VT.getVectorElementType() == MVT::f16)
42470 return SDValue();
42471
42472 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42473}
42474
42475// We are looking for a shuffle where both sources are concatenated with undef
42476// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42477// if we can express this as a single-source shuffle, that's preferable.
42478static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42479 const X86Subtarget &Subtarget) {
42480 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42481 return SDValue();
42482
42483 EVT VT = N->getValueType(0);
42484
42485 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42486 if (!VT.is128BitVector() && !VT.is256BitVector())
42487 return SDValue();
42488
42489 if (VT.getVectorElementType() != MVT::i32 &&
42490 VT.getVectorElementType() != MVT::i64 &&
42491 VT.getVectorElementType() != MVT::f32 &&
42492 VT.getVectorElementType() != MVT::f64)
42493 return SDValue();
42494
42495 SDValue N0 = N->getOperand(0);
42496 SDValue N1 = N->getOperand(1);
42497
42498 // Check that both sources are concats with undef.
42499 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42500 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42501 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42502 !N1.getOperand(1).isUndef())
42503 return SDValue();
42504
42505 // Construct the new shuffle mask. Elements from the first source retain their
42506 // index, but elements from the second source no longer need to skip an undef.
42507 SmallVector<int, 8> Mask;
42508 int NumElts = VT.getVectorNumElements();
42509
42510 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42511 for (int Elt : SVOp->getMask())
42512 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42513
42514 SDLoc DL(N);
42515 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42516 N1.getOperand(0));
42517 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42518}
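The remap in the loop above only has to account for the dropped undef half of the second source: indices below NumElts already point into the new concat, and indices into the second source move down by NumElts / 2 because t2 now sits immediately after t1. A small worked example for an 8-element shuffle, with plain ints standing in for the operands (remapConcatUndefMask and main are illustrative names):

#include <cassert>
#include <vector>

// Old operands: concat(t1, undef) and concat(t2, undef), each NumElts wide,
// so t2's lanes live at indices [NumElts, NumElts + NumElts/2) of the old
// mask space. New operand: concat(t1, t2), so those lanes shift down by
// NumElts/2. Undef (-1) and first-source lanes are left untouched.
static std::vector<int> remapConcatUndefMask(const std::vector<int> &Mask,
                                             int NumElts) {
  std::vector<int> New;
  for (int Elt : Mask)
    New.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  return New;
}

int main() {
  // Interleave t1 and t2: the old mask only references lanes 0..3 and 8..11.
  std::vector<int> Old = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<int> Expected = {0, 4, 1, 5, 2, 6, 3, 7};
  assert(remapConcatUndefMask(Old, /*NumElts=*/8) == Expected);
  return 0;
}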
42519
42520/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42521/// low half of each source vector and does not set any high half elements in
42522/// the destination vector, narrow the shuffle to half its original size.
42523static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42524 if (!Shuf->getValueType(0).isSimple())
42525 return SDValue();
42526 MVT VT = Shuf->getSimpleValueType(0);
42527 if (!VT.is256BitVector() && !VT.is512BitVector())
42528 return SDValue();
42529
42530 // See if we can ignore all of the high elements of the shuffle.
42531 ArrayRef<int> Mask = Shuf->getMask();
42532 if (!isUndefUpperHalf(Mask))
42533 return SDValue();
42534
42535 // Check if the shuffle mask accesses only the low half of each input vector
42536 // (half-index output is 0 or 2).
42537 int HalfIdx1, HalfIdx2;
42538 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42539 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42540 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42541 return SDValue();
42542
42543 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42544 // The trick is knowing that all of the insert/extract are actually free
42545 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42546 // of narrow inputs into a narrow output, and that is always cheaper than
42547 // the wide shuffle that we started with.
42548 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42549 Shuf->getOperand(1), HalfMask, HalfIdx1,
42550 HalfIdx2, false, DAG, /*UseConcat*/true);
42551}
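Stripped of the DAG plumbing, narrowShuffle needs two properties of the mask: the upper half of the result must be entirely undef, and every defined element in the lower half must read from the low half of whichever source it uses. The sketch below checks exactly that on a plain mask; canNarrowToHalf is an illustrative name and a simplification, since getHalfShuffleMask above also produces the narrowed mask and the half indices the rebuild needs.

#include <cassert>
#include <vector>

// True if a shuffle with this mask only writes the low half of the result
// and only reads the low half of each (equal-sized) source.
static bool canNarrowToHalf(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size(), Half = NumElts / 2;
  for (int i = Half; i != NumElts; ++i)
    if (Mask[i] >= 0)
      return false;               // a high-half result lane is defined
  for (int i = 0; i != Half; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                   // undef lane
    if ((M % NumElts) >= Half)
      return false;               // reads a high-half source lane
  }
  return true;
}

int main() {
  std::vector<int> Good = {0, 9, 2, 11, -1, -1, -1, -1};
  std::vector<int> Bad = {4, 9, 2, 11, -1, -1, -1, -1}; // lane 4 is a high lane
  assert(canNarrowToHalf(Good));
  assert(!canNarrowToHalf(Bad));
  return 0;
}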
42552
42553static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42554 TargetLowering::DAGCombinerInfo &DCI,
42555 const X86Subtarget &Subtarget) {
42556 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42557 if (SDValue V = narrowShuffle(Shuf, DAG))
42558 return V;
42559
42560 // If we have legalized the vector types, look for blends of FADD and FSUB
42561 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42562 SDLoc dl(N);
42563 EVT VT = N->getValueType(0);
42564 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42565 if (TLI.isTypeLegal(VT))
42566 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42567 return AddSub;
42568
42569 // Attempt to combine into a vector load/broadcast.
42570 if (SDValue LD = combineToConsecutiveLoads(
42571 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42572 return LD;
42573
42574 // For AVX2, we sometimes want to combine
42575 // (vector_shuffle <mask> (concat_vectors t1, undef)
42576 // (concat_vectors t2, undef))
42577 // Into:
42578 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42579 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42580 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42581 return ShufConcat;
42582
42583 if (isTargetShuffle(N->getOpcode())) {
42584 SDValue Op(N, 0);
42585 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42586 return Shuffle;
42587
42588 // Try recursively combining arbitrary sequences of x86 shuffle
42589 // instructions into higher-order shuffles. We do this after combining
42590 // specific PSHUF instruction sequences into their minimal form so that we
42591 // can evaluate how many specialized shuffle instructions are involved in
42592 // a particular chain.
42593 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42594 return Res;
42595
42596 // Simplify source operands based on shuffle mask.
42597 // TODO - merge this into combineX86ShufflesRecursively.
42598 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42599 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42600 return SDValue(N, 0);
42601
42602 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42603 // Perform this after other shuffle combines to allow inner shuffles to be
42604 // combined away first.
42605 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42606 return BinOp;
42607 }
42608
42609 return SDValue();
42610}
42611
42612// Simplify variable target shuffle masks based on the demanded elements.
42613// TODO: Handle DemandedBits in mask indices as well?
42614bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42615 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42616 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42617 // If we're demanding all elements don't bother trying to simplify the mask.
42618 unsigned NumElts = DemandedElts.getBitWidth();
42619 if (DemandedElts.isAllOnes())
42620 return false;
42621
42622 SDValue Mask = Op.getOperand(MaskIndex);
42623 if (!Mask.hasOneUse())
42624 return false;
42625
42626 // Attempt to generically simplify the variable shuffle mask.
42627 APInt MaskUndef, MaskZero;
42628 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42629 Depth + 1))
42630 return true;
42631
42632 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42633 // TODO: Support other types from getTargetShuffleMaskIndices?
42634 SDValue BC = peekThroughOneUseBitcasts(Mask);
42635 EVT BCVT = BC.getValueType();
42636 auto *Load = dyn_cast<LoadSDNode>(BC);
42637 if (!Load)
42638 return false;
42639
42640 const Constant *C = getTargetConstantFromNode(Load);
42641 if (!C)
42642 return false;
42643
42644 Type *CTy = C->getType();
42645 if (!CTy->isVectorTy() ||
42646 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42647 return false;
42648
42649 // Handle scaling for i64 elements on 32-bit targets.
42650 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42651 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42652 return false;
42653 unsigned Scale = NumCstElts / NumElts;
42654
42655 // Simplify mask if we have an undemanded element that is not undef.
42656 bool Simplified = false;
42657 SmallVector<Constant *, 32> ConstVecOps;
42658 for (unsigned i = 0; i != NumCstElts; ++i) {
42659 Constant *Elt = C->getAggregateElement(i);
42660 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42661 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42662 Simplified = true;
42663 continue;
42664 }
42665 ConstVecOps.push_back(Elt);
42666 }
42667 if (!Simplified)
42668 return false;
42669
42670 // Generate new constant pool entry + legalize immediately for the load.
42671 SDLoc DL(Op);
42672 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42673 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42674 SDValue NewMask = TLO.DAG.getLoad(
42675 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42676 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42677 Load->getAlign());
42678 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42679}
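The constant-pool rewrite above touches only mask elements whose owning output lane is never demanded, turning them into undef so later combines have fewer constraints; the Scale factor covers the case where the constant vector has twice as many (narrower) elements as the shuffle. A small standalone model of that loop, using std::optional in place of Constant*/UndefValue (undefUndemandedMaskElts and main are illustrative names):

#include <cassert>
#include <optional>
#include <vector>

// A constant mask vector with Scale * NumElts entries: entries whose owning
// output lane (i / Scale) is not demanded are rewritten to "undef" (nullopt).
// Returns true if anything actually changed.
static bool undefUndemandedMaskElts(std::vector<std::optional<int>> &Cst,
                                    const std::vector<bool> &DemandedElts) {
  unsigned Scale = (unsigned)(Cst.size() / DemandedElts.size());
  bool Simplified = false;
  for (unsigned i = 0; i != Cst.size(); ++i) {
    if (!DemandedElts[i / Scale] && Cst[i].has_value()) {
      Cst[i].reset();
      Simplified = true;
    }
  }
  return Simplified;
}

int main() {
  // 4 output lanes, 8 constant elements (Scale = 2); lanes 1 and 3 undemanded.
  std::vector<std::optional<int>> Cst = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<bool> Demanded = {true, false, true, false};
  assert(undefUndemandedMaskElts(Cst, Demanded));
  assert(!Cst[2].has_value() && !Cst[3].has_value() && Cst[4].has_value());
  return 0;
}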
42680
42681bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42682 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42683 TargetLoweringOpt &TLO, unsigned Depth) const {
42684 int NumElts = DemandedElts.getBitWidth();
42685 unsigned Opc = Op.getOpcode();
42686 EVT VT = Op.getValueType();
42687
42688 // Handle special case opcodes.
42689 switch (Opc) {
42690 case X86ISD::PMULDQ:
42691 case X86ISD::PMULUDQ: {
42692 APInt LHSUndef, LHSZero;
42693 APInt RHSUndef, RHSZero;
42694 SDValue LHS = Op.getOperand(0);
42695 SDValue RHS = Op.getOperand(1);
42696 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42697 Depth + 1))
42698 return true;
42699 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42700 Depth + 1))
42701 return true;
42702 // Multiply by zero.
42703 KnownZero = LHSZero | RHSZero;
42704 break;
42705 }
42706 case X86ISD::VPMADDWD: {
42707 APInt LHSUndef, LHSZero;
42708 APInt RHSUndef, RHSZero;
42709 SDValue LHS = Op.getOperand(0);
42710 SDValue RHS = Op.getOperand(1);
42711 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42712
42713 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42714 Depth + 1))
42715 return true;
42716 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42717 Depth + 1))
42718 return true;
42719
42720 // TODO: Multiply by zero.
42721
42722 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42723 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42724 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42725 Depth + 1))
42726 return true;
42727 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42728 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42729 Depth + 1))
42730 return true;
42731 break;
42732 }
42733 case X86ISD::PSADBW: {
42734 SDValue LHS = Op.getOperand(0);
42735 SDValue RHS = Op.getOperand(1);
42736 assert(VT.getScalarType() == MVT::i64 &&(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
42737 LHS.getValueType() == RHS.getValueType() &&(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
42738 LHS.getValueType().getScalarType() == MVT::i8 &&(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
42739 "Unexpected PSADBW types")(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
;
42740
42741 // Aggressively peek through ops to get at the demanded elts.
42742 if (!DemandedElts.isAllOnes()) {
42743 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42744 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42745 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42746 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42747 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42748 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42749 if (NewLHS || NewRHS) {
42750 NewLHS = NewLHS ? NewLHS : LHS;
42751 NewRHS = NewRHS ? NewRHS : RHS;
42752 return TLO.CombineTo(
42753 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42754 }
42755 }
42756 break;
42757 }
42758 case X86ISD::VSHL:
42759 case X86ISD::VSRL:
42760 case X86ISD::VSRA: {
42761 // We only need the bottom 64-bits of the (128-bit) shift amount.
42762 SDValue Amt = Op.getOperand(1);
42763 MVT AmtVT = Amt.getSimpleValueType();
42764 assert(AmtVT.is128BitVector() && "Unexpected value type");
42765
42766 // If we reuse the shift amount just for sse shift amounts then we know that
42767 // only the bottom 64-bits are ever used.
42768 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42769 unsigned UseOpc = Use->getOpcode();
42770 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42771 UseOpc == X86ISD::VSRA) &&
42772 Use->getOperand(0) != Amt;
42773 });
42774
42775 APInt AmtUndef, AmtZero;
42776 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42777 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42778 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42779 Depth + 1, AssumeSingleUse))
42780 return true;
42781 [[fallthrough]];
42782 }
42783 case X86ISD::VSHLI:
42784 case X86ISD::VSRLI:
42785 case X86ISD::VSRAI: {
42786 SDValue Src = Op.getOperand(0);
42787 APInt SrcUndef;
42788 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42789 Depth + 1))
42790 return true;
42791
42792 // Fold shift(0,x) -> 0
42793 if (DemandedElts.isSubsetOf(KnownZero))
42794 return TLO.CombineTo(
42795 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42796
42797 // Aggressively peek through ops to get at the demanded elts.
42798 if (!DemandedElts.isAllOnes())
42799 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42800 Src, DemandedElts, TLO.DAG, Depth + 1))
42801 return TLO.CombineTo(
42802 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42803 break;
42804 }
42805 case X86ISD::VPSHA:
42806 case X86ISD::VPSHL:
42807 case X86ISD::VSHLV:
42808 case X86ISD::VSRLV:
42809 case X86ISD::VSRAV: {
42810 APInt LHSUndef, LHSZero;
42811 APInt RHSUndef, RHSZero;
42812 SDValue LHS = Op.getOperand(0);
42813 SDValue RHS = Op.getOperand(1);
42814 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42815 Depth + 1))
42816 return true;
42817
42818 // Fold shift(0,x) -> 0
42819 if (DemandedElts.isSubsetOf(LHSZero))
42820 return TLO.CombineTo(
42821 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42822
42823 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42824 Depth + 1))
42825 return true;
42826
42827 KnownZero = LHSZero;
42828 break;
42829 }
42830 case X86ISD::KSHIFTL: {
42831 SDValue Src = Op.getOperand(0);
42832 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42833 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42834 unsigned ShiftAmt = Amt->getZExtValue();
42835
42836 if (ShiftAmt == 0)
42837 return TLO.CombineTo(Op, Src);
42838
42839 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42840 // single shift. We can do this if the bottom bits (which are shifted
42841 // out) are never demanded.
42842 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42843 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42844 unsigned C1 = Src.getConstantOperandVal(1);
42845 unsigned NewOpc = X86ISD::KSHIFTL;
42846 int Diff = ShiftAmt - C1;
42847 if (Diff < 0) {
42848 Diff = -Diff;
42849 NewOpc = X86ISD::KSHIFTR;
42850 }
42851
42852 SDLoc dl(Op);
42853 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42854 return TLO.CombineTo(
42855 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42856 }
42857 }
42858
42859 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42860 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42861 Depth + 1))
42862 return true;
42863
42864 KnownUndef <<= ShiftAmt;
42865 KnownZero <<= ShiftAmt;
42866 KnownZero.setLowBits(ShiftAmt);
42867 break;
42868 }
42869 case X86ISD::KSHIFTR: {
42870 SDValue Src = Op.getOperand(0);
42871 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42872 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42873 unsigned ShiftAmt = Amt->getZExtValue();
42874
42875 if (ShiftAmt == 0)
42876 return TLO.CombineTo(Op, Src);
42877
42878 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42879 // single shift. We can do this if the top bits (which are shifted
42880 // out) are never demanded.
42881 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42882 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42883 unsigned C1 = Src.getConstantOperandVal(1);
42884 unsigned NewOpc = X86ISD::KSHIFTR;
42885 int Diff = ShiftAmt - C1;
42886 if (Diff < 0) {
42887 Diff = -Diff;
42888 NewOpc = X86ISD::KSHIFTL;
42889 }
42890
42891 SDLoc dl(Op);
42892 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42893 return TLO.CombineTo(
42894 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42895 }
42896 }
42897
42898 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42899 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42900 Depth + 1))
42901 return true;
42902
42903 KnownUndef.lshrInPlace(ShiftAmt);
42904 KnownZero.lshrInPlace(ShiftAmt);
42905 KnownZero.setHighBits(ShiftAmt);
42906 break;
42907 }
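Both KSHIFT cases above merge a left and a right shift of a mask register with the same arithmetic: shift once by the absolute difference of the two amounts, in the direction of whichever amount was larger. This is only valid because the demanded-elements checks guarantee the discarded bits are never used. A tiny sketch of just that merge on plain values (MergedShift and mergeMaskShifts are illustrative names):

#include <cassert>
#include <cstdlib>

// Merge ((X >> C1) << ShAmt) into one shift by |ShAmt - C1|: left if
// ShAmt > C1, right otherwise (the KSHIFTL case; KSHIFTR is symmetric).
struct MergedShift {
  bool IsLeft;
  unsigned Amount;
};

static MergedShift mergeMaskShifts(unsigned ShAmt, unsigned C1) {
  int Diff = (int)ShAmt - (int)C1;
  return {Diff >= 0, (unsigned)std::abs(Diff)};
}

int main() {
  MergedShift M = mergeMaskShifts(/*ShAmt=*/3, /*C1=*/1);
  assert(M.IsLeft && M.Amount == 2);  // (X >> 1) << 3  ==>  X << 2
  M = mergeMaskShifts(/*ShAmt=*/1, /*C1=*/3);
  assert(!M.IsLeft && M.Amount == 2); // (X >> 3) << 1  ==>  X >> 2
  return 0;
}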
42908 case X86ISD::ANDNP: {
42909 // ANDNP = (~LHS & RHS);
42910 SDValue LHS = Op.getOperand(0);
42911 SDValue RHS = Op.getOperand(1);
42912
42913 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42914 APInt UndefElts;
42915 SmallVector<APInt> EltBits;
42916 int NumElts = VT.getVectorNumElements();
42917 int EltSizeInBits = VT.getScalarSizeInBits();
42918 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42919 APInt OpElts = DemandedElts;
42920 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42921 EltBits)) {
42922 OpBits.clearAllBits();
42923 OpElts.clearAllBits();
42924 for (int I = 0; I != NumElts; ++I) {
42925 if (!DemandedElts[I])
42926 continue;
42927 if (UndefElts[I]) {
42928 // We can't assume an undef src element gives an undef dst - the
42929 // other src might be zero.
42930 OpBits.setAllBits();
42931 OpElts.setBit(I);
42932 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42933 (!Invert && !EltBits[I].isZero())) {
42934 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42935 OpElts.setBit(I);
42936 }
42937 }
42938 }
42939 return std::make_pair(OpBits, OpElts);
42940 };
42941 APInt BitsLHS, EltsLHS;
42942 APInt BitsRHS, EltsRHS;
42943 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42944 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42945
42946 APInt LHSUndef, LHSZero;
42947 APInt RHSUndef, RHSZero;
42948 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42949 Depth + 1))
42950 return true;
42951 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42952 Depth + 1))
42953 return true;
42954
42955 if (!DemandedElts.isAllOnes()) {
42956 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42957 TLO.DAG, Depth + 1);
42958 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42959 TLO.DAG, Depth + 1);
42960 if (NewLHS || NewRHS) {
42961 NewLHS = NewLHS ? NewLHS : LHS;
42962 NewRHS = NewRHS ? NewRHS : RHS;
42963 return TLO.CombineTo(
42964 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42965 }
42966 }
42967 break;
42968 }
42969 case X86ISD::CVTSI2P:
42970 case X86ISD::CVTUI2P: {
42971 SDValue Src = Op.getOperand(0);
42972 MVT SrcVT = Src.getSimpleValueType();
42973 APInt SrcUndef, SrcZero;
42974 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42975 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42976 Depth + 1))
42977 return true;
42978 break;
42979 }
42980 case X86ISD::PACKSS:
42981 case X86ISD::PACKUS: {
42982 SDValue N0 = Op.getOperand(0);
42983 SDValue N1 = Op.getOperand(1);
42984
42985 APInt DemandedLHS, DemandedRHS;
42986 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42987
42988 APInt LHSUndef, LHSZero;
42989 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42990 Depth + 1))
42991 return true;
42992 APInt RHSUndef, RHSZero;
42993 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42994 Depth + 1))
42995 return true;
42996
42997 // TODO - pass on known zero/undef.
42998
42999 // Aggressively peek through ops to get at the demanded elts.
43000 // TODO - we should do this for all target/faux shuffles ops.
43001 if (!DemandedElts.isAllOnes()) {
43002 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43003 TLO.DAG, Depth + 1);
43004 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43005 TLO.DAG, Depth + 1);
43006 if (NewN0 || NewN1) {
43007 NewN0 = NewN0 ? NewN0 : N0;
43008 NewN1 = NewN1 ? NewN1 : N1;
43009 return TLO.CombineTo(Op,
43010 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43011 }
43012 }
43013 break;
43014 }
43015 case X86ISD::HADD:
43016 case X86ISD::HSUB:
43017 case X86ISD::FHADD:
43018 case X86ISD::FHSUB: {
43019 SDValue N0 = Op.getOperand(0);
43020 SDValue N1 = Op.getOperand(1);
43021
43022 APInt DemandedLHS, DemandedRHS;
43023 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43024
43025 APInt LHSUndef, LHSZero;
43026 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43027 Depth + 1))
43028 return true;
43029 APInt RHSUndef, RHSZero;
43030 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43031 Depth + 1))
43032 return true;
43033
43034 // TODO - pass on known zero/undef.
43035
43036 // Aggressively peek through ops to get at the demanded elts.
43037 // TODO: Handle repeated operands.
43038 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43039 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43040 TLO.DAG, Depth + 1);
43041 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43042 TLO.DAG, Depth + 1);
43043 if (NewN0 || NewN1) {
43044 NewN0 = NewN0 ? NewN0 : N0;
43045 NewN1 = NewN1 ? NewN1 : N1;
43046 return TLO.CombineTo(Op,
43047 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43048 }
43049 }
43050 break;
43051 }
43052 case X86ISD::VTRUNC:
43053 case X86ISD::VTRUNCS:
43054 case X86ISD::VTRUNCUS: {
43055 SDValue Src = Op.getOperand(0);
43056 MVT SrcVT = Src.getSimpleValueType();
43057 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43058 APInt SrcUndef, SrcZero;
43059 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43060 Depth + 1))
43061 return true;
43062 KnownZero = SrcZero.zextOrTrunc(NumElts);
43063 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43064 break;
43065 }
43066 case X86ISD::BLENDV: {
43067 APInt SelUndef, SelZero;
43068 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43069 SelZero, TLO, Depth + 1))
43070 return true;
43071
43072 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43073 APInt LHSUndef, LHSZero;
43074 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43075 LHSZero, TLO, Depth + 1))
43076 return true;
43077
43078 APInt RHSUndef, RHSZero;
43079 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43080 RHSZero, TLO, Depth + 1))
43081 return true;
43082
43083 KnownZero = LHSZero & RHSZero;
43084 KnownUndef = LHSUndef & RHSUndef;
43085 break;
43086 }
43087 case X86ISD::VZEXT_MOVL: {
43088 // If upper demanded elements are already zero then we have nothing to do.
43089 SDValue Src = Op.getOperand(0);
43090 APInt DemandedUpperElts = DemandedElts;
43091 DemandedUpperElts.clearLowBits(1);
43092 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43093 return TLO.CombineTo(Op, Src);
43094 break;
43095 }
43096 case X86ISD::VBROADCAST: {
43097 SDValue Src = Op.getOperand(0);
43098 MVT SrcVT = Src.getSimpleValueType();
43099 if (!SrcVT.isVector())
43100 break;
43101 // Don't bother broadcasting if we just need the 0'th element.
43102 if (DemandedElts == 1) {
43103 if (Src.getValueType() != VT)
43104 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43105 SDLoc(Op));
43106 return TLO.CombineTo(Op, Src);
43107 }
43108 APInt SrcUndef, SrcZero;
43109 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43110 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43111 Depth + 1))
43112 return true;
43113 // Aggressively peek through src to get at the demanded elt.
43114 // TODO - we should do this for all target/faux shuffles ops.
43115 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43116 Src, SrcElts, TLO.DAG, Depth + 1))
43117 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43118 break;
43119 }
43120 case X86ISD::VPERMV:
43121 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43122 Depth))
43123 return true;
43124 break;
43125 case X86ISD::PSHUFB:
43126 case X86ISD::VPERMV3:
43127 case X86ISD::VPERMILPV:
43128 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43129 Depth))
43130 return true;
43131 break;
43132 case X86ISD::VPPERM:
43133 case X86ISD::VPERMIL2:
43134 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43135 Depth))
43136 return true;
43137 break;
43138 }
43139
43140 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43141 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43142 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43143 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43144 DemandedElts.lshr(NumElts / 2) == 0) {
43145 unsigned SizeInBits = VT.getSizeInBits();
43146 unsigned ExtSizeInBits = SizeInBits / 2;
43147
43148 // See if 512-bit ops only use the bottom 128-bits.
43149 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43150 ExtSizeInBits = SizeInBits / 4;
43151
43152 switch (Opc) {
43153 // Scalar broadcast.
43154 case X86ISD::VBROADCAST: {
43155 SDLoc DL(Op);
43156 SDValue Src = Op.getOperand(0);
43157 if (Src.getValueSizeInBits() > ExtSizeInBits)
43158 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43159 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43160 ExtSizeInBits / VT.getScalarSizeInBits());
43161 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43162 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43163 TLO.DAG, DL, ExtSizeInBits));
43164 }
43165 case X86ISD::VBROADCAST_LOAD: {
43166 SDLoc DL(Op);
43167 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43168 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43169 ExtSizeInBits / VT.getScalarSizeInBits());
43170 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43171 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43172 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43173 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43174 MemIntr->getMemOperand());
43175 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43176 Bcst.getValue(1));
43177 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43178 TLO.DAG, DL, ExtSizeInBits));
43179 }
43180 // Subvector broadcast.
43181 case X86ISD::SUBV_BROADCAST_LOAD: {
43182 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43183 EVT MemVT = MemIntr->getMemoryVT();
43184 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43185 SDLoc DL(Op);
43186 SDValue Ld =
43187 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43188 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43189 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43190 Ld.getValue(1));
43191 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43192 TLO.DAG, DL, ExtSizeInBits));
43193 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43194 SDLoc DL(Op);
43195 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43196 ExtSizeInBits / VT.getScalarSizeInBits());
43197 if (SDValue BcstLd =
43198 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43199 return TLO.CombineTo(Op,
43200 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43201 TLO.DAG, DL, ExtSizeInBits));
43202 }
43203 break;
43204 }
43205 // Byte shifts by immediate.
43206 case X86ISD::VSHLDQ:
43207 case X86ISD::VSRLDQ:
43208 // Shift by uniform.
43209 case X86ISD::VSHL:
43210 case X86ISD::VSRL:
43211 case X86ISD::VSRA:
43212 // Shift by immediate.
43213 case X86ISD::VSHLI:
43214 case X86ISD::VSRLI:
43215 case X86ISD::VSRAI: {
43216 SDLoc DL(Op);
43217 SDValue Ext0 =
43218 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43219 SDValue ExtOp =
43220 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43221 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43222 SDValue Insert =
43223 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43224 return TLO.CombineTo(Op, Insert);
43225 }
43226 case X86ISD::VPERMI: {
43227 // Simplify PERMPD/PERMQ to extract_subvector.
43228 // TODO: This should be done in shuffle combining.
43229 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43230 SmallVector<int, 4> Mask;
43231 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43232 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43233 SDLoc DL(Op);
43234 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43235 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43236 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43237 return TLO.CombineTo(Op, Insert);
43238 }
43239 }
43240 break;
43241 }
43242 case X86ISD::VPERM2X128: {
43243 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43244 SDLoc DL(Op);
43245 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43246 if (LoMask & 0x8)
43247 return TLO.CombineTo(
43248 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43249 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43250 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43251 SDValue ExtOp =
43252 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43253 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43254 SDValue Insert =
43255 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43256 return TLO.CombineTo(Op, Insert);
43257 }
43258 // Zero upper elements.
43259 case X86ISD::VZEXT_MOVL:
43260 // Target unary shuffles by immediate:
43261 case X86ISD::PSHUFD:
43262 case X86ISD::PSHUFLW:
43263 case X86ISD::PSHUFHW:
43264 case X86ISD::VPERMILPI:
43265 // (Non-Lane Crossing) Target Shuffles.
43266 case X86ISD::VPERMILPV:
43267 case X86ISD::VPERMIL2:
43268 case X86ISD::PSHUFB:
43269 case X86ISD::UNPCKL:
43270 case X86ISD::UNPCKH:
43271 case X86ISD::BLENDI:
43272 // Integer ops.
43273 case X86ISD::PACKSS:
43274 case X86ISD::PACKUS:
43275 // Horizontal Ops.
43276 case X86ISD::HADD:
43277 case X86ISD::HSUB:
43278 case X86ISD::FHADD:
43279 case X86ISD::FHSUB: {
43280 SDLoc DL(Op);
43281 SmallVector<SDValue, 4> Ops;
43282 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43283 SDValue SrcOp = Op.getOperand(i);
43284 EVT SrcVT = SrcOp.getValueType();
43285 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43286        "Unsupported vector size");
43287 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43288 ExtSizeInBits)
43289 : SrcOp);
43290 }
43291 MVT ExtVT = VT.getSimpleVT();
43292 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43293 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43294 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43295 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43296 SDValue Insert =
43297 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43298 return TLO.CombineTo(Op, Insert);
43299 }
43300 }
43301 }
43302
43303 // For splats, unless we *only* demand the 0'th element,
43304 // stop attempts at simplification here; we aren't going to improve things,
43305 // and this is better than any potential shuffle.
43306 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43307 return false;
43308
43309 // Get target/faux shuffle mask.
43310 APInt OpUndef, OpZero;
43311 SmallVector<int, 64> OpMask;
43312 SmallVector<SDValue, 2> OpInputs;
43313 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43314 OpZero, TLO.DAG, Depth, false))
43315 return false;
43316
43317 // Shuffle inputs must be the same size as the result.
43318 if (OpMask.size() != (unsigned)NumElts ||
43319 llvm::any_of(OpInputs, [VT](SDValue V) {
43320 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43321 !V.getValueType().isVector();
43322 }))
43323 return false;
43324
43325 KnownZero = OpZero;
43326 KnownUndef = OpUndef;
43327
43328 // Check if shuffle mask can be simplified to undef/zero/identity.
43329 int NumSrcs = OpInputs.size();
43330 for (int i = 0; i != NumElts; ++i)
43331 if (!DemandedElts[i])
43332 OpMask[i] = SM_SentinelUndef;
43333
43334 if (isUndefInRange(OpMask, 0, NumElts)) {
43335 KnownUndef.setAllBits();
43336 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43337 }
43338 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43339 KnownZero.setAllBits();
43340 return TLO.CombineTo(
43341 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43342 }
43343 for (int Src = 0; Src != NumSrcs; ++Src)
43344 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43345 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43346
43347 // Attempt to simplify inputs.
43348 for (int Src = 0; Src != NumSrcs; ++Src) {
43349 // TODO: Support inputs of different types.
43350 if (OpInputs[Src].getValueType() != VT)
43351 continue;
43352
43353 int Lo = Src * NumElts;
43354 APInt SrcElts = APInt::getZero(NumElts);
43355 for (int i = 0; i != NumElts; ++i)
43356 if (DemandedElts[i]) {
43357 int M = OpMask[i] - Lo;
43358 if (0 <= M && M < NumElts)
43359 SrcElts.setBit(M);
43360 }
43361
43362 // TODO - Propagate input undef/zero elts.
43363 APInt SrcUndef, SrcZero;
43364 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43365 TLO, Depth + 1))
43366 return true;
43367 }
43368
43369 // If we don't demand all elements, then attempt to combine to a simpler
43370 // shuffle.
43371 // We need to convert the depth to something combineX86ShufflesRecursively
43372 // can handle - so pretend its Depth == 0 again, and reduce the max depth
43373 // to match. This prevents combineX86ShuffleChain from returning a
43374 // combined shuffle that's the same as the original root, causing an
43375 // infinite loop.
43376 if (!DemandedElts.isAllOnes()) {
43377 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43378
43379 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43380 for (int i = 0; i != NumElts; ++i)
43381 if (DemandedElts[i])
43382 DemandedMask[i] = i;
43383
43384 SDValue NewShuffle = combineX86ShufflesRecursively(
43385 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43386 /*HasVarMask*/ false,
43387 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43388 Subtarget);
43389 if (NewShuffle)
43390 return TLO.CombineTo(Op, NewShuffle);
43391 }
43392
43393 return false;
43394}
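After masking out undemanded lanes, the tail of the routine above classifies the remaining shuffle mask as all-undef, all undef-or-zero, or an identity of one of the inputs before attempting anything more expensive. A standalone classifier sketch, assuming -1 stands for SM_SentinelUndef and -2 for SM_SentinelZero (classifyMask, MaskKind and main are illustrative names):

#include <cassert>
#include <vector>

enum MaskKind { AllUndef, AllUndefOrZero, IdentityOfSrc0, IdentityOfSrc1, Other };

// -1 models an undef lane, -2 a lane known to be zero. An identity of source
// Src requires every defined, non-zero element i to equal i + Src * NumElts.
static MaskKind classifyMask(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  bool AllU = true, AllUZ = true, Ident0 = true, Ident1 = true;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue;                   // undef: compatible with every category
    AllU = false;
    if (M == -2) {
      Ident0 = Ident1 = false;    // zero: rules out an identity
      continue;
    }
    AllUZ = false;
    Ident0 &= (M == i);
    Ident1 &= (M == i + NumElts);
  }
  if (AllU)   return AllUndef;
  if (AllUZ)  return AllUndefOrZero;
  if (Ident0) return IdentityOfSrc0;
  if (Ident1) return IdentityOfSrc1;
  return Other;
}

int main() {
  std::vector<int> U = {-1, -1, -1, -1};
  std::vector<int> Z = {-2, -1, -2, -2};
  std::vector<int> Id0 = {0, -1, 2, 3};
  std::vector<int> Id1 = {4, 5, -1, 7};
  assert(classifyMask(U) == AllUndef);
  assert(classifyMask(Z) == AllUndefOrZero);
  assert(classifyMask(Id0) == IdentityOfSrc0);
  assert(classifyMask(Id1) == IdentityOfSrc1);
  return 0;
}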
43395
43396bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43397 SDValue Op, const APInt &OriginalDemandedBits,
43398 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43399 unsigned Depth) const {
43400 EVT VT = Op.getValueType();
43401 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43402 unsigned Opc = Op.getOpcode();
43403 switch(Opc) {
43404 case X86ISD::VTRUNC: {
43405 KnownBits KnownOp;
43406 SDValue Src = Op.getOperand(0);
43407 MVT SrcVT = Src.getSimpleValueType();
43408
43409 // Simplify the input, using demanded bit information.
43410 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43411 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43412 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43413 return true;
43414 break;
43415 }
43416 case X86ISD::PMULDQ:
43417 case X86ISD::PMULUDQ: {
43418 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43419 KnownBits KnownLHS, KnownRHS;
43420 SDValue LHS = Op.getOperand(0);
43421 SDValue RHS = Op.getOperand(1);
43422
43423 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43424 // FIXME: Can we bound this better?
43425 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43426 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43427 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43428
43429 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43430 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43431 DemandedMaskLHS = DemandedMask;
43432 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43433 DemandedMaskRHS = DemandedMask;
43434
43435 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43436 KnownLHS, TLO, Depth + 1))
43437 return true;
43438 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43439 KnownRHS, TLO, Depth + 1))
43440 return true;
43441
43442 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43443 KnownRHS = KnownRHS.trunc(32);
43444 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43445 KnownRHS.getConstant().isOne()) {
43446 SDLoc DL(Op);
43447 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43448 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43449 }
43450
43451 // Aggressively peek through ops to get at the demanded low bits.
43452 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43453 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43454 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43455 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43456 if (DemandedLHS || DemandedRHS) {
43457 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43458 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43459 return TLO.CombineTo(
43460 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43461 }
43462 break;
43463 }
43464 case X86ISD::VSHLI: {
43465 SDValue Op0 = Op.getOperand(0);
43466
43467 unsigned ShAmt = Op.getConstantOperandVal(1);
43468 if (ShAmt >= BitWidth)
43469 break;
43470
43471 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43472
43473 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43474 // single shift. We can do this if the bottom bits (which are shifted
43475 // out) are never demanded.
43476 if (Op0.getOpcode() == X86ISD::VSRLI &&
43477 OriginalDemandedBits.countr_zero() >= ShAmt) {
43478 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43479 if (Shift2Amt < BitWidth) {
43480 int Diff = ShAmt - Shift2Amt;
43481 if (Diff == 0)
43482 return TLO.CombineTo(Op, Op0.getOperand(0));
43483
43484 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43485 SDValue NewShift = TLO.DAG.getNode(
43486 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43487 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43488 return TLO.CombineTo(Op, NewShift);
43489 }
43490 }
43491
43492 // If we are only demanding sign bits then we can use the shift source directly.
43493 unsigned NumSignBits =
43494 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43495 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43496 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43497 return TLO.CombineTo(Op, Op0);
43498
43499 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43500 TLO, Depth + 1))
43501 return true;
43502
43503    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43504 Known.Zero <<= ShAmt;
43505 Known.One <<= ShAmt;
43506
43507 // Low bits known zero.
43508 Known.Zero.setLowBits(ShAmt);
43509 return false;
43510 }
43511 case X86ISD::VSRLI: {
43512 unsigned ShAmt = Op.getConstantOperandVal(1);
43513 if (ShAmt >= BitWidth)
43514 break;
43515
43516 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43517
43518 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43519 OriginalDemandedElts, Known, TLO, Depth + 1))
43520 return true;
43521
43522    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43523 Known.Zero.lshrInPlace(ShAmt);
43524 Known.One.lshrInPlace(ShAmt);
43525
43526 // High bits known zero.
43527 Known.Zero.setHighBits(ShAmt);
43528 return false;
43529 }
43530 case X86ISD::VSRAI: {
43531 SDValue Op0 = Op.getOperand(0);
43532 SDValue Op1 = Op.getOperand(1);
43533
43534 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43535 if (ShAmt >= BitWidth)
43536 break;
43537
43538 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43539
43540 // If we just want the sign bit then we don't need to shift it.
43541 if (OriginalDemandedBits.isSignMask())
43542 return TLO.CombineTo(Op, Op0);
43543
43544 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43545 if (Op0.getOpcode() == X86ISD::VSHLI &&
43546 Op.getOperand(1) == Op0.getOperand(1)) {
43547 SDValue Op00 = Op0.getOperand(0);
43548 unsigned NumSignBits =
43549 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43550 if (ShAmt < NumSignBits)
43551 return TLO.CombineTo(Op, Op00);
43552 }
43553
43554 // If any of the demanded bits are produced by the sign extension, we also
43555 // demand the input sign bit.
43556 if (OriginalDemandedBits.countl_zero() < ShAmt)
43557 DemandedMask.setSignBit();
43558
43559 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43560 TLO, Depth + 1))
43561 return true;
43562
43563    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43564 Known.Zero.lshrInPlace(ShAmt);
43565 Known.One.lshrInPlace(ShAmt);
43566
43567 // If the input sign bit is known to be zero, or if none of the top bits
43568 // are demanded, turn this into an unsigned shift right.
43569 if (Known.Zero[BitWidth - ShAmt - 1] ||
43570 OriginalDemandedBits.countl_zero() >= ShAmt)
43571 return TLO.CombineTo(
43572 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43573
43574 // High bits are known one.
43575 if (Known.One[BitWidth - ShAmt - 1])
43576 Known.One.setHighBits(ShAmt);
43577 return false;
43578 }
43579 case X86ISD::BLENDV: {
43580 SDValue Sel = Op.getOperand(0);
43581 SDValue LHS = Op.getOperand(1);
43582 SDValue RHS = Op.getOperand(2);
43583
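         // BLENDV selects between LHS and RHS per element using only the sign
         // bit of the selector, so only that bit is demanded from Sel.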
43584 APInt SignMask = APInt::getSignMask(BitWidth);
43585 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43586 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43587 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43588 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43589 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43590 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43591
43592 if (NewSel || NewLHS || NewRHS) {
43593 NewSel = NewSel ? NewSel : Sel;
43594 NewLHS = NewLHS ? NewLHS : LHS;
43595 NewRHS = NewRHS ? NewRHS : RHS;
43596 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43597 NewSel, NewLHS, NewRHS));
43598 }
43599 break;
43600 }
43601 case X86ISD::PEXTRB:
43602 case X86ISD::PEXTRW: {
43603 SDValue Vec = Op.getOperand(0);
43604 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43605 MVT VecVT = Vec.getSimpleValueType();
43606 unsigned NumVecElts = VecVT.getVectorNumElements();
43607
43608 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43609 unsigned Idx = CIdx->getZExtValue();
43610 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43611
43612 // If we demand no bits from the vector then we must have demanded
43613      // bits from the implicit zext - simplify to zero.
43614 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43615 if (DemandedVecBits == 0)
43616 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43617
43618 APInt KnownUndef, KnownZero;
43619 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43620 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43621 KnownZero, TLO, Depth + 1))
43622 return true;
43623
43624 KnownBits KnownVec;
43625 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43626 KnownVec, TLO, Depth + 1))
43627 return true;
43628
43629 if (SDValue V = SimplifyMultipleUseDemandedBits(
43630 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43631 return TLO.CombineTo(
43632 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43633
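           // The extracted element is implicitly zero-extended to the scalar
           // result type, so the upper bits of the result are known zero.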
43634 Known = KnownVec.zext(BitWidth);
43635 return false;
43636 }
43637 break;
43638 }
43639 case X86ISD::PINSRB:
43640 case X86ISD::PINSRW: {
43641 SDValue Vec = Op.getOperand(0);
43642 SDValue Scl = Op.getOperand(1);
43643 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43644 MVT VecVT = Vec.getSimpleValueType();
43645
43646 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43647 unsigned Idx = CIdx->getZExtValue();
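           // If we don't demand the inserted element, the insert is a no-op
           // and we can use the base vector directly.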
43648 if (!OriginalDemandedElts[Idx])
43649 return TLO.CombineTo(Op, Vec);
43650
43651 KnownBits KnownVec;
43652 APInt DemandedVecElts(OriginalDemandedElts);
43653 DemandedVecElts.clearBit(Idx);
43654 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43655 KnownVec, TLO, Depth + 1))
43656 return true;
43657
43658 KnownBits KnownScl;
43659 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43660 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43661 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43662 return true;
43663
43664 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43665 Known = KnownBits::commonBits(KnownVec, KnownScl);
43666 return false;
43667 }
43668 break;
43669 }
43670 case X86ISD::PACKSS:
43671 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43672    // sign bit then we can just ask for the source operands' sign bit.
43673 // TODO - add known bits handling.
43674 if (OriginalDemandedBits.isSignMask()) {
43675 APInt DemandedLHS, DemandedRHS;
43676 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43677
43678 KnownBits KnownLHS, KnownRHS;
43679 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43680 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43681 KnownLHS, TLO, Depth + 1))
43682 return true;
43683 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43684 KnownRHS, TLO, Depth + 1))
43685 return true;
43686
43687 // Attempt to avoid multi-use ops if we don't need anything from them.
43688 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43689 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43690 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43691 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43692 if (DemandedOp0 || DemandedOp1) {
43693 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43694 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43695 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43696 }
43697 }
43698 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43699 break;
43700 case X86ISD::VBROADCAST: {
43701 SDValue Src = Op.getOperand(0);
43702 MVT SrcVT = Src.getSimpleValueType();
43703 APInt DemandedElts = APInt::getOneBitSet(
43704 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43705 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43706 TLO, Depth + 1))
43707 return true;
43708 // If we don't need the upper bits, attempt to narrow the broadcast source.
43709 // Don't attempt this on AVX512 as it might affect broadcast folding.
43710 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43711 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43712 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43713 Src->hasOneUse()) {
43714 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43715 SDValue NewSrc =
43716 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43717 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43718 SDValue NewBcst =
43719 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43720 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43721 }
43722 break;
43723 }
43724 case X86ISD::PCMPGT:
43725 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43726 // iff we only need the sign bit then we can use R directly.
43727 if (OriginalDemandedBits.isSignMask() &&
43728 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43729 return TLO.CombineTo(Op, Op.getOperand(1));
43730 break;
43731 case X86ISD::MOVMSK: {
43732 SDValue Src = Op.getOperand(0);
43733 MVT SrcVT = Src.getSimpleValueType();
43734 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43735 unsigned NumElts = SrcVT.getVectorNumElements();
43736
43737 // If we don't need the sign bits at all just return zero.
43738 if (OriginalDemandedBits.countr_zero() >= NumElts)
43739 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43740
43741 // See if we only demand bits from the lower 128-bit vector.
43742 if (SrcVT.is256BitVector() &&
43743 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43744 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43745 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43746 }
43747
43748 // Only demand the vector elements of the sign bits we need.
43749 APInt KnownUndef, KnownZero;
43750 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43751 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43752 TLO, Depth + 1))
43753 return true;
43754
43755 Known.Zero = KnownZero.zext(BitWidth);
43756 Known.Zero.setHighBits(BitWidth - NumElts);
43757
43758 // MOVMSK only uses the MSB from each vector element.
43759 KnownBits KnownSrc;
43760 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43761 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43762 Depth + 1))
43763 return true;
43764
43765 if (KnownSrc.One[SrcBits - 1])
43766 Known.One.setLowBits(NumElts);
43767 else if (KnownSrc.Zero[SrcBits - 1])
43768 Known.Zero.setLowBits(NumElts);
43769
43770    // Attempt to avoid multi-use ops if we don't need anything from it.
43771 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43772 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43773 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43774 return false;
43775 }
43776 case X86ISD::TESTP: {
43777 SDValue Op0 = Op.getOperand(0);
43778 SDValue Op1 = Op.getOperand(1);
43779 MVT OpVT = Op0.getSimpleValueType();
43780    assert((OpVT.getVectorElementType() == MVT::f32 ||
43781            OpVT.getVectorElementType() == MVT::f64) &&
43782           "Illegal vector type for X86ISD::TESTP");
43783
43784 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
43785 KnownBits KnownSrc;
43786 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
43787 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
43788 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
43789 AssumeSingleUse) ||
43790 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
43791 AssumeSingleUse);
43792 }
43793 case X86ISD::BEXTR:
43794 case X86ISD::BEXTRI: {
43795 SDValue Op0 = Op.getOperand(0);
43796 SDValue Op1 = Op.getOperand(1);
43797
43798 // Only bottom 16-bits of the control bits are required.
43799 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43800 // NOTE: SimplifyDemandedBits won't do this for constants.
43801 uint64_t Val1 = Cst1->getZExtValue();
43802 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43803 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43804 SDLoc DL(Op);
43805 return TLO.CombineTo(
43806 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43807 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43808 }
43809
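           // BEXTR control encoding: bits [7:0] hold the starting bit index
           // and bits [15:8] hold the length of the extracted bit field.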
43810 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43811 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43812
43813 // If the length is 0, the result is 0.
43814 if (Length == 0) {
43815 Known.setAllZero();
43816 return false;
43817 }
43818
43819 if ((Shift + Length) <= BitWidth) {
43820 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43821 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43822 return true;
43823
43824 Known = Known.extractBits(Length, Shift);
43825 Known = Known.zextOrTrunc(BitWidth);
43826 return false;
43827 }
43828 } else {
43829      assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43830 KnownBits Known1;
43831 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43832 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43833 return true;
43834
43835 // If the length is 0, replace with 0.
43836 KnownBits LengthBits = Known1.extractBits(8, 8);
43837 if (LengthBits.isZero())
43838 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43839 }
43840
43841 break;
43842 }
43843 case X86ISD::PDEP: {
43844 SDValue Op0 = Op.getOperand(0);
43845 SDValue Op1 = Op.getOperand(1);
43846
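         // PDEP deposits the low bits of Op0 at the bit positions set in the
         // mask (Op1); all other result bits are zero.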
43847 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43848 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43849
43850 // If the demanded bits has leading zeroes, we don't demand those from the
43851 // mask.
43852 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43853 return true;
43854
43855 // The number of possible 1s in the mask determines the number of LSBs of
43856 // operand 0 used. Undemanded bits from the mask don't matter so filter
43857 // them before counting.
43858 KnownBits Known2;
43859 uint64_t Count = (~Known.Zero & LoMask).popcount();
43860 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43861 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43862 return true;
43863
43864 // Zeroes are retained from the mask, but not ones.
43865 Known.One.clearAllBits();
43866 // The result will have at least as many trailing zeros as the non-mask
43867 // operand since bits can only map to the same or higher bit position.
43868 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43869 return false;
43870 }
43871 }
43872
43873 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43874 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43875}
43876
43877SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43878 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43879 SelectionDAG &DAG, unsigned Depth) const {
43880 int NumElts = DemandedElts.getBitWidth();
43881 unsigned Opc = Op.getOpcode();
43882 EVT VT = Op.getValueType();
43883
43884 switch (Opc) {
43885 case X86ISD::PINSRB:
43886 case X86ISD::PINSRW: {
43887 // If we don't demand the inserted element, return the base vector.
43888 SDValue Vec = Op.getOperand(0);
43889 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43890 MVT VecVT = Vec.getSimpleValueType();
43891 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43892 !DemandedElts[CIdx->getZExtValue()])
43893 return Vec;
43894 break;
43895 }
43896 case X86ISD::VSHLI: {
43897 // If we are only demanding sign bits then we can use the shift source
43898 // directly.
43899 SDValue Op0 = Op.getOperand(0);
43900 unsigned ShAmt = Op.getConstantOperandVal(1);
43901 unsigned BitWidth = DemandedBits.getBitWidth();
43902 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43903 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43904 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43905 return Op0;
43906 break;
43907 }
43908 case X86ISD::VSRAI:
43909 // iff we only need the sign bit then we can use the source directly.
43910 // TODO: generalize where we only demand extended signbits.
43911 if (DemandedBits.isSignMask())
43912 return Op.getOperand(0);
43913 break;
43914 case X86ISD::PCMPGT:
43915 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43916 // iff we only need the sign bit then we can use R directly.
43917 if (DemandedBits.isSignMask() &&
43918 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43919 return Op.getOperand(1);
43920 break;
43921 case X86ISD::ANDNP: {
43922 // ANDNP = (~LHS & RHS);
43923 SDValue LHS = Op.getOperand(0);
43924 SDValue RHS = Op.getOperand(1);
43925
43926 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43927 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43928
43929 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43930 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43931 // this context, so return RHS.
43932 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43933 return RHS;
43934 break;
43935 }
43936 }
43937
43938 APInt ShuffleUndef, ShuffleZero;
43939 SmallVector<int, 16> ShuffleMask;
43940 SmallVector<SDValue, 2> ShuffleOps;
43941 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43942 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43943 // If all the demanded elts are from one operand and are inline,
43944 // then we can use the operand directly.
43945 int NumOps = ShuffleOps.size();
43946 if (ShuffleMask.size() == (unsigned)NumElts &&
43947 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43948 return VT.getSizeInBits() == V.getValueSizeInBits();
43949 })) {
43950
43951 if (DemandedElts.isSubsetOf(ShuffleUndef))
43952 return DAG.getUNDEF(VT);
43953 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43954 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43955
43956 // Bitmask that indicates which ops have only been accessed 'inline'.
43957 APInt IdentityOp = APInt::getAllOnes(NumOps);
43958 for (int i = 0; i != NumElts; ++i) {
43959 int M = ShuffleMask[i];
43960 if (!DemandedElts[i] || ShuffleUndef[i])
43961 continue;
43962 int OpIdx = M / NumElts;
43963 int EltIdx = M % NumElts;
43964 if (M < 0 || EltIdx != i) {
43965 IdentityOp.clearAllBits();
43966 break;
43967 }
43968 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43969 if (IdentityOp == 0)
43970 break;
43971 }
43972      assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43973             "Multiple identity shuffles detected");
43974
43975 if (IdentityOp != 0)
43976 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43977 }
43978 }
43979
43980 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43981 Op, DemandedBits, DemandedElts, DAG, Depth);
43982}
43983
43984bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43985 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43986 bool PoisonOnly, unsigned Depth) const {
43987 unsigned EltsBits = Op.getScalarValueSizeInBits();
43988 unsigned NumElts = DemandedElts.getBitWidth();
43989
43990 // TODO: Add more target shuffles.
43991 switch (Op.getOpcode()) {
43992 case X86ISD::PSHUFD:
43993 case X86ISD::VPERMILPI: {
43994 SmallVector<int, 8> Mask;
43995 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43996
43997 APInt DemandedSrcElts = APInt::getZero(NumElts);
43998 for (unsigned I = 0; I != NumElts; ++I)
43999 if (DemandedElts[I])
44000 DemandedSrcElts.setBit(Mask[I]);
44001
44002 return DAG.isGuaranteedNotToBeUndefOrPoison(
44003 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
44004 }
44005 }
44006 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44007 Op, DemandedElts, DAG, PoisonOnly, Depth);
44008}
44009
44010bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44011 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44012 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44013
44014 // TODO: Add more target shuffles.
44015 switch (Op.getOpcode()) {
44016 case X86ISD::PSHUFD:
44017 case X86ISD::VPERMILPI:
44018 return false;
44019 }
44020 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44021 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44022}
44023
44024bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44025 const APInt &DemandedElts,
44026 APInt &UndefElts,
44027 const SelectionDAG &DAG,
44028 unsigned Depth) const {
44029 unsigned NumElts = DemandedElts.getBitWidth();
44030 unsigned Opc = Op.getOpcode();
44031
44032 switch (Opc) {
44033 case X86ISD::VBROADCAST:
44034 case X86ISD::VBROADCAST_LOAD:
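         // Broadcasts replicate a single element to every lane, so the result
         // is always a splat with no undef elements.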
44035 UndefElts = APInt::getZero(NumElts);
44036 return true;
44037 }
44038
44039 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44040 DAG, Depth);
44041}
44042
44043// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44044// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44045static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44046 bool AllowTruncate) {
44047 switch (Src.getOpcode()) {
44048 case ISD::TRUNCATE:
44049 if (!AllowTruncate)
44050 return false;
44051 [[fallthrough]];
44052 case ISD::SETCC:
44053 return Src.getOperand(0).getValueSizeInBits() == Size;
44054 case ISD::AND:
44055 case ISD::XOR:
44056 case ISD::OR:
44057 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44058 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44059 case ISD::SELECT:
44060 case ISD::VSELECT:
44061 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44062 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44063 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44064 case ISD::BUILD_VECTOR:
44065 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44066 ISD::isBuildVectorAllOnes(Src.getNode());
44067 }
44068 return false;
44069}
44070
44071// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44072static unsigned getAltBitOpcode(unsigned Opcode) {
44073 switch(Opcode) {
44074 case ISD::AND: return X86ISD::FAND;
44075 case ISD::OR: return X86ISD::FOR;
44076 case ISD::XOR: return X86ISD::FXOR;
44077 case X86ISD::ANDNP: return X86ISD::FANDN;
44078 }
44079  llvm_unreachable("Unknown bitwise opcode");
44080}
44081
44082// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44083static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44084 const SDLoc &DL) {
44085 EVT SrcVT = Src.getValueType();
44086 if (SrcVT != MVT::v4i1)
44087 return SDValue();
44088
44089 switch (Src.getOpcode()) {
44090 case ISD::SETCC:
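         // (setcc X, 0, setlt) tests only the sign bit of each i32 lane, and
         // the f32 view keeps those bits in place, so MOVMSKPS can read them
         // directly without a real conversion.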
44091 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44092 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44093 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44094 SDValue Op0 = Src.getOperand(0);
44095 if (ISD::isNormalLoad(Op0.getNode()))
44096 return DAG.getBitcast(MVT::v4f32, Op0);
44097 if (Op0.getOpcode() == ISD::BITCAST &&
44098 Op0.getOperand(0).getValueType() == MVT::v4f32)
44099 return Op0.getOperand(0);
44100 }
44101 break;
44102 case ISD::AND:
44103 case ISD::XOR:
44104 case ISD::OR: {
44105 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44106 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44107 if (Op0 && Op1)
44108 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44109 Op1);
44110 break;
44111 }
44112 }
44113 return SDValue();
44114}
44115
44116// Helper to push sign extension of vXi1 SETCC result through bitops.
44117static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44118 SDValue Src, const SDLoc &DL) {
44119 switch (Src.getOpcode()) {
44120 case ISD::SETCC:
44121 case ISD::TRUNCATE:
44122 case ISD::BUILD_VECTOR:
44123 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44124 case ISD::AND:
44125 case ISD::XOR:
44126 case ISD::OR:
44127 return DAG.getNode(
44128 Src.getOpcode(), DL, SExtVT,
44129 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44130 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44131 case ISD::SELECT:
44132 case ISD::VSELECT:
44133 return DAG.getSelect(
44134 DL, SExtVT, Src.getOperand(0),
44135 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44136 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44137 }
44138  llvm_unreachable("Unexpected node type for vXi1 sign extension");
44139}
44140
44141// Try to match patterns such as
44142// (i16 bitcast (v16i1 x))
44143// ->
44144// (i16 movmsk (16i8 sext (v16i1 x)))
44145// before the illegal vector is scalarized on subtargets that don't have legal
44146// vxi1 types.
44147static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44148 const SDLoc &DL,
44149 const X86Subtarget &Subtarget) {
44150 EVT SrcVT = Src.getValueType();
44151 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44152 return SDValue();
44153
44154 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44155 // legalization destroys the v4i32 type.
44156 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44157 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44158 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44159 DAG.getBitcast(MVT::v4f32, V));
44160 return DAG.getZExtOrTrunc(V, DL, VT);
44161 }
44162 }
44163
44164 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44165 // movmskb even with avx512. This will be better than truncating to vXi1 and
44166 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44167 // vpcmpeqb/vpcmpgtb.
44168 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44169 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44170 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44171 Src.getOperand(0).getValueType() == MVT::v64i8);
44172
44173 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44174 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44175 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44176 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44177 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44178 EVT CmpVT = Src.getOperand(0).getValueType();
44179 EVT EltVT = CmpVT.getVectorElementType();
44180 if (CmpVT.getSizeInBits() <= 256 &&
44181 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44182 PreferMovMsk = true;
44183 }
44184
44185 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44186 // MOVMSK is supported in SSE2 or later.
44187 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44188 return SDValue();
44189
44190 // If the upper ops of a concatenation are undef, then try to bitcast the
44191 // lower op and extend.
44192 SmallVector<SDValue, 4> SubSrcOps;
44193 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44194 SubSrcOps.size() >= 2) {
44195 SDValue LowerOp = SubSrcOps[0];
44196 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44197 if (LowerOp.getOpcode() == ISD::SETCC &&
44198 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44199 EVT SubVT = VT.getIntegerVT(
44200 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44201 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44202 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44203 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44204 }
44205 }
44206 }
44207
44208 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44209 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44210 // v8i16 and v16i16.
44211 // For these two cases, we can shuffle the upper element bytes to a
44212 // consecutive sequence at the start of the vector and treat the results as
44213 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44214 // for v16i16 this is not the case, because the shuffle is expensive, so we
44215 // avoid sign-extending to this type entirely.
44216 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44217 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44218 MVT SExtVT;
44219 bool PropagateSExt = false;
44220 switch (SrcVT.getSimpleVT().SimpleTy) {
44221 default:
44222 return SDValue();
44223 case MVT::v2i1:
44224 SExtVT = MVT::v2i64;
44225 break;
44226 case MVT::v4i1:
44227 SExtVT = MVT::v4i32;
44228 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44229 // sign-extend to a 256-bit operation to avoid truncation.
44230 if (Subtarget.hasAVX() &&
44231 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44232 SExtVT = MVT::v4i64;
44233 PropagateSExt = true;
44234 }
44235 break;
44236 case MVT::v8i1:
44237 SExtVT = MVT::v8i16;
44238 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44239 // sign-extend to a 256-bit operation to match the compare.
44240 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44241 // 256-bit because the shuffle is cheaper than sign extending the result of
44242 // the compare.
44243 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44244 checkBitcastSrcVectorSize(Src, 512, true))) {
44245 SExtVT = MVT::v8i32;
44246 PropagateSExt = true;
44247 }
44248 break;
44249 case MVT::v16i1:
44250 SExtVT = MVT::v16i8;
44251 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44252 // it is not profitable to sign-extend to 256-bit because this will
44253 // require an extra cross-lane shuffle which is more expensive than
44254 // truncating the result of the compare to 128-bits.
44255 break;
44256 case MVT::v32i1:
44257 SExtVT = MVT::v32i8;
44258 break;
44259 case MVT::v64i1:
44260    // If we have AVX512F but not AVX512BW, and the input is a truncate from
44261    // v64i8 (checked earlier), then split the input and make two pmovmskbs.
44262 if (Subtarget.hasAVX512()) {
44263 if (Subtarget.hasBWI())
44264 return SDValue();
44265 SExtVT = MVT::v64i8;
44266 break;
44267 }
44268 // Split if this is a <64 x i8> comparison result.
44269 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44270 SExtVT = MVT::v64i8;
44271 break;
44272 }
44273 return SDValue();
44274 };
44275
44276 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44277 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44278
44279 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44280 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44281 } else {
44282 if (SExtVT == MVT::v8i16)
44283 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44284 DAG.getUNDEF(MVT::v8i16));
44285 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44286 }
44287
44288 EVT IntVT =
44289 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44290 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44291 return DAG.getBitcast(VT, V);
44292}
44293
44294// Convert a vXi1 constant build vector to the same width scalar integer.
44295static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44296 EVT SrcVT = Op.getValueType();
44297  assert(SrcVT.getVectorElementType() == MVT::i1 &&
44298         "Expected a vXi1 vector");
44299  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44300         "Expected a constant build vector");
44301
44302 APInt Imm(SrcVT.getVectorNumElements(), 0);
44303 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44304 SDValue In = Op.getOperand(Idx);
44305 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44306 Imm.setBit(Idx);
44307 }
44308 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44309 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44310}
44311
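     // Try to move a bitcast between a vXi1 mask and a scalar integer through
     // a logic op so the logic stays in one domain (k-registers vs. GPRs).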
44312static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44313 TargetLowering::DAGCombinerInfo &DCI,
44314 const X86Subtarget &Subtarget) {
44315  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44316
44317 if (!DCI.isBeforeLegalizeOps())
44318 return SDValue();
44319
44320 // Only do this if we have k-registers.
44321 if (!Subtarget.hasAVX512())
44322 return SDValue();
44323
44324 EVT DstVT = N->getValueType(0);
44325 SDValue Op = N->getOperand(0);
44326 EVT SrcVT = Op.getValueType();
44327
44328 if (!Op.hasOneUse())
44329 return SDValue();
44330
44331 // Look for logic ops.
44332 if (Op.getOpcode() != ISD::AND &&
44333 Op.getOpcode() != ISD::OR &&
44334 Op.getOpcode() != ISD::XOR)
44335 return SDValue();
44336
44337 // Make sure we have a bitcast between mask registers and a scalar type.
44338 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44339 DstVT.isScalarInteger()) &&
44340 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44341 SrcVT.isScalarInteger()))
44342 return SDValue();
44343
44344 SDValue LHS = Op.getOperand(0);
44345 SDValue RHS = Op.getOperand(1);
44346
44347 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44348 LHS.getOperand(0).getValueType() == DstVT)
44349 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44350 DAG.getBitcast(DstVT, RHS));
44351
44352 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44353 RHS.getOperand(0).getValueType() == DstVT)
44354 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44355 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44356
44357 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44358 // Most of these have to move a constant from the scalar domain anyway.
44359 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44360 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44361 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44362 DAG.getBitcast(DstVT, LHS), RHS);
44363 }
44364
44365 return SDValue();
44366}
44367
44368static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44369 const X86Subtarget &Subtarget) {
44370 SDLoc DL(BV);
44371 unsigned NumElts = BV->getNumOperands();
44372 SDValue Splat = BV->getSplatValue();
44373
44374 // Build MMX element from integer GPR or SSE float values.
44375 auto CreateMMXElement = [&](SDValue V) {
44376 if (V.isUndef())
44377 return DAG.getUNDEF(MVT::x86mmx);
44378 if (V.getValueType().isFloatingPoint()) {
44379 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44380 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44381 V = DAG.getBitcast(MVT::v2i64, V);
44382 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44383 }
44384 V = DAG.getBitcast(MVT::i32, V);
44385 } else {
44386 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44387 }
44388 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44389 };
44390
44391 // Convert build vector ops to MMX data in the bottom elements.
44392 SmallVector<SDValue, 8> Ops;
44393
44394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44395
44396 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44397 if (Splat) {
44398 if (Splat.isUndef())
44399 return DAG.getUNDEF(MVT::x86mmx);
44400
44401 Splat = CreateMMXElement(Splat);
44402
44403 if (Subtarget.hasSSE1()) {
44404 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44405 if (NumElts == 8)
44406 Splat = DAG.getNode(
44407 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44408 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44409 TLI.getPointerTy(DAG.getDataLayout())),
44410 Splat, Splat);
44411
44412 // Use PSHUFW to repeat 16-bit elements.
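           // Shuffle immediate 0x00 splats lane 0 to all four 16-bit lanes;
           // 0x44 selects lanes <0,1,0,1> to repeat a 32-bit element.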
44413 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44414 return DAG.getNode(
44415 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44416 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44417 TLI.getPointerTy(DAG.getDataLayout())),
44418 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44419 }
44420 Ops.append(NumElts, Splat);
44421 } else {
44422 for (unsigned i = 0; i != NumElts; ++i)
44423 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44424 }
44425
44426 // Use tree of PUNPCKLs to build up general MMX vector.
44427 while (Ops.size() > 1) {
44428 unsigned NumOps = Ops.size();
44429 unsigned IntrinOp =
44430 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44431 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44432 : Intrinsic::x86_mmx_punpcklbw));
44433 SDValue Intrin = DAG.getTargetConstant(
44434 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44435 for (unsigned i = 0; i != NumOps; i += 2)
44436 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44437 Ops[i], Ops[i + 1]);
44438 Ops.resize(NumOps / 2);
44439 }
44440
44441 return Ops[0];
44442}
44443
44444// Recursive function that attempts to find if a bool vector node was originally
44445// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44446// integer. If so, replace the scalar ops with bool vector equivalents back down
44447// the chain.
44448static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44449 SelectionDAG &DAG,
44450 const X86Subtarget &Subtarget) {
44451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44452 unsigned Opc = V.getOpcode();
44453 switch (Opc) {
44454 case ISD::BITCAST: {
44455 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44456 SDValue Src = V.getOperand(0);
44457 EVT SrcVT = Src.getValueType();
44458 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44459 return DAG.getBitcast(VT, Src);
44460 break;
44461 }
44462 case ISD::TRUNCATE: {
44463 // If we find a suitable source, a truncated scalar becomes a subvector.
44464 SDValue Src = V.getOperand(0);
44465 EVT NewSrcVT =
44466 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44467 if (TLI.isTypeLegal(NewSrcVT))
44468 if (SDValue N0 =
44469 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44470 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44471 DAG.getIntPtrConstant(0, DL));
44472 break;
44473 }
44474 case ISD::ANY_EXTEND:
44475 case ISD::ZERO_EXTEND: {
44476 // If we find a suitable source, an extended scalar becomes a subvector.
44477 SDValue Src = V.getOperand(0);
44478 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44479 Src.getScalarValueSizeInBits());
44480 if (TLI.isTypeLegal(NewSrcVT))
44481 if (SDValue N0 =
44482 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44483 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44484 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44485 : DAG.getConstant(0, DL, VT),
44486 N0, DAG.getIntPtrConstant(0, DL));
44487 break;
44488 }
44489 case ISD::OR: {
44490 // If we find suitable sources, we can just move an OR to the vector domain.
44491 SDValue Src0 = V.getOperand(0);
44492 SDValue Src1 = V.getOperand(1);
44493 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44494 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44495 return DAG.getNode(Opc, DL, VT, N0, N1);
44496 break;
44497 }
44498 case ISD::SHL: {
44499 // If we find a suitable source, a SHL becomes a KSHIFTL.
44500 SDValue Src0 = V.getOperand(0);
44501 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44502 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44503 break;
44504
44505 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44506 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44507 return DAG.getNode(
44508 X86ISD::KSHIFTL, DL, VT, N0,
44509 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44510 break;
44511 }
44512 }
44513 return SDValue();
44514}
44515
44516static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44517 TargetLowering::DAGCombinerInfo &DCI,
44518 const X86Subtarget &Subtarget) {
44519 SDValue N0 = N->getOperand(0);
44520 EVT VT = N->getValueType(0);
44521 EVT SrcVT = N0.getValueType();
44522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44523
44524 // Try to match patterns such as
44525 // (i16 bitcast (v16i1 x))
44526 // ->
44527 // (i16 movmsk (16i8 sext (v16i1 x)))
44528 // before the setcc result is scalarized on subtargets that don't have legal
44529 // vxi1 types.
44530 if (DCI.isBeforeLegalize()) {
44531 SDLoc dl(N);
44532 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44533 return V;
44534
44535 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44536 // type, widen both sides to avoid a trip through memory.
44537 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44538 Subtarget.hasAVX512()) {
44539 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44540 N0 = DAG.getBitcast(MVT::v8i1, N0);
44541 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44542 DAG.getIntPtrConstant(0, dl));
44543 }
44544
44545 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44546 // type, widen both sides to avoid a trip through memory.
44547 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44548 Subtarget.hasAVX512()) {
44549 // Use zeros for the widening if we already have some zeroes. This can
44550 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
44551 // stream of this.
44552 // FIXME: It might make sense to detect a concat_vectors with a mix of
44553 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44554 // a separate combine. What we can't do is canonicalize the operands of
44555 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44556 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44557 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44558 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44559 SrcVT = LastOp.getValueType();
44560 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44561 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44562 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44563 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44564 N0 = DAG.getBitcast(MVT::i8, N0);
44565 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44566 }
44567 }
44568
44569 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44570 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44571 Ops[0] = N0;
44572 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44573 N0 = DAG.getBitcast(MVT::i8, N0);
44574 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44575 }
44576 } else {
44577 // If we're bitcasting from iX to vXi1, see if the integer originally
44578 // began as a vXi1 and whether we can remove the bitcast entirely.
44579 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44580 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44581 if (SDValue V =
44582 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44583 return V;
44584 }
44585 }
44586
44587 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44588 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44589 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44590 // we can help with known bits propagation from the vXi1 domain to the
44591 // scalar domain.
44592 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44593 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44594 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44595 isNullConstant(N0.getOperand(1)))
44596 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44597 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44598
44599 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44600 // and the vbroadcast_load are both integer or both fp. In some cases this
44601 // will remove the bitcast entirely.
44602 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44603 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44604 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44605 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44606 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44607    // Don't swap i8/i16 since we don't have fp types of that size.
44608 if (MemSize >= 32) {
44609 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44610 : MVT::getIntegerVT(MemSize);
44611 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44612 : MVT::getIntegerVT(SrcVTSize);
44613 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44614
44615 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44616 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44617 SDValue ResNode =
44618 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44619 MemVT, BCast->getMemOperand());
44620 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44621 return DAG.getBitcast(VT, ResNode);
44622 }
44623 }
44624
44625 // Since MMX types are special and don't usually play with other vector types,
44626 // it's better to handle them early to be sure we emit efficient code by
44627 // avoiding store-load conversions.
44628 if (VT == MVT::x86mmx) {
44629 // Detect MMX constant vectors.
44630 APInt UndefElts;
44631 SmallVector<APInt, 1> EltBits;
44632 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44633 SDLoc DL(N0);
44634 // Handle zero-extension of i32 with MOVD.
44635 if (EltBits[0].countl_zero() >= 32)
44636 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44637 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44638 // Else, bitcast to a double.
44639 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44640 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44641 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44642 }
44643
44644 // Detect bitcasts to x86mmx low word.
44645 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44646 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44647 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44648 bool LowUndef = true, AllUndefOrZero = true;
44649 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44650 SDValue Op = N0.getOperand(i);
44651 LowUndef &= Op.isUndef() || (i >= e/2);
44652 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44653 }
44654 if (AllUndefOrZero) {
44655 SDValue N00 = N0.getOperand(0);
44656 SDLoc dl(N00);
44657 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44658 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44659 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44660 }
44661 }
44662
44663 // Detect bitcasts of 64-bit build vectors and convert to a
44664 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44665 // lowest element.
44666 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44667 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44668 SrcVT == MVT::v8i8))
44669 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44670
44671 // Detect bitcasts between element or subvector extraction to x86mmx.
44672 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44673 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44674 isNullConstant(N0.getOperand(1))) {
44675 SDValue N00 = N0.getOperand(0);
44676 if (N00.getValueType().is128BitVector())
44677 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44678 DAG.getBitcast(MVT::v2i64, N00));
44679 }
44680
44681 // Detect bitcasts from FP_TO_SINT to x86mmx.
44682 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44683 SDLoc DL(N0);
44684 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44685 DAG.getUNDEF(MVT::v2i32));
44686 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44687 DAG.getBitcast(MVT::v2i64, Res));
44688 }
44689 }
44690
44691 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44692 // most of these to scalar anyway.
44693 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44694 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44695 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44696 return combinevXi1ConstantToInteger(N0, DAG);
44697 }
44698
44699 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44700 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44701 isa<ConstantSDNode>(N0)) {
44702 auto *C = cast<ConstantSDNode>(N0);
44703 if (C->isAllOnes())
44704 return DAG.getConstant(1, SDLoc(N0), VT);
44705 if (C->isZero())
44706 return DAG.getConstant(0, SDLoc(N0), VT);
44707 }
44708
44709 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44710 // Turn it into a sign bit compare that produces a k-register. This avoids
44711 // a trip through a GPR.
44712 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44713 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44714 isPowerOf2_32(VT.getVectorNumElements())) {
44715 unsigned NumElts = VT.getVectorNumElements();
44716 SDValue Src = N0;
44717
44718 // Peek through truncate.
44719 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44720 Src = N0.getOperand(0);
44721
44722 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44723 SDValue MovmskIn = Src.getOperand(0);
44724 MVT MovmskVT = MovmskIn.getSimpleValueType();
44725 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44726
44727 // We allow extra bits of the movmsk to be used since they are known zero.
44728 // We can't convert a VPMOVMSKB without avx512bw.
44729 if (MovMskElts <= NumElts &&
44730 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44731 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44732 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44733 SDLoc dl(N);
44734 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44735 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44736 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44737 if (EVT(CmpVT) == VT)
44738 return Cmp;
44739
44740 // Pad with zeroes up to original VT to replace the zeroes that were
44741 // being used from the MOVMSK.
44742 unsigned NumConcats = NumElts / MovMskElts;
44743 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44744 Ops[0] = Cmp;
44745 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44746 }
44747 }
44748 }
44749
44750 // Try to remove bitcasts from input and output of mask arithmetic to
44751 // remove GPR<->K-register crossings.
44752 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44753 return V;
44754
44755 // Convert a bitcasted integer logic operation that has one bitcasted
44756 // floating-point operand into a floating-point logic operation. This may
44757 // create a load of a constant, but that is cheaper than materializing the
44758 // constant in an integer register and transferring it to an SSE register or
44759 // transferring the SSE operand to integer register and back.
44760 unsigned FPOpcode;
44761 switch (N0.getOpcode()) {
44762 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44763 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44764 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44765 default: return SDValue();
44766 }
44767
44768 // Check if we have a bitcast from another integer type as well.
44769 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44770 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44771 (Subtarget.hasFP16() && VT == MVT::f16) ||
44772 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44773 TLI.isTypeLegal(VT))))
44774 return SDValue();
44775
44776 SDValue LogicOp0 = N0.getOperand(0);
44777 SDValue LogicOp1 = N0.getOperand(1);
44778 SDLoc DL0(N0);
44779
44780 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44781 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44782 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44783 LogicOp0.getOperand(0).getValueType() == VT &&
44784 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44785 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44786 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44787 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44788 }
44789 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44790 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44791 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44792 LogicOp1.getOperand(0).getValueType() == VT &&
44793 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44794 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44795 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44796 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44797 }
44798
44799 return SDValue();
44800}
44801
44802// (mul (zext a), (sext b))
44803static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44804 SDValue &Op1) {
44805 Op0 = Mul.getOperand(0);
44806 Op1 = Mul.getOperand(1);
44807
44808 // Operand 1 should be the sign-extended operand.
44809 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44810 std::swap(Op0, Op1);
44811
44812 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44813 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44814 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44815 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44816 return true;
44817
44818 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44819 return (BV && BV->isConstant());
44820 };
44821
44822 // (dpbusd (zext a), (sext b)). Since the first operand must be an unsigned
44823 // value, we check that Op0 is a zero-extended value. Op1 must be a signed
44824 // value, so we just check its significant bits.
44825 if ((IsFreeTruncation(Op0) &&
44826 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44827 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44828 return true;
44829
44830 return false;
44831}
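
// Illustrative sketch (hypothetical helper, not part of this file): the two
// checks above amount to requiring that one multiply operand fits in an
// unsigned 8-bit range and the other in a signed 8-bit range, mirroring
// VPDPBUSD's unsigned-byte-times-signed-byte multiply.
static bool sketchFitsVPDPBUSDOperands(unsigned UnsignedVal, int SignedVal) {
  bool FitsU8 = UnsignedVal <= 255;                    // countMaxActiveBits() <= 8
  bool FitsS8 = SignedVal >= -128 && SignedVal <= 127; // ComputeMaxSignificantBits() <= 8
  return FitsU8 && FitsS8;
}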
44832
44833// Given an ABS node, detect the following pattern:
44834// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44835// This is useful as it is the input into a SAD pattern.
44836static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44837 SDValue AbsOp1 = Abs->getOperand(0);
44838 if (AbsOp1.getOpcode() != ISD::SUB)
44839 return false;
44840
44841 Op0 = AbsOp1.getOperand(0);
44842 Op1 = AbsOp1.getOperand(1);
44843
44844 // Check if the operands of the sub are zero-extended from vectors of i8.
44845 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44846 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44847 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44848 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44849 return false;
44850
44851 return true;
44852}
44853
44854static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44855 unsigned &LogBias, const SDLoc &DL,
44856 const X86Subtarget &Subtarget) {
44857 // Extend or truncate to MVT::i8 first.
44858 MVT Vi8VT =
44859 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44860 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44861 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44862
44863 // VPDPBUSD(<16 x i32> C, <16 x i8> A, <16 x i8> B). For each dst element:
44864 // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
44865 // The src A, B element type is i8, but the dst C element type is i32.
44866 // When we count the reduction stages we use the src vector type vXi8,
44867 // so we need a log-bias of 2 to avoid counting 2 extra stages.
44868 LogBias = 2;
44869
44870 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44871 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44872 RegSize = std::max(512u, RegSize);
44873
44874 // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
44875 // fill in the missing vector elements with 0.
44876 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44877 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44878 Ops[0] = LHS;
44879 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44880 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44881 Ops[0] = RHS;
44882 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44883
44884 // Actually build the DotProduct, split as 256/512 bits for
44885 // AVXVNNI/AVX512VNNI.
44886 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44887 ArrayRef<SDValue> Ops) {
44888 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44889 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44890 };
44891 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44892 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44893
44894 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44895 DpBuilder, false);
44896}
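
// Reference model (illustration only, ignoring the saturating VPDPBUSDS
// variant; the helper name is hypothetical): each i32 lane of VPDPBUSD
// accumulates four u8 x s8 products, so a byte dot product already has
// log2(4) = 2 reduction stages folded into the instruction, which is what
// LogBias = 2 accounts for above.
static int sketchVPDPBUSDLane(int Acc, const unsigned char A[4],
                              const signed char B[4]) {
  for (int i = 0; i != 4; ++i)
    Acc += (int)A[i] * (int)B[i]; // u8 * s8 products summed into the i32 lane.
  return Acc;
}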
44897
44898// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44899// to these zexts.
44900static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44901 const SDValue &Zext1, const SDLoc &DL,
44902 const X86Subtarget &Subtarget) {
44903 // Find the appropriate width for the PSADBW.
44904 EVT InVT = Zext0.getOperand(0).getValueType();
44905 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44906
44907 // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
44908 // fill in the missing vector elements with 0.
44909 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44910 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44911 Ops[0] = Zext0.getOperand(0);
44912 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44913 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44914 Ops[0] = Zext1.getOperand(0);
44915 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44916
44917 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44918 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44919 ArrayRef<SDValue> Ops) {
44920 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44921 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44922 };
44923 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44924 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44925 PSADBWBuilder);
44926}
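
// Reference model (illustration only; the helper name is hypothetical): per
// 64-bit lane, PSADBW sums the absolute differences of eight unsigned bytes
// and zero-extends the 16-bit result into the lane, which is why a single
// PSADBW collapses eight byte elements of the reduction at once.
static unsigned long long sketchPSADBWLane(const unsigned char A[8],
                                           const unsigned char B[8]) {
  unsigned long long Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += A[i] > B[i] ? A[i] - B[i] : B[i] - A[i]; // |A[i] - B[i]|
  return Sum; // Low 16 bits of the 64-bit lane; upper bits are zero.
}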
44927
44928// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44929// PHMINPOSUW.
44930static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44931 const X86Subtarget &Subtarget) {
44932 // Bail without SSE41.
44933 if (!Subtarget.hasSSE41())
44934 return SDValue();
44935
44936 EVT ExtractVT = Extract->getValueType(0);
44937 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44938 return SDValue();
44939
44940 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44941 ISD::NodeType BinOp;
44942 SDValue Src = DAG.matchBinOpReduction(
44943 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44944 if (!Src)
44945 return SDValue();
44946
44947 EVT SrcVT = Src.getValueType();
44948 EVT SrcSVT = SrcVT.getScalarType();
44949 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44950 return SDValue();
44951
44952 SDLoc DL(Extract);
44953 SDValue MinPos = Src;
44954
44955 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44956 while (SrcVT.getSizeInBits() > 128) {
44957 SDValue Lo, Hi;
44958 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44959 SrcVT = Lo.getValueType();
44960 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44961 }
44962   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44963           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44964          "Unexpected value type");
44965
44966 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
44967 // to flip the value accordingly.
44968 SDValue Mask;
44969 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44970 if (BinOp == ISD::SMAX)
44971 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44972 else if (BinOp == ISD::SMIN)
44973 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44974 else if (BinOp == ISD::UMAX)
44975 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44976
44977 if (Mask)
44978 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44979
44980 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44981 // shuffling each upper element down and inserting zeros. This means that the
44982 // v16i8 UMIN will leave each upper element as zero, performing the
44983 // zero-extension ready for the PHMINPOS.
44984 if (ExtractVT == MVT::i8) {
44985 SDValue Upper = DAG.getVectorShuffle(
44986 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44987 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44988 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44989 }
44990
44991 // Perform the PHMINPOS on a v8i16 vector.
44992 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44993 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44994 MinPos = DAG.getBitcast(SrcVT, MinPos);
44995
44996 if (Mask)
44997 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44998
44999 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45000 DAG.getIntPtrConstant(0, DL));
45001}
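
// Illustration (hypothetical helper, shown for the i16 case): the XOR masks
// above map each reduction onto PHMINPOSUW's unsigned-min domain.
// X ^ 0x7FFF reverses signed order (SMAX -> UMIN), X ^ 0x8000 converts signed
// order to unsigned order (SMIN -> UMIN), and X ^ 0xFFFF reverses unsigned
// order (UMAX -> UMIN); XOR-ing with the same mask afterwards recovers the
// original element value.
static unsigned short sketchToUMinDomain(unsigned short X, unsigned short Mask) {
  return (unsigned short)(X ^ Mask); // Mask is 0x7FFF, 0x8000 or 0xFFFF.
}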
45002
45003// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45004static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45005 const X86Subtarget &Subtarget) {
45006 // Bail without SSE2.
45007 if (!Subtarget.hasSSE2())
45008 return SDValue();
45009
45010 EVT ExtractVT = Extract->getValueType(0);
45011 unsigned BitWidth = ExtractVT.getSizeInBits();
45012 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45013 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45014 return SDValue();
45015
45016 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45017 ISD::NodeType BinOp;
45018 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45019 if (!Match && ExtractVT == MVT::i1)
45020 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45021 if (!Match)
45022 return SDValue();
45023
45024 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45025 // which we can't support here for now.
45026 if (Match.getScalarValueSizeInBits() != BitWidth)
45027 return SDValue();
45028
45029 SDValue Movmsk;
45030 SDLoc DL(Extract);
45031 EVT MatchVT = Match.getValueType();
45032 unsigned NumElts = MatchVT.getVectorNumElements();
45033 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45034 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45035 LLVMContext &Ctx = *DAG.getContext();
45036
45037 if (ExtractVT == MVT::i1) {
45038 // Special case for (pre-legalization) vXi1 reductions.
45039 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45040 return SDValue();
45041 if (Match.getOpcode() == ISD::SETCC) {
45042 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45043 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45044 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45045 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45046 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45047 X86::CondCode X86CC;
45048 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45049 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45050 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45051 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45052 DAG, X86CC))
45053 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45054 getSETCC(X86CC, V, DL, DAG));
45055 }
45056 }
45057 if (TLI.isTypeLegal(MatchVT)) {
45058 // If this is a legal AVX512 predicate type then we can just bitcast.
45059 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45060 Movmsk = DAG.getBitcast(MovmskVT, Match);
45061 } else {
45062 // Use combineBitcastvxi1 to create the MOVMSK.
45063 while (NumElts > MaxElts) {
45064 SDValue Lo, Hi;
45065 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45066 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45067 NumElts /= 2;
45068 }
45069 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45070 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45071 }
45072 if (!Movmsk)
45073 return SDValue();
45074 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45075 } else {
45076 // FIXME: Better handling of k-registers or 512-bit vectors?
45077 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45078 if (!(MatchSizeInBits == 128 ||
45079 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45080 return SDValue();
45081
45082 // Make sure this isn't a vector of 1 element. The perf win from using
45083 // MOVMSK diminishes with fewer elements in the reduction, but it is
45084 // generally better to get the comparison over to the GPRs as soon as
45085 // possible to reduce the number of vector ops.
45086 if (Match.getValueType().getVectorNumElements() < 2)
45087 return SDValue();
45088
45089 // Check that we are extracting a reduction of all sign bits.
45090 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45091 return SDValue();
45092
45093 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45094 SDValue Lo, Hi;
45095 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45096 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45097 MatchSizeInBits = Match.getValueSizeInBits();
45098 }
45099
45100 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45101 MVT MaskSrcVT;
45102 if (64 == BitWidth || 32 == BitWidth)
45103 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45104 MatchSizeInBits / BitWidth);
45105 else
45106 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45107
45108 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45109 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45110 NumElts = MaskSrcVT.getVectorNumElements();
45111 }
45112   assert((NumElts <= 32 || NumElts == 64) &&
45113          "Not expecting more than 64 elements");
45114
45115 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45116 if (BinOp == ISD::XOR) {
45117 // parity -> (PARITY(MOVMSK X))
45118 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45119 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45120 }
45121
45122 SDValue CmpC;
45123 ISD::CondCode CondCode;
45124 if (BinOp == ISD::OR) {
45125 // any_of -> MOVMSK != 0
45126 CmpC = DAG.getConstant(0, DL, CmpVT);
45127 CondCode = ISD::CondCode::SETNE;
45128 } else {
45129 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45130 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45131 DL, CmpVT);
45132 CondCode = ISD::CondCode::SETEQ;
45133 }
45134
45135 // The setcc produces an i8 of 0/1, so extend that to the result width and
45136 // negate to get the final 0/-1 mask value.
45137 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45138 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45139 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45140 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45141 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45142}
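
// Reference model (illustration only; helper names are hypothetical): once the
// per-element compare results sit in a MOVMSK-style bitmask, the reductions
// handled above become plain scalar tests.
static bool sketchAnyOf(unsigned long long Movmsk) { return Movmsk != 0; }
static bool sketchAllOf(unsigned long long Movmsk, unsigned NumElts) {
  // all_of -> MOVMSK == ((1 << NumElts) - 1), assuming NumElts <= 64.
  unsigned long long Full = NumElts >= 64 ? ~0ULL : ((1ULL << NumElts) - 1);
  return Movmsk == Full;
}
static bool sketchParity(unsigned long long Movmsk) {
  bool Odd = false;
  for (; Movmsk; Movmsk &= Movmsk - 1) // Clear one set bit per iteration.
    Odd = !Odd;
  return Odd; // parity -> PARITY(MOVMSK X)
}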
45143
45144static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45145 const X86Subtarget &Subtarget) {
45146 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45147 return SDValue();
45148
45149 EVT ExtractVT = Extract->getValueType(0);
45150 // Verify the type we're extracting is i32, as the output element type of
45151 // vpdpbusd is i32.
45152 if (ExtractVT != MVT::i32)
45153 return SDValue();
45154
45155 EVT VT = Extract->getOperand(0).getValueType();
45156 if (!isPowerOf2_32(VT.getVectorNumElements()))
45157 return SDValue();
45158
45159 // Match shuffle + add pyramid.
45160 ISD::NodeType BinOp;
45161 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45162
45163 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45164 // done by vpdpbusd computes a signed 16-bit product that will be sign-extended
45165 // before being added into the accumulator.
45166 // TODO:
45167 // We also need to verify that the multiply has at least 2x the number of bits
45168 // of the input. We shouldn't match
45169 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45170 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45171 // Root = Root.getOperand(0);
45172
45173 // If there was a match, we want Root to be a mul.
45174 if (!Root || Root.getOpcode() != ISD::MUL)
45175 return SDValue();
45176
45177 // Check whether we have an extend and mul pattern
45178 SDValue LHS, RHS;
45179 if (!detectExtMul(DAG, Root, LHS, RHS))
45180 return SDValue();
45181
45182 // Create the dot product instruction.
45183 SDLoc DL(Extract);
45184 unsigned StageBias;
45185 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45186
45187 // If the original vector was wider than 4 elements, sum over the results
45188 // in the DP vector.
45189 unsigned Stages = Log2_32(VT.getVectorNumElements());
45190 EVT DpVT = DP.getValueType();
45191
45192 if (Stages > StageBias) {
45193 unsigned DpElems = DpVT.getVectorNumElements();
45194
45195 for (unsigned i = Stages - StageBias; i > 0; --i) {
45196 SmallVector<int, 16> Mask(DpElems, -1);
45197 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45198 Mask[j] = MaskEnd + j;
45199
45200 SDValue Shuffle =
45201 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45202 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45203 }
45204 }
45205
45206 // Return the lowest ExtractSizeInBits bits.
45207 EVT ResVT =
45208 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45209 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45210 DP = DAG.getBitcast(ResVT, DP);
45211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45212 Extract->getOperand(1));
45213}
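
// Worked example (illustration only; the helper is hypothetical): VPDPBUSD
// treats its second source as signed bytes, which is why a zext*zext reduction
// cannot be lowered to it. With a = 3 and b = 200 as unsigned bytes the zext
// product is 3 * 200 = 600, but the instruction would see b as -56 and compute
// 3 * -56 = -168.
static int sketchWhyZextZextIsWrongForVPDPBUSD() {
  unsigned A = 3, B = 200;
  int Expected = (int)(A * B);                      // 600
  int SignedB = (B & 0x80) ? (int)B - 256 : (int)B; // -56, as VPDPBUSD reads it.
  int DotProduct = (int)A * SignedB;                // -168
  return Expected - DotProduct;                     // Non-zero: the results differ.
}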
45214
45215static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45216 const X86Subtarget &Subtarget) {
45217 // PSADBW is only supported on SSE2 and up.
45218 if (!Subtarget.hasSSE2())
45219 return SDValue();
45220
45221 EVT ExtractVT = Extract->getValueType(0);
45222 // Verify the type we're extracting is either i32 or i64.
45223 // FIXME: Could support other types, but this is what we have coverage for.
45224 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45225 return SDValue();
45226
45227 EVT VT = Extract->getOperand(0).getValueType();
45228 if (!isPowerOf2_32(VT.getVectorNumElements()))
45229 return SDValue();
45230
45231 // Match shuffle + add pyramid.
45232 ISD::NodeType BinOp;
45233 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45234
45235 // The operand is expected to be zero-extended from i8
45236 // (verified in detectZextAbsDiff).
45237 // To convert to i64 and above, an additional any/zero/sign
45238 // extend is expected.
45239 // The zero extend from 32 bits has no mathematical effect on the result.
45240 // The sign extend is also effectively a zero extend
45241 // (it extends the sign bit, which is zero).
45242 // So it is correct to skip the sign/zero extend instruction.
45243 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45244 Root.getOpcode() == ISD::ZERO_EXTEND ||
45245 Root.getOpcode() == ISD::ANY_EXTEND))
45246 Root = Root.getOperand(0);
45247
45248 // If there was a match, we want Root to be an ABS node that is the root of
45249 // an abs-diff pattern.
45250 if (!Root || Root.getOpcode() != ISD::ABS)
45251 return SDValue();
45252
45253 // Check whether we have an abs-diff pattern feeding into the ABS.
45254 SDValue Zext0, Zext1;
45255 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45256 return SDValue();
45257
45258 // Create the SAD instruction.
45259 SDLoc DL(Extract);
45260 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45261
45262 // If the original vector was wider than 8 elements, sum over the results
45263 // in the SAD vector.
45264 unsigned Stages = Log2_32(VT.getVectorNumElements());
45265 EVT SadVT = SAD.getValueType();
45266 if (Stages > 3) {
45267 unsigned SadElems = SadVT.getVectorNumElements();
45268
45269 for(unsigned i = Stages - 3; i > 0; --i) {
45270 SmallVector<int, 16> Mask(SadElems, -1);
45271 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45272 Mask[j] = MaskEnd + j;
45273
45274 SDValue Shuffle =
45275 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45276 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45277 }
45278 }
45279
45280 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45281 // Return the lowest ExtractSizeInBits bits.
45282 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45283 SadVT.getSizeInBits() / ExtractSizeInBits);
45284 SAD = DAG.getBitcast(ResVT, SAD);
45285 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45286 Extract->getOperand(1));
45287}
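
// Illustration (hypothetical scalar model of the shuffle+add pyramid above,
// assuming a power-of-2 element count): each step adds the upper half of the
// live partial sums onto the lower half, so log2(Elems) steps leave the total
// in element 0. PSADBW itself already covered log2(8) = 3 stages of the byte
// reduction, hence the "Stages - 3" iterations.
static unsigned long long sketchSumPyramid(unsigned long long Partial[],
                                           unsigned Elems) {
  for (unsigned Half = Elems / 2; Half != 0; Half /= 2)
    for (unsigned i = 0; i != Half; ++i)
      Partial[i] += Partial[i + Half]; // "Shuffle the upper half down" + add.
  return Partial[0];
}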
45288
45289// Attempt to peek through a target shuffle and extract the scalar from the
45290// source.
45291static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45292 TargetLowering::DAGCombinerInfo &DCI,
45293 const X86Subtarget &Subtarget) {
45294 if (DCI.isBeforeLegalizeOps())
45295 return SDValue();
45296
45297 SDLoc dl(N);
45298 SDValue Src = N->getOperand(0);
45299 SDValue Idx = N->getOperand(1);
45300
45301 EVT VT = N->getValueType(0);
45302 EVT SrcVT = Src.getValueType();
45303 EVT SrcSVT = SrcVT.getVectorElementType();
45304 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45305 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45306
45307 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45308 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45309 return SDValue();
45310
45311 const APInt &IdxC = N->getConstantOperandAPInt(1);
45312 if (IdxC.uge(NumSrcElts))
45313 return SDValue();
45314
45315 SDValue SrcBC = peekThroughBitcasts(Src);
45316
45317 // Handle extract(bitcast(broadcast(scalar_value))).
45318 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45319 SDValue SrcOp = SrcBC.getOperand(0);
45320 EVT SrcOpVT = SrcOp.getValueType();
45321 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45322 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45323 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45324 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45325 // TODO support non-zero offsets.
45326 if (Offset == 0) {
45327 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45328 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45329 return SrcOp;
45330 }
45331 }
45332 }
45333
45334 // If we're extracting a single element from a broadcast load and there are
45335 // no other users, just create a single load.
45336 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45337 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45338 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45339 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45340 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45341 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45342 MemIntr->getBasePtr(),
45343 MemIntr->getPointerInfo(),
45344 MemIntr->getOriginalAlign(),
45345 MemIntr->getMemOperand()->getFlags());
45346 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45347 return Load;
45348 }
45349 }
45350
45351 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45352 // TODO: Move to DAGCombine?
45353 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45354 SrcBC.getValueType().isInteger() &&
45355 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45356 SrcBC.getScalarValueSizeInBits() ==
45357 SrcBC.getOperand(0).getValueSizeInBits()) {
45358 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45359 if (IdxC.ult(Scale)) {
45360 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45361 SDValue Scl = SrcBC.getOperand(0);
45362 EVT SclVT = Scl.getValueType();
45363 if (Offset) {
45364 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45365 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45366 }
45367 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45368 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45369 return Scl;
45370 }
45371 }
45372
45373 // Handle extract(truncate(x)) for the 0th index.
45374 // TODO: Treat this as a faux shuffle?
45375 // TODO: When can we use this for general indices?
45376 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45377 (SrcVT.getSizeInBits() % 128) == 0) {
45378 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45379 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45380 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45381 Idx);
45382 }
45383
45384 // We can only legally extract other elements from 128-bit vectors and in
45385 // certain circumstances, depending on SSE-level.
45386 // TODO: Investigate float/double extraction if it will be just stored.
45387 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45388 unsigned Idx) {
45389 EVT VecSVT = VecVT.getScalarType();
45390 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45391 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45392 VecSVT == MVT::i64)) {
45393 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45394 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45395 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45396 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45397 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45398 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45399 Idx &= (NumEltsPerLane - 1);
45400 }
45401 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45402 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45403 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45404 DAG.getBitcast(VecVT, Vec),
45405 DAG.getIntPtrConstant(Idx, dl));
45406 }
45407 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45408 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45409 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45410 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45411 DAG.getTargetConstant(Idx, dl, MVT::i8));
45412 }
45413 return SDValue();
45414 };
45415
45416 // Resolve the target shuffle inputs and mask.
45417 SmallVector<int, 16> Mask;
45418 SmallVector<SDValue, 2> Ops;
45419 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45420 return SDValue();
45421
45422 // Shuffle inputs must be the same size as the result.
45423 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45424 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45425 }))
45426 return SDValue();
45427
45428 // Attempt to narrow/widen the shuffle mask to the correct size.
45429 if (Mask.size() != NumSrcElts) {
45430 if ((NumSrcElts % Mask.size()) == 0) {
45431 SmallVector<int, 16> ScaledMask;
45432 int Scale = NumSrcElts / Mask.size();
45433 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45434 Mask = std::move(ScaledMask);
45435 } else if ((Mask.size() % NumSrcElts) == 0) {
45436 // Simplify Mask based on demanded element.
45437 int ExtractIdx = (int)IdxC.getZExtValue();
45438 int Scale = Mask.size() / NumSrcElts;
45439 int Lo = Scale * ExtractIdx;
45440 int Hi = Scale * (ExtractIdx + 1);
45441 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45442 if (i < Lo || Hi <= i)
45443 Mask[i] = SM_SentinelUndef;
45444
45445 SmallVector<int, 16> WidenedMask;
45446 while (Mask.size() > NumSrcElts &&
45447 canWidenShuffleElements(Mask, WidenedMask))
45448 Mask = std::move(WidenedMask);
45449 }
45450 }
45451
45452 // If narrowing/widening failed, see if we can extract+zero-extend.
45453 int ExtractIdx;
45454 EVT ExtractVT;
45455 if (Mask.size() == NumSrcElts) {
45456 ExtractIdx = Mask[IdxC.getZExtValue()];
45457 ExtractVT = SrcVT;
45458 } else {
45459 unsigned Scale = Mask.size() / NumSrcElts;
45460 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45461 return SDValue();
45462 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45463 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45464 return SDValue();
45465 ExtractIdx = Mask[ScaledIdx];
45466 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45467 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45468     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45469            "Failed to widen vector type");
45470 }
45471
45472 // If the shuffle source element is undef/zero then we can just accept it.
45473 if (ExtractIdx == SM_SentinelUndef)
45474 return DAG.getUNDEF(VT);
45475
45476 if (ExtractIdx == SM_SentinelZero)
45477 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45478 : DAG.getConstant(0, dl, VT);
45479
45480 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45481 ExtractIdx = ExtractIdx % Mask.size();
45482 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45483 return DAG.getZExtOrTrunc(V, dl, VT);
45484
45485 return SDValue();
45486}
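
// Illustration (hypothetical scalar model): extracting a known element through
// a resolved shuffle is a table lookup -- the mask entry selects which shuffle
// input and which element within it to read, matching the
// Ops[ExtractIdx / Mask.size()] / ExtractIdx % Mask.size() step above.
// Sentinel (negative) entries are handled by the caller.
static int sketchExtractThroughShuffle(const int Mask[], unsigned MaskSize,
                                       unsigned Idx, unsigned &OpIndex) {
  int M = Mask[Idx];
  if (M < 0)
    return -1;                      // Undef/zero sentinel.
  OpIndex = (unsigned)M / MaskSize; // Which shuffle input to read from.
  return M % (int)MaskSize;         // Element index within that input.
}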
45487
45488/// Extracting a scalar FP value from vector element 0 is free, so extract each
45489/// operand first, then perform the math as a scalar op.
45490static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45491 const X86Subtarget &Subtarget) {
45492   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45493 SDValue Vec = ExtElt->getOperand(0);
45494 SDValue Index = ExtElt->getOperand(1);
45495 EVT VT = ExtElt->getValueType(0);
45496 EVT VecVT = Vec.getValueType();
45497
45498 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45499 // non-zero element because the shuffle+scalar op will be cheaper?
45500 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45501 return SDValue();
45502
45503 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45504 // extract, the condition code), so deal with those as a special-case.
45505 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45506 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45507 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45508 return SDValue();
45509
45510 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45511 SDLoc DL(ExtElt);
45512 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45513 Vec.getOperand(0), Index);
45514 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45515 Vec.getOperand(1), Index);
45516 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45517 }
45518
45519 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45520 VT != MVT::f64)
45521 return SDValue();
45522
45523 // Vector FP selects don't fit the pattern of FP math ops (because the
45524 // condition has a different type and we have to change the opcode), so deal
45525 // with those here.
45526 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45527 // has i1 elements. If we loosen this we need to convert vector bool to a
45528 // scalar bool.
45529 if (Vec.getOpcode() == ISD::VSELECT &&
45530 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45531 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45532 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45533 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45534 SDLoc DL(ExtElt);
45535 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45536 Vec.getOperand(0).getValueType().getScalarType(),
45537 Vec.getOperand(0), Index);
45538 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45539 Vec.getOperand(1), Index);
45540 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45541 Vec.getOperand(2), Index);
45542 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45543 }
45544
45545 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45546 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45547 // missed load folding and fma+fneg combining.
45548 switch (Vec.getOpcode()) {
45549 case ISD::FMA: // Begin 3 operands
45550 case ISD::FMAD:
45551 case ISD::FADD: // Begin 2 operands
45552 case ISD::FSUB:
45553 case ISD::FMUL:
45554 case ISD::FDIV:
45555 case ISD::FREM:
45556 case ISD::FCOPYSIGN:
45557 case ISD::FMINNUM:
45558 case ISD::FMAXNUM:
45559 case ISD::FMINNUM_IEEE:
45560 case ISD::FMAXNUM_IEEE:
45561 case ISD::FMAXIMUM:
45562 case ISD::FMINIMUM:
45563 case X86ISD::FMAX:
45564 case X86ISD::FMIN:
45565 case ISD::FABS: // Begin 1 operand
45566 case ISD::FSQRT:
45567 case ISD::FRINT:
45568 case ISD::FCEIL:
45569 case ISD::FTRUNC:
45570 case ISD::FNEARBYINT:
45571 case ISD::FROUND:
45572 case ISD::FFLOOR:
45573 case X86ISD::FRCP:
45574 case X86ISD::FRSQRT: {
45575 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45576 SDLoc DL(ExtElt);
45577 SmallVector<SDValue, 4> ExtOps;
45578 for (SDValue Op : Vec->ops())
45579 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45580 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45581 }
45582 default:
45583 return SDValue();
45584 }
45585   llvm_unreachable("All opcodes should return within switch");
45586}
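
// Illustration (hypothetical): the rewrite above relies on the identity
// extractelt(fop(X, Y), 0) == fop(extractelt(X, 0), extractelt(Y, 0)) for
// element-wise FP ops -- lane 0 of the vector op depends only on lane 0 of
// each operand, shown here for an FADD.
static double sketchScalarizedLane0(const double X[], const double Y[]) {
  return X[0] + Y[0]; // Equals element 0 of the vector FADD of X and Y.
}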
45587
45588/// Try to convert a vector reduction sequence composed of binops and shuffles
45589/// into horizontal ops.
45590static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45591 const X86Subtarget &Subtarget) {
45592   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45593
45594 // We need at least SSE2 to do anything here.
45595 if (!Subtarget.hasSSE2())
45596 return SDValue();
45597
45598 ISD::NodeType Opc;
45599 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45600 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45601 if (!Rdx)
45602 return SDValue();
45603
45604 SDValue Index = ExtElt->getOperand(1);
45605   assert(isNullConstant(Index) &&
45606          "Reduction doesn't end in an extract from index 0");
45607
45608 EVT VT = ExtElt->getValueType(0);
45609 EVT VecVT = Rdx.getValueType();
45610 if (VecVT.getScalarType() != VT)
45611 return SDValue();
45612
45613 SDLoc DL(ExtElt);
45614 unsigned NumElts = VecVT.getVectorNumElements();
45615 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45616
45617 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45618 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45619 if (V.getValueType() == MVT::v4i8) {
45620 if (ZeroExtend && Subtarget.hasSSE41()) {
45621 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45622 DAG.getConstant(0, DL, MVT::v4i32),
45623 DAG.getBitcast(MVT::i32, V),
45624 DAG.getIntPtrConstant(0, DL));
45625 return DAG.getBitcast(MVT::v16i8, V);
45626 }
45627 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45628 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45629 : DAG.getUNDEF(MVT::v4i8));
45630 }
45631 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45632 DAG.getUNDEF(MVT::v8i8));
45633 };
45634
45635 // vXi8 mul reduction - promote to vXi16 mul reduction.
45636 if (Opc == ISD::MUL) {
45637 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45638 return SDValue();
45639 if (VecVT.getSizeInBits() >= 128) {
45640 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45641 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45642 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45643 Lo = DAG.getBitcast(WideVT, Lo);
45644 Hi = DAG.getBitcast(WideVT, Hi);
45645 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45646 while (Rdx.getValueSizeInBits() > 128) {
45647 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45648 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45649 }
45650 } else {
45651 Rdx = WidenToV16I8(Rdx, false);
45652 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45653 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45654 }
45655 if (NumElts >= 8)
45656 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45657 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45658 {4, 5, 6, 7, -1, -1, -1, -1}));
45659 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45660 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45661 {2, 3, -1, -1, -1, -1, -1, -1}));
45662 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45663 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45664 {1, -1, -1, -1, -1, -1, -1, -1}));
45665 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45666 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45667 }
45668
45669 // vXi8 add reduction - sub-128-bit vector.
45670 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45671 Rdx = WidenToV16I8(Rdx, true);
45672 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45673 DAG.getConstant(0, DL, MVT::v16i8));
45674 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45675 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45676 }
45677
45678 // Must be a >=128-bit vector with pow2 elements.
45679 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45680 return SDValue();
45681
45682 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45683 if (VT == MVT::i8) {
45684 while (Rdx.getValueSizeInBits() > 128) {
45685 SDValue Lo, Hi;
45686 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45687 VecVT = Lo.getValueType();
45688 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45689 }
45690     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45691
45692 SDValue Hi = DAG.getVectorShuffle(
45693 MVT::v16i8, DL, Rdx, Rdx,
45694 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45695 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45696 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45697 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45698 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45699 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45700 }
45701
45702 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45703 // If the source vector values are 0-255, then we can use PSADBW to
45704 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45705 // TODO: See if it's worth avoiding vXi16/i32 truncations?
45706 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45707 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45708 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45709 Subtarget.hasAVX512())) {
45710 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45711 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45712 if (ByteVT.getSizeInBits() < 128)
45713 Rdx = WidenToV16I8(Rdx, true);
45714
45715 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45716 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45717 ArrayRef<SDValue> Ops) {
45718 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45719 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45720 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45721 };
45722 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45723 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45724
45725 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45726 while (Rdx.getValueSizeInBits() > 128) {
45727 SDValue Lo, Hi;
45728 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45729 VecVT = Lo.getValueType();
45730 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45731 }
45732     assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45733
45734 if (NumElts > 8) {
45735 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45736 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45737 }
45738
45739 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45740 Rdx = DAG.getBitcast(VecVT, Rdx);
45741 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45742 }
45743
45744 // Only use (F)HADD opcodes if they aren't microcoded or if it minimizes codesize.
45745 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45746 return SDValue();
45747
45748 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45749
45750 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45751 // across the whole vector, so we need an extract + hop preliminary stage.
45752 // This is the only step where the operands of the hop are not the same value.
45753 // TODO: We could extend this to handle 512-bit or even longer vectors.
45754 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45755 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45756 unsigned NumElts = VecVT.getVectorNumElements();
45757 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45758 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45759 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45760 VecVT = Rdx.getValueType();
45761 }
45762 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45763 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45764 return SDValue();
45765
45766 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45767 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45768 for (unsigned i = 0; i != ReductionSteps; ++i)
45769 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45770
45771 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45772}
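
// Reference model (illustration only; the helper is hypothetical): one
// 128-bit HADD step with both operands equal replaces each adjacent pair with
// its sum and duplicates the lower half into the upper half, so applying it
// log2(N) times leaves the full reduction in element 0, as in the loop above.
static void sketchHaddSelf(int X[], unsigned N) {
  for (unsigned i = 0; i != N / 2; ++i)
    X[i] = X[2 * i] + X[2 * i + 1]; // Pairwise sums into the lower half.
  for (unsigned i = N / 2; i != N; ++i)
    X[i] = X[i - N / 2];            // HADD(X, X) repeats them in the upper half.
}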
45773
45774/// Detect vector gather/scatter index generation and convert it from being a
45775/// bunch of shuffles and extracts into a somewhat faster sequence.
45776/// For i686, the best sequence is apparently storing the value and loading
45777/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45778static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45779 TargetLowering::DAGCombinerInfo &DCI,
45780 const X86Subtarget &Subtarget) {
45781 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45782 return NewOp;
45783
45784 SDValue InputVector = N->getOperand(0);
45785 SDValue EltIdx = N->getOperand(1);
45786 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45787
45788 EVT SrcVT = InputVector.getValueType();
45789 EVT VT = N->getValueType(0);
45790 SDLoc dl(InputVector);
45791 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45792 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45793 unsigned NumEltBits = VT.getScalarSizeInBits();
45794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45795
45796 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45797 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45798
45799 // Integer Constant Folding.
45800 if (CIdx && VT.isInteger()) {
45801 APInt UndefVecElts;
45802 SmallVector<APInt, 16> EltBits;
45803 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45804 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45805 EltBits, true, false)) {
45806 uint64_t Idx = CIdx->getZExtValue();
45807 if (UndefVecElts[Idx])
45808 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45809 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45810 }
45811
45812 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45813 // Improves lowering of bool masks on Rust, which splits them into a byte array.
45814 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45815 SDValue Src = peekThroughBitcasts(InputVector);
45816 if (Src.getValueType().getScalarType() == MVT::i1 &&
45817 TLI.isTypeLegal(Src.getValueType())) {
45818 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45819 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45820 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45821 return DAG.getBitcast(VT, Sub);
45822 }
45823 }
45824 }
45825
45826 if (IsPextr) {
45827 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45828 DCI))
45829 return SDValue(N, 0);
45830
45831 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45832 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45833 InputVector.getOpcode() == X86ISD::PINSRW) &&
45834 InputVector.getOperand(2) == EltIdx) {
45835       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45836              "Vector type mismatch");
45837 SDValue Scl = InputVector.getOperand(1);
45838 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45839 return DAG.getZExtOrTrunc(Scl, dl, VT);
45840 }
45841
45842 // TODO - Remove this once we can handle the implicit zero-extension of
45843 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45844 // combineBasicSADPattern.
45845 return SDValue();
45846 }
45847
45848 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
45849 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
45850 InputVector.getOpcode() == ISD::BITCAST &&
45851 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45852 isNullConstant(EltIdx) && InputVector.hasOneUse())
45853 return DAG.getBitcast(VT, InputVector);
45854
45855 // Detect mmx to i32 conversion through a v2i32 elt extract.
45856 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
45857 InputVector.getOpcode() == ISD::BITCAST &&
45858 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45859 isNullConstant(EltIdx) && InputVector.hasOneUse())
45860 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
45861 InputVector.getOperand(0));
45862
45863 // Check whether this extract is the root of a sum of absolute differences
45864 // pattern. This has to be done here because we really want it to happen
45865 // pre-legalization.
45866 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45867 return SAD;
45868
45869 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45870 return VPDPBUSD;
45871
45872 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45873 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45874 return Cmp;
45875
45876 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45877 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45878 return MinMax;
45879
45880 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45881 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45882 return V;
45883
45884 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45885 return V;
45886
45887 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
45888 // and then testing the relevant element.
45889 //
45890 // Note that we only combine extracts on the *same* result number, i.e.
45891 // t0 = merge_values a0, a1, a2, a3
45892 // i1 = extract_vector_elt t0, Constant:i64<2>
45893 // i1 = extract_vector_elt t0, Constant:i64<3>
45894 // but not
45895 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45896 // since the latter would need its own MOVMSK.
45897 if (SrcVT.getScalarType() == MVT::i1) {
45898 bool IsVar = !CIdx;
45899 SmallVector<SDNode *, 16> BoolExtracts;
45900 unsigned ResNo = InputVector.getResNo();
45901 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45902 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45903 Use->getOperand(0).getResNo() == ResNo &&
45904 Use->getValueType(0) == MVT::i1) {
45905 BoolExtracts.push_back(Use);
45906 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45907 return true;
45908 }
45909 return false;
45910 };
45911 // TODO: Can we drop the oneuse check for constant extracts?
45912 if (all_of(InputVector->uses(), IsBoolExtract) &&
45913 (IsVar || BoolExtracts.size() > 1)) {
45914 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45915 if (SDValue BC =
45916 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45917 for (SDNode *Use : BoolExtracts) {
45918 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45919 // Mask = 1 << MaskIdx
45920 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45921 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45922 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45923 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45924 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45925 DCI.CombineTo(Use, Res);
45926 }
45927 return SDValue(N, 0);
45928 }
45929 }
45930 }
45931
45932 // If this extract is from a loaded vector value and will be used as an
45933 // integer, that requires a potentially expensive XMM -> GPR transfer.
45934 // Additionally, if we can convert to a scalar integer load, that will likely
45935 // be folded into a subsequent integer op.
45936 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45937 // to a single use of the loaded vector. For the reasons above, we
45938 // expect this to be profitable even if it creates an extra load.
45939 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45940 return Use->getOpcode() == ISD::STORE ||
45941 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45942 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45943 });
45944 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45945 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45946 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45947 !LikelyUsedAsVector && LoadVec->isSimple()) {
45948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45949 SDValue NewPtr =
45950 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45951 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45952 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45953 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45954 SDValue Load =
45955 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45956 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45957 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45958 return Load;
45959 }
45960
45961 return SDValue();
45962}
45963
45964// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45965// This is more or less the reverse of combineBitcastvxi1.
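// Illustrative expansion (types assumed, not part of the original comment):
// for (v8i16 sign_extend (v8i1 bitcast (i8 X))) this builds roughly
//   bcast = broadcast X into all 8 lanes
//   bits  = and bcast, <1,2,4,8,16,32,64,128>
//   vec   = sext (setcc eq bits, <1,2,4,8,16,32,64,128>)
// followed by a logical shift right for the ZERO_EXTEND/ANY_EXTEND cases.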
45966static SDValue combineToExtendBoolVectorInReg(
45967 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45968 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45969 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45970 Opcode != ISD::ANY_EXTEND)
45971 return SDValue();
45972 if (!DCI.isBeforeLegalizeOps())
45973 return SDValue();
45974 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45975 return SDValue();
45976
45977 EVT SVT = VT.getScalarType();
45978 EVT InSVT = N0.getValueType().getScalarType();
45979 unsigned EltSizeInBits = SVT.getSizeInBits();
45980
45981 // Input type must be extending a bool vector (bit-casted from a scalar
45982 // integer) to legal integer types.
45983 if (!VT.isVector())
45984 return SDValue();
45985 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45986 return SDValue();
45987 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45988 return SDValue();
45989
45990 SDValue N00 = N0.getOperand(0);
45991 EVT SclVT = N00.getValueType();
45992 if (!SclVT.isScalarInteger())
45993 return SDValue();
45994
45995 SDValue Vec;
45996 SmallVector<int> ShuffleMask;
45997 unsigned NumElts = VT.getVectorNumElements();
45998   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45999
46000 // Broadcast the scalar integer to the vector elements.
46001 if (NumElts > EltSizeInBits) {
46002 // If the scalar integer is greater than the vector element size, then we
46003 // must split it down into sub-sections for broadcasting. For example:
46004 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46005 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46006     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46007 unsigned Scale = NumElts / EltSizeInBits;
46008 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46009 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46010 Vec = DAG.getBitcast(VT, Vec);
46011
46012 for (unsigned i = 0; i != Scale; ++i)
46013 ShuffleMask.append(EltSizeInBits, i);
46014 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46015 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46016 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46017 // If we have register broadcast instructions, use the scalar size as the
46018 // element type for the shuffle. Then cast to the wider element type. The
46019 // widened bits won't be used, and this might allow the use of a broadcast
46020 // load.
46021     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46022 unsigned Scale = EltSizeInBits / NumElts;
46023 EVT BroadcastVT =
46024 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46025 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46026 ShuffleMask.append(NumElts * Scale, 0);
46027 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46028 Vec = DAG.getBitcast(VT, Vec);
46029 } else {
46030 // For smaller scalar integers, we can simply any-extend it to the vector
46031 // element size (we don't care about the upper bits) and broadcast it to all
46032 // elements.
46033 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46034 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46035 ShuffleMask.append(NumElts, 0);
46036 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46037 }
46038
46039 // Now, mask the relevant bit in each element.
46040 SmallVector<SDValue, 32> Bits;
46041 for (unsigned i = 0; i != NumElts; ++i) {
46042 int BitIdx = (i % EltSizeInBits);
46043 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46044 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46045 }
46046 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46047 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46048
46049 // Compare against the bitmask and extend the result.
46050 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46051 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46052 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46053
46054 // For SEXT, this is now done, otherwise shift the result down for
46055 // zero-extension.
46056 if (Opcode == ISD::SIGN_EXTEND)
46057 return Vec;
46058 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46059 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46060}
46061
46062/// If a vector select has an operand that is -1 or 0, try to simplify the
46063/// select to a bitwise logic operation.
46064/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46065static SDValue
46066combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46067 TargetLowering::DAGCombinerInfo &DCI,
46068 const X86Subtarget &Subtarget) {
46069 SDValue Cond = N->getOperand(0);
46070 SDValue LHS = N->getOperand(1);
46071 SDValue RHS = N->getOperand(2);
46072 EVT VT = LHS.getValueType();
46073 EVT CondVT = Cond.getValueType();
46074 SDLoc DL(N);
46075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46076
46077 if (N->getOpcode() != ISD::VSELECT)
46078 return SDValue();
46079
46080   assert(CondVT.isVector() && "Vector select expects a vector selector!");
46081
46082 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46083 // TODO: Can we assert that both operands are not zeros (because that should
46084 // get simplified at node creation time)?
46085 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46086 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46087
46088 // If both inputs are 0/undef, create a complete zero vector.
46089 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46090 if (TValIsAllZeros && FValIsAllZeros) {
46091 if (VT.isFloatingPoint())
46092 return DAG.getConstantFP(0.0, DL, VT);
46093 return DAG.getConstant(0, DL, VT);
46094 }
46095
46096 // To use the condition operand as a bitwise mask, it must have elements that
46097 // are the same size as the select elements. I.e., the condition operand must
46098 // have already been promoted from the IR select condition type <N x i1>.
46099 // Don't check if the types themselves are equal because that excludes
46100 // vector floating-point selects.
46101 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46102 return SDValue();
46103
46104 // Try to invert the condition if true value is not all 1s and false value is
46105 // not all 0s. Only do this if the condition has one use.
46106 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46107 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46108 // Check if the selector will be produced by CMPP*/PCMP*.
46109 Cond.getOpcode() == ISD::SETCC &&
46110 // Check if SETCC has already been promoted.
46111 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46112 CondVT) {
46113 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46114
46115 if (TValIsAllZeros || FValIsAllOnes) {
46116 SDValue CC = Cond.getOperand(2);
46117 ISD::CondCode NewCC = ISD::getSetCCInverse(
46118 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46119 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46120 NewCC);
46121 std::swap(LHS, RHS);
46122 TValIsAllOnes = FValIsAllOnes;
46123 FValIsAllZeros = TValIsAllZeros;
46124 }
46125 }
46126
46127 // Cond value must be 'sign splat' to be converted to a logical op.
46128 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46129 return SDValue();
46130
46131 // vselect Cond, 111..., 000... -> Cond
46132 if (TValIsAllOnes && FValIsAllZeros)
46133 return DAG.getBitcast(VT, Cond);
46134
46135 if (!TLI.isTypeLegal(CondVT))
46136 return SDValue();
46137
46138 // vselect Cond, 111..., X -> or Cond, X
46139 if (TValIsAllOnes) {
46140 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46141 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46142 return DAG.getBitcast(VT, Or);
46143 }
46144
46145 // vselect Cond, X, 000... -> and Cond, X
46146 if (FValIsAllZeros) {
46147 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46148 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46149 return DAG.getBitcast(VT, And);
46150 }
46151
46152 // vselect Cond, 000..., X -> andn Cond, X
46153 if (TValIsAllZeros) {
46154 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46155 SDValue AndN;
46156 // The canonical form differs for i1 vectors - x86andnp is not used
46157 if (CondVT.getScalarType() == MVT::i1)
46158 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46159 CastRHS);
46160 else
46161 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46162 return DAG.getBitcast(VT, AndN);
46163 }
46164
46165 return SDValue();
46166}
46167
46168/// If both arms of a vector select are concatenated vectors, split the select,
46169/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46170/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46171/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46172static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46173 const X86Subtarget &Subtarget) {
46174 unsigned Opcode = N->getOpcode();
46175 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46176 return SDValue();
46177
46178 // TODO: Split 512-bit vectors too?
46179 EVT VT = N->getValueType(0);
46180 if (!VT.is256BitVector())
46181 return SDValue();
46182
46183 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46184 SDValue Cond = N->getOperand(0);
46185 SDValue TVal = N->getOperand(1);
46186 SDValue FVal = N->getOperand(2);
46187 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46188 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46189 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46190 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46191 return SDValue();
46192
46193 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46194 ArrayRef<SDValue> Ops) {
46195 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46196 };
46197 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46198 makeBlend, /*CheckBWI*/ false);
46199}
46200
46201static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46202 SDValue Cond = N->getOperand(0);
46203 SDValue LHS = N->getOperand(1);
46204 SDValue RHS = N->getOperand(2);
46205 SDLoc DL(N);
46206
46207 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46208 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46209 if (!TrueC || !FalseC)
46210 return SDValue();
46211
46212 // Don't do this for crazy integer types.
46213 EVT VT = N->getValueType(0);
46214 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46215 return SDValue();
46216
46217 // We're going to use the condition bit in math or logic ops. We could allow
46218 // this with a wider condition value (post-legalization it becomes an i8),
46219 // but if nothing is creating selects that late, it doesn't matter.
46220 if (Cond.getValueType() != MVT::i1)
46221 return SDValue();
46222
46223 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46224 // 3, 5, or 9 with i32/i64, so those get transformed too.
46225 // TODO: For constants that overflow or do not differ by power-of-2 or small
46226 // multiplier, convert to 'and' + 'add'.
46227 const APInt &TrueVal = TrueC->getAPIntValue();
46228 const APInt &FalseVal = FalseC->getAPIntValue();
46229
46230 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46231 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46232 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46233 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46234 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46235 return SDValue();
46236 }
46237
46238 bool OV;
46239 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46240 if (OV)
46241 return SDValue();
46242
46243 APInt AbsDiff = Diff.abs();
46244 if (AbsDiff.isPowerOf2() ||
46245 ((VT == MVT::i32 || VT == MVT::i64) &&
46246 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46247
46248 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46249 // of the condition can usually be folded into a compare predicate, but even
46250 // without that, the sequence should be cheaper than a CMOV alternative.
46251 if (TrueVal.slt(FalseVal)) {
46252 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46253 std::swap(TrueC, FalseC);
46254 }
46255
46256 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
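    // Worked example (i32 assumed): select Cond, 7, 4 has AbsDiff = 3, so
    //   R = zext(Cond); R = mul R, 3; R = add R, 4
    // which on i32/i64 can be covered by an LEA rather than a CMOV.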
46257 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46258
46259 // Multiply condition by the difference if non-one.
46260 if (!AbsDiff.isOne())
46261 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46262
46263 // Add the base if non-zero.
46264 if (!FalseC->isZero())
46265 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46266
46267 return R;
46268 }
46269
46270 return SDValue();
46271}
46272
46273/// If this is a *dynamic* select (non-constant condition) and we can match
46274/// this node with one of the variable blend instructions, restructure the
46275/// condition so that blends can use the high (sign) bit of each element.
46276/// This function will also call SimplifyDemandedBits on already created
46277/// BLENDV to perform additional simplifications.
46278static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46279 TargetLowering::DAGCombinerInfo &DCI,
46280 const X86Subtarget &Subtarget) {
46281 SDValue Cond = N->getOperand(0);
46282 if ((N->getOpcode() != ISD::VSELECT &&
46283 N->getOpcode() != X86ISD::BLENDV) ||
46284 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46285 return SDValue();
46286
46287 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46288 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46289 EVT VT = N->getValueType(0);
46290
46291 // We can only handle the cases where VSELECT is directly legal on the
46292 // subtarget. We custom lower VSELECT nodes with constant conditions and
46293 // this makes it hard to see whether a dynamic VSELECT will correctly
46294 // lower, so we both check the operation's status and explicitly handle the
46295 // cases where a *dynamic* blend will fail even though a constant-condition
46296 // blend could be custom lowered.
46297 // FIXME: We should find a better way to handle this class of problems.
46298 // Potentially, we should combine constant-condition vselect nodes
46299 // pre-legalization into shuffles and not mark as many types as custom
46300 // lowered.
46301 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46302 return SDValue();
46303 // FIXME: We don't support i16-element blends currently. We could and
46304 // should support them by making *all* the bits in the condition be set
46305 // rather than just the high bit and using an i8-element blend.
46306 if (VT.getVectorElementType() == MVT::i16)
46307 return SDValue();
46308 // Dynamic blending was only available from SSE4.1 onward.
46309 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46310 return SDValue();
46311 // Byte blends are only available in AVX2
46312 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46313 return SDValue();
46314 // There are no 512-bit blend instructions that use sign bits.
46315 if (VT.is512BitVector())
46316 return SDValue();
46317
46318 // Don't optimize before the condition has been transformed to a legal type
46319 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46320 if (BitWidth < 8 || BitWidth > 64)
46321 return SDValue();
46322
46323 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46324 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46325 UI != UE; ++UI)
46326 if ((UI->getOpcode() != ISD::VSELECT &&
46327 UI->getOpcode() != X86ISD::BLENDV) ||
46328 UI.getOperandNo() != 0)
46329 return false;
46330
46331 return true;
46332 };
46333
46334 APInt DemandedBits(APInt::getSignMask(BitWidth));
46335
46336 if (OnlyUsedAsSelectCond(Cond)) {
46337 KnownBits Known;
46338 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46339 !DCI.isBeforeLegalizeOps());
46340 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46341 return SDValue();
46342
46343 // If we changed the computation somewhere in the DAG, this change will
46344 // affect all users of Cond. Update all the nodes so that we do not use
46345 // the generic VSELECT anymore. Otherwise, we may perform wrong
46346 // optimizations as we messed with the actual expectation for the vector
46347 // boolean values.
46348 for (SDNode *U : Cond->uses()) {
46349 if (U->getOpcode() == X86ISD::BLENDV)
46350 continue;
46351
46352 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46353 Cond, U->getOperand(1), U->getOperand(2));
46354 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46355 DCI.AddToWorklist(U);
46356 }
46357 DCI.CommitTargetLoweringOpt(TLO);
46358 return SDValue(N, 0);
46359 }
46360
46361 // Otherwise we can still at least try to simplify multiple use bits.
46362 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46363 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46364 N->getOperand(1), N->getOperand(2));
46365
46366 return SDValue();
46367}
46368
46369// Try to match:
46370// (or (and (M, (sub 0, X)), (pandn M, X)))
46371// which is a special case of:
46372// (select M, (sub 0, X), X)
46373// Per:
46374// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46375// We know that, if fNegate is 0 or 1:
46376// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46377//
46378// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46379// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46380// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46381// This lets us transform our vselect to:
46382// (add (xor X, M), (and M, 1))
46383// And further to:
46384// (sub (xor X, M), M)
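// Worked check of the identity with an assumed lane value X = 5:
//   M = all-ones: (X ^ M) - M = (~5) - (-1) = -6 + 1 = -5   (negated)
//   M = 0:        (X ^ 0) - 0 = 5                           (unchanged)
// The values are only illustrative; any X with a 0/all-ones M behaves the same.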
46385static SDValue combineLogicBlendIntoConditionalNegate(
46386 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46387 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46388 EVT MaskVT = Mask.getValueType();
46389   assert(MaskVT.isInteger() &&
46390          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46391          "Mask must be zero/all-bits");
46392
46393 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46394 return SDValue();
46395 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46396 return SDValue();
46397
46398 auto IsNegV = [](SDNode *N, SDValue V) {
46399 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46400 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46401 };
46402
46403 SDValue V;
46404 if (IsNegV(Y.getNode(), X))
46405 V = X;
46406 else if (IsNegV(X.getNode(), Y))
46407 V = Y;
46408 else
46409 return SDValue();
46410
46411 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46412 SDValue SubOp2 = Mask;
46413
46414 // If the negate was on the false side of the select, then
46415 // the operands of the SUB need to be swapped. PR 27251.
46416 // This is because the pattern being matched above is
46417 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
46418 // but if the pattern matched was
46419 // (vselect M, X, (sub 0, X)), which is really the negation of the pattern
46420 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46421 // pattern also needs to be a negation of the replacement pattern above.
46422 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46423 // sub accomplishes the negation of the replacement pattern.
46424 if (V == Y)
46425 std::swap(SubOp1, SubOp2);
46426
46427 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46428 return DAG.getBitcast(VT, Res);
46429}
46430
46431/// Do target-specific dag combines on SELECT and VSELECT nodes.
46432static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46433 TargetLowering::DAGCombinerInfo &DCI,
46434 const X86Subtarget &Subtarget) {
46435 SDLoc DL(N);
46436 SDValue Cond = N->getOperand(0);
46437 SDValue LHS = N->getOperand(1);
46438 SDValue RHS = N->getOperand(2);
46439
46440 // Try simplification again because we use this function to optimize
46441 // BLENDV nodes that are not handled by the generic combiner.
46442 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46443 return V;
46444
46445 EVT VT = LHS.getValueType();
46446 EVT CondVT = Cond.getValueType();
46447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46448 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46449
46450 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46451 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46452 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46453 if (CondVT.isVector() && CondVT.isInteger() &&
46454 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46455 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46456 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46457 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46458 DL, DAG, Subtarget))
46459 return V;
46460
46461 // Convert vselects with constant condition into shuffles.
46462 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46463 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46464 SmallVector<int, 64> Mask;
46465 if (createShuffleMaskFromVSELECT(Mask, Cond,
46466 N->getOpcode() == X86ISD::BLENDV))
46467 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46468 }
46469
46470 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46471 // by forcing the unselected elements to zero.
46472 // TODO: Can we handle more shuffles with this?
46473 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46474 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46475 LHS.hasOneUse() && RHS.hasOneUse()) {
46476 MVT SimpleVT = VT.getSimpleVT();
46477 SmallVector<SDValue, 1> LHSOps, RHSOps;
46478 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46479 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46480 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46481 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46482 int NumElts = VT.getVectorNumElements();
46483 for (int i = 0; i != NumElts; ++i) {
46484 // getConstVector sets negative shuffle mask values as undef, so ensure
46485 // we hardcode SM_SentinelZero values to zero (0x80).
46486 if (CondMask[i] < NumElts) {
46487 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46488 RHSMask[i] = 0x80;
46489 } else {
46490 LHSMask[i] = 0x80;
46491 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46492 }
46493 }
46494 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46495 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46496 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46497 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46498 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46499 }
46500 }
46501
46502 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46503 // instructions match the semantics of the common C idiom x<y?x:y but not
46504 // x<=y?x:y, because of how they handle negative zero (which can be
46505 // ignored in unsafe-math mode).
46506 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46507 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46508 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46509 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46510 (Subtarget.hasSSE2() ||
46511 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46512 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46513
46514 unsigned Opcode = 0;
46515 // Check for x CC y ? x : y.
46516 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46517 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46518 switch (CC) {
46519 default: break;
46520 case ISD::SETULT:
46521 // Converting this to a min would handle NaNs incorrectly, and swapping
46522 // the operands would cause it to handle comparisons between positive
46523 // and negative zero incorrectly.
46524 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46525 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46526 !(DAG.isKnownNeverZeroFloat(LHS) ||
46527 DAG.isKnownNeverZeroFloat(RHS)))
46528 break;
46529 std::swap(LHS, RHS);
46530 }
46531 Opcode = X86ISD::FMIN;
46532 break;
46533 case ISD::SETOLE:
46534 // Converting this to a min would handle comparisons between positive
46535 // and negative zero incorrectly.
46536 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46537 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46538 break;
46539 Opcode = X86ISD::FMIN;
46540 break;
46541 case ISD::SETULE:
46542 // Converting this to a min would handle both negative zeros and NaNs
46543 // incorrectly, but we can swap the operands to fix both.
46544 std::swap(LHS, RHS);
46545 [[fallthrough]];
46546 case ISD::SETOLT:
46547 case ISD::SETLT:
46548 case ISD::SETLE:
46549 Opcode = X86ISD::FMIN;
46550 break;
46551
46552 case ISD::SETOGE:
46553 // Converting this to a max would handle comparisons between positive
46554 // and negative zero incorrectly.
46555 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46556 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46557 break;
46558 Opcode = X86ISD::FMAX;
46559 break;
46560 case ISD::SETUGT:
46561 // Converting this to a max would handle NaNs incorrectly, and swapping
46562 // the operands would cause it to handle comparisons between positive
46563 // and negative zero incorrectly.
46564 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46565 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46566 !(DAG.isKnownNeverZeroFloat(LHS) ||
46567 DAG.isKnownNeverZeroFloat(RHS)))
46568 break;
46569 std::swap(LHS, RHS);
46570 }
46571 Opcode = X86ISD::FMAX;
46572 break;
46573 case ISD::SETUGE:
46574 // Converting this to a max would handle both negative zeros and NaNs
46575 // incorrectly, but we can swap the operands to fix both.
46576 std::swap(LHS, RHS);
46577 [[fallthrough]];
46578 case ISD::SETOGT:
46579 case ISD::SETGT:
46580 case ISD::SETGE:
46581 Opcode = X86ISD::FMAX;
46582 break;
46583 }
46584 // Check for x CC y ? y : x -- a min/max with reversed arms.
46585 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46586 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46587 switch (CC) {
46588 default: break;
46589 case ISD::SETOGE:
46590 // Converting this to a min would handle comparisons between positive
46591 // and negative zero incorrectly, and swapping the operands would
46592 // cause it to handle NaNs incorrectly.
46593 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46594 !(DAG.isKnownNeverZeroFloat(LHS) ||
46595 DAG.isKnownNeverZeroFloat(RHS))) {
46596 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46597 break;
46598 std::swap(LHS, RHS);
46599 }
46600 Opcode = X86ISD::FMIN;
46601 break;
46602 case ISD::SETUGT:
46603 // Converting this to a min would handle NaNs incorrectly.
46604 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46605 break;
46606 Opcode = X86ISD::FMIN;
46607 break;
46608 case ISD::SETUGE:
46609 // Converting this to a min would handle both negative zeros and NaNs
46610 // incorrectly, but we can swap the operands to fix both.
46611 std::swap(LHS, RHS);
46612 [[fallthrough]];
46613 case ISD::SETOGT:
46614 case ISD::SETGT:
46615 case ISD::SETGE:
46616 Opcode = X86ISD::FMIN;
46617 break;
46618
46619 case ISD::SETULT:
46620 // Converting this to a max would handle NaNs incorrectly.
46621 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46622 break;
46623 Opcode = X86ISD::FMAX;
46624 break;
46625 case ISD::SETOLE:
46626 // Converting this to a max would handle comparisons between positive
46627 // and negative zero incorrectly, and swapping the operands would
46628 // cause it to handle NaNs incorrectly.
46629 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46630 !DAG.isKnownNeverZeroFloat(LHS) &&
46631 !DAG.isKnownNeverZeroFloat(RHS)) {
46632 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46633 break;
46634 std::swap(LHS, RHS);
46635 }
46636 Opcode = X86ISD::FMAX;
46637 break;
46638 case ISD::SETULE:
46639 // Converting this to a max would handle both negative zeros and NaNs
46640 // incorrectly, but we can swap the operands to fix both.
46641 std::swap(LHS, RHS);
46642 [[fallthrough]];
46643 case ISD::SETOLT:
46644 case ISD::SETLT:
46645 case ISD::SETLE:
46646 Opcode = X86ISD::FMAX;
46647 break;
46648 }
46649 }
46650
46651 if (Opcode)
46652 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46653 }
46654
46655 // Some mask scalar intrinsics rely on checking if only one bit is set
46656 // and implement it in C code like this:
46657 // A[0] = (U & 1) ? A[0] : W[0];
46658 // This creates some redundant instructions that break pattern matching.
46659 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46660 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46661 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46662 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46663 SDValue AndNode = Cond.getOperand(0);
46664 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46665 isNullConstant(Cond.getOperand(1)) &&
46666 isOneConstant(AndNode.getOperand(1))) {
46667 // LHS and RHS are swapped because the setcc outputs 1 when the AND
46668 // result is 0, and vice versa.
46669 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46670 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46671 }
46672 }
46673
46674 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46675 // lowering on KNL. In this case we convert it to
46676 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
46677 // The same situation applies to all vectors of i8 and i16 without BWI.
46678 // Make sure we extend these even before type legalization gets a chance to
46679 // split wide vectors.
46680 // Since SKX these selects have a proper lowering.
46681 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46682 CondVT.getVectorElementType() == MVT::i1 &&
46683 (VT.getVectorElementType() == MVT::i8 ||
46684 VT.getVectorElementType() == MVT::i16)) {
46685 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46686 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46687 }
46688
46689 // AVX512 - Extend select with zero to merge with target shuffle.
46690 // select(mask, extract_subvector(shuffle(x)), zero) -->
46691 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46692 // TODO - support non target shuffles as well.
46693 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46694 CondVT.getVectorElementType() == MVT::i1) {
46695 auto SelectableOp = [&TLI](SDValue Op) {
46696 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46697 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46698 isNullConstant(Op.getOperand(1)) &&
46699 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46700 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46701 };
46702
46703 bool SelectableLHS = SelectableOp(LHS);
46704 bool SelectableRHS = SelectableOp(RHS);
46705 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46706 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46707
46708 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46709 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46710 : RHS.getOperand(0).getValueType();
46711 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46712 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46713 VT.getSizeInBits());
46714 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46715 VT.getSizeInBits());
46716 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46717 DAG.getUNDEF(SrcCondVT), Cond,
46718 DAG.getIntPtrConstant(0, DL));
46719 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46720 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46721 }
46722 }
46723
46724 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46725 return V;
46726
46727 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46728 Cond.hasOneUse()) {
46729 EVT CondVT = Cond.getValueType();
46730 SDValue Cond0 = Cond.getOperand(0);
46731 SDValue Cond1 = Cond.getOperand(1);
46732 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46733
46734 // Canonicalize min/max:
46735 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46736 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46737 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46738 // the need for an extra compare against zero. e.g.
46739 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46740 // subl %esi, %edi
46741 // testl %edi, %edi
46742 // movl $0, %eax
46743 // cmovgl %edi, %eax
46744 // =>
46745 // xorl %eax, %eax
46746 // subl %esi, %edi
46747 // cmovsl %eax, %edi
46748 //
46749 // We can also canonicalize
46750 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46751 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46752 // This allows the use of a test instruction for the compare.
46753 if (LHS == Cond0 && RHS == Cond1) {
46754 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46755 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46756 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46757 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46758 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46759 }
46760 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46761 ISD::CondCode NewCC = ISD::SETUGE;
46762 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46763 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46764 }
46765 }
46766
46767 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46768 // fold eq + gt/lt nested selects into ge/le selects
46769 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46770 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46771 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46772 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46773 // .. etc ..
46774 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46775 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46776 SDValue InnerSetCC = RHS.getOperand(0);
46777 ISD::CondCode InnerCC =
46778 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46779 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46780 Cond0 == InnerSetCC.getOperand(0) &&
46781 Cond1 == InnerSetCC.getOperand(1)) {
46782 ISD::CondCode NewCC;
46783 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46784 case ISD::SETGT: NewCC = ISD::SETGE; break;
46785 case ISD::SETLT: NewCC = ISD::SETLE; break;
46786 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46787 case ISD::SETULT: NewCC = ISD::SETULE; break;
46788 default: NewCC = ISD::SETCC_INVALID; break;
46789 }
46790 if (NewCC != ISD::SETCC_INVALID) {
46791 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46792 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46793 }
46794 }
46795 }
46796 }
46797
46798 // Check if the first operand is all zeros and Cond type is vXi1.
46799 // If this an avx512 target we can improve the use of zero masking by
46800 // swapping the operands and inverting the condition.
46801 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46802 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46803 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46804 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46805 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46806 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46807 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46808 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46809 }
46810
46811 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46812 // get split by legalization.
46813 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46814 CondVT.getVectorElementType() == MVT::i1 &&
46815 TLI.isTypeLegal(VT.getScalarType())) {
46816 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46817 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46818 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46819 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46820 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46821 }
46822 }
46823
46824 // Early exit check
46825 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46826 return SDValue();
46827
46828 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46829 return V;
46830
46831 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46832 return V;
46833
46834 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46835 return V;
46836
46837 // select(~Cond, X, Y) -> select(Cond, Y, X)
46838 if (CondVT.getScalarType() != MVT::i1) {
46839 if (SDValue CondNot = IsNOT(Cond, DAG))
46840 return DAG.getNode(N->getOpcode(), DL, VT,
46841 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46842
46843 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46844 // signbit.
46845 if (Cond.getOpcode() == X86ISD::PCMPGT &&
46846 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
46847 Cond.hasOneUse()) {
46848 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46849 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46850 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46851 }
46852 }
46853
46854 // Try to optimize vXi1 selects if both operands are either all constants or
46855 // bitcasts from scalar integer type. In that case we can convert the operands
46856 // to integer and use an integer select which will be converted to a CMOV.
46857 // We need to take a little bit of care to avoid creating an i64 type after
46858 // type legalization.
46859 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46860 VT.getVectorElementType() == MVT::i1 &&
46861 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46862 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46863 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
46864 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46865 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46866
46867 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
46868 LHS.getOperand(0).getValueType() == IntVT)) &&
46869 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
46870 RHS.getOperand(0).getValueType() == IntVT))) {
46871 if (LHSIsConst)
46872 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46873 else
46874 LHS = LHS.getOperand(0);
46875
46876 if (RHSIsConst)
46877 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46878 else
46879 RHS = RHS.getOperand(0);
46880
46881 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46882 return DAG.getBitcast(VT, Select);
46883 }
46884 }
46885 }
46886
46887 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46888 // single bits, then invert the predicate and swap the select operands.
46889 // This can lower using a vector shift bit-hack rather than mask and compare.
46890 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46891 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46892 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46893 Cond.getOperand(0).getOpcode() == ISD::AND &&
46894 isNullOrNullSplat(Cond.getOperand(1)) &&
46895 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46896 Cond.getOperand(0).getValueType() == VT) {
46897 // The 'and' mask must be composed of power-of-2 constants.
46898 SDValue And = Cond.getOperand(0);
46899 auto *C = isConstOrConstSplat(And.getOperand(1));
46900 if (C && C->getAPIntValue().isPowerOf2()) {
46901 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46902 SDValue NotCond =
46903 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46904 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46905 }
46906
46907 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46908 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46909 // 16-bit lacks a proper blendv.
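    // Illustrative example (v4i32 assumed): with C = <1,2,4,8>, the select
    //   ((X & C) == 0) ? Y : Z
    // becomes
    //   ((X << <31,30,29,28>) s< 0) ? Z : Y
    // so each mask bit lands in its lane's sign bit and can drive BLENDV.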
46910 unsigned EltBitWidth = VT.getScalarSizeInBits();
46911 bool CanShiftBlend =
46912 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46913 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46914 (Subtarget.hasXOP()));
46915 if (CanShiftBlend &&
46916 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46917 return C->getAPIntValue().isPowerOf2();
46918 })) {
46919 // Create a left-shift constant to get the mask bits over to the sign-bit.
46920 SDValue Mask = And.getOperand(1);
46921 SmallVector<int, 32> ShlVals;
46922 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46923 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46924 ShlVals.push_back(EltBitWidth - 1 -
46925 MaskVal->getAPIntValue().exactLogBase2());
46926 }
46927 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46928 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46929 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46930 SDValue NewCond =
46931 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46932 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46933 }
46934 }
46935
46936 return SDValue();
46937}
46938
46939/// Combine:
46940/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46941/// to:
46942/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46943/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46944/// Note that this is only legal for some op/cc combinations.
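/// Illustrative source-level motivation (assumed C code, not from this file):
///   if (__atomic_fetch_add(&x, 1, __ATOMIC_SEQ_CST) < 0) ...   // tests COND_S
/// can reuse the EFLAGS of `lock add $1, x` with COND_LE instead, because
/// old < 0 is equivalent to old + 1 <= 0, i.e. the updated value being <= 0.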
46945static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46946 SelectionDAG &DAG,
46947 const X86Subtarget &Subtarget) {
46948 // This combine only operates on CMP-like nodes.
46949 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46950 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46951 return SDValue();
46952
46953 // Can't replace the cmp if it has more uses than the one we're looking at.
46954 // FIXME: We would like to be able to handle this, but would need to make sure
46955 // all uses were updated.
46956 if (!Cmp.hasOneUse())
46957 return SDValue();
46958
46959 // This only applies to variations of the common case:
46960 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46961 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46962 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46963 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46964 // Using the proper condcodes (see below), overflow is checked for.
46965
46966 // FIXME: We can generalize both constraints:
46967 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46968 // - LHS != 1
46969 // if the result is compared.
46970
46971 SDValue CmpLHS = Cmp.getOperand(0);
46972 SDValue CmpRHS = Cmp.getOperand(1);
46973 EVT CmpVT = CmpLHS.getValueType();
46974
46975 if (!CmpLHS.hasOneUse())
46976 return SDValue();
46977
46978 unsigned Opc = CmpLHS.getOpcode();
46979 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46980 return SDValue();
46981
46982 SDValue OpRHS = CmpLHS.getOperand(2);
46983 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46984 if (!OpRHSC)
46985 return SDValue();
46986
46987 APInt Addend = OpRHSC->getAPIntValue();
46988 if (Opc == ISD::ATOMIC_LOAD_SUB)
46989 Addend = -Addend;
46990
46991 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46992 if (!CmpRHSC)
46993 return SDValue();
46994
46995 APInt Comparison = CmpRHSC->getAPIntValue();
46996 APInt NegAddend = -Addend;
46997
46998 // See if we can adjust the CC to make the comparison match the negated
46999 // addend.
47000 if (Comparison != NegAddend) {
47001 APInt IncComparison = Comparison + 1;
47002 if (IncComparison == NegAddend) {
47003 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47004 Comparison = IncComparison;
47005 CC = X86::COND_AE;
47006 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47007 Comparison = IncComparison;
47008 CC = X86::COND_L;
47009 }
47010 }
47011 APInt DecComparison = Comparison - 1;
47012 if (DecComparison == NegAddend) {
47013 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47014 Comparison = DecComparison;
47015 CC = X86::COND_A;
47016 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47017 Comparison = DecComparison;
47018 CC = X86::COND_LE;
47019 }
47020 }
47021 }
47022
47023 // If the addend is the negation of the comparison value, then we can do
47024 // a full comparison by emitting the atomic arithmetic as a locked sub.
47025 if (Comparison == NegAddend) {
47026 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47027 // atomic sub.
47028 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47029 auto AtomicSub = DAG.getAtomic(
47030 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47031 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47032 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47033 AN->getMemOperand());
47034 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47035 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47036 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47037 return LockOp;
47038 }
47039
47040 // We can handle comparisons with zero in a number of cases by manipulating
47041 // the CC used.
47042 if (!Comparison.isZero())
47043 return SDValue();
47044
47045 if (CC == X86::COND_S && Addend == 1)
47046 CC = X86::COND_LE;
47047 else if (CC == X86::COND_NS && Addend == 1)
47048 CC = X86::COND_G;
47049 else if (CC == X86::COND_G && Addend == -1)
47050 CC = X86::COND_GE;
47051 else if (CC == X86::COND_LE && Addend == -1)
47052 CC = X86::COND_L;
47053 else
47054 return SDValue();
47055
47056 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47057 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47058 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47059 return LockOp;
47060}
47061
47062// Check whether a boolean test is testing a boolean value generated by
47063// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47064// code.
47065//
47066// Simplify the following patterns:
47067// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47068// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47069// to (Op EFLAGS Cond)
47070//
47071// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47072// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47073// to (Op EFLAGS !Cond)
47074//
47075// where Op could be BRCOND or CMOV.
47076//
47077static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47078 // This combine only operates on CMP-like nodes.
47079 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47080 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47081 return SDValue();
47082
47083 // Quit if not used as a boolean value.
47084 if (CC != X86::COND_E && CC != X86::COND_NE)
47085 return SDValue();
47086
47087 // Check CMP operands. One of them should be 0 or 1 and the other should be
47088 // a SetCC or extended from it.
47089 SDValue Op1 = Cmp.getOperand(0);
47090 SDValue Op2 = Cmp.getOperand(1);
47091
47092 SDValue SetCC;
47093 const ConstantSDNode* C = nullptr;
47094 bool needOppositeCond = (CC == X86::COND_E);
47095 bool checkAgainstTrue = false; // Is it a comparison against 1?
47096
47097 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47098 SetCC = Op2;
47099 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47100 SetCC = Op1;
47101 else // Quit if neither operand is a constant.
47102 return SDValue();
47103
47104 if (C->getZExtValue() == 1) {
47105 needOppositeCond = !needOppositeCond;
47106 checkAgainstTrue = true;
47107 } else if (C->getZExtValue() != 0)
47108 // Quit if the constant is neither 0 nor 1.
47109 return SDValue();
47110
47111 bool truncatedToBoolWithAnd = false;
47112 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47113 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47114 SetCC.getOpcode() == ISD::TRUNCATE ||
47115 SetCC.getOpcode() == ISD::AND) {
47116 if (SetCC.getOpcode() == ISD::AND) {
47117 int OpIdx = -1;
47118 if (isOneConstant(SetCC.getOperand(0)))
47119 OpIdx = 1;
47120 if (isOneConstant(SetCC.getOperand(1)))
47121 OpIdx = 0;
47122 if (OpIdx < 0)
47123 break;
47124 SetCC = SetCC.getOperand(OpIdx);
47125 truncatedToBoolWithAnd = true;
47126 } else
47127 SetCC = SetCC.getOperand(0);
47128 }
47129
47130 switch (SetCC.getOpcode()) {
47131 case X86ISD::SETCC_CARRY:
47132 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47133 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47134 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47135 // truncated to i1 using 'and'.
47136 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47137 break;
47138     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47139            "Invalid use of SETCC_CARRY!");
47140 [[fallthrough]];
47141 case X86ISD::SETCC:
47142 // Set the condition code or opposite one if necessary.
47143 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47144 if (needOppositeCond)
47145 CC = X86::GetOppositeBranchCondition(CC);
47146 return SetCC.getOperand(1);
47147 case X86ISD::CMOV: {
47148 // Check whether false/true value has canonical one, i.e. 0 or 1.
47149 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47150 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47151 // Quit if true value is not a constant.
47152 if (!TVal)
47153 return SDValue();
47154 // Quit if false value is not a constant.
47155 if (!FVal) {
47156 SDValue Op = SetCC.getOperand(0);
47157 // Skip 'zext' or 'trunc' node.
47158 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47159 Op.getOpcode() == ISD::TRUNCATE)
47160 Op = Op.getOperand(0);
47161 // A special case for rdrand/rdseed, where 0 is set when the false
47162 // condition is found.
47163 if ((Op.getOpcode() != X86ISD::RDRAND &&
47164 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47165 return SDValue();
47166 }
47167 // Quit if false value is not the constant 0 or 1.
47168 bool FValIsFalse = true;
47169 if (FVal && FVal->getZExtValue() != 0) {
47170 if (FVal->getZExtValue() != 1)
47171 return SDValue();
47172 // If FVal is 1, opposite cond is needed.
47173 needOppositeCond = !needOppositeCond;
47174 FValIsFalse = false;
47175 }
47176 // Quit if TVal is not the constant opposite of FVal.
47177 if (FValIsFalse && TVal->getZExtValue() != 1)
47178 return SDValue();
47179 if (!FValIsFalse && TVal->getZExtValue() != 0)
47180 return SDValue();
47181 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47182 if (needOppositeCond)
47183 CC = X86::GetOppositeBranchCondition(CC);
47184 return SetCC.getOperand(3);
47185 }
47186 }
47187
47188 return SDValue();
47189}
47190
47191/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47192/// Match:
47193/// (X86or (X86setcc) (X86setcc))
47194/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47195static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47196 X86::CondCode &CC1, SDValue &Flags,
47197 bool &isAnd) {
47198 if (Cond->getOpcode() == X86ISD::CMP) {
47199 if (!isNullConstant(Cond->getOperand(1)))
47200 return false;
47201
47202 Cond = Cond->getOperand(0);
47203 }
47204
47205 isAnd = false;
47206
47207 SDValue SetCC0, SetCC1;
47208 switch (Cond->getOpcode()) {
47209 default: return false;
47210 case ISD::AND:
47211 case X86ISD::AND:
47212 isAnd = true;
47213 [[fallthrough]];
47214 case ISD::OR:
47215 case X86ISD::OR:
47216 SetCC0 = Cond->getOperand(0);
47217 SetCC1 = Cond->getOperand(1);
47218 break;
47219 };
47220
47221 // Make sure we have SETCC nodes, using the same flags value.
47222 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47223 SetCC1.getOpcode() != X86ISD::SETCC ||
47224 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47225 return false;
47226
47227 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47228 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47229 Flags = SetCC0->getOperand(1);
47230 return true;
47231}
47232
47233 // When legalizing carry, we create carries via add X, -1.
47234// If that comes from an actual carry, via setcc, we use the
47235// carry directly.
47236static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47237 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47238 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47239 bool FoundAndLSB = false;
47240 SDValue Carry = EFLAGS.getOperand(0);
47241 while (Carry.getOpcode() == ISD::TRUNCATE ||
47242 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47243 (Carry.getOpcode() == ISD::AND &&
47244 isOneConstant(Carry.getOperand(1)))) {
47245 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47246 Carry = Carry.getOperand(0);
47247 }
47248 if (Carry.getOpcode() == X86ISD::SETCC ||
47249 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47250 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47251 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47252 SDValue CarryOp1 = Carry.getOperand(1);
47253 if (CarryCC == X86::COND_B)
47254 return CarryOp1;
47255 if (CarryCC == X86::COND_A) {
47256 // Try to convert COND_A into COND_B in an attempt to facilitate
47257 // materializing "setb reg".
47258 //
47259 // Do not flip "e > c", where "c" is a constant, because the CMP
47260 // instruction cannot take an immediate as its first operand.
47261 //
47262 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47263 CarryOp1.getNode()->hasOneUse() &&
47264 CarryOp1.getValueType().isInteger() &&
47265 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47266 SDValue SubCommute =
47267 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47268 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47269 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47270 }
47271 }
47272 // If this is a check of the z flag of an add with 1, switch to the
47273 // C flag.
47274 if (CarryCC == X86::COND_E &&
47275 CarryOp1.getOpcode() == X86ISD::ADD &&
47276 isOneConstant(CarryOp1.getOperand(1)))
47277 return CarryOp1;
47278 } else if (FoundAndLSB) {
47279 SDLoc DL(Carry);
47280 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47281 if (Carry.getOpcode() == ISD::SRL) {
47282 BitNo = Carry.getOperand(1);
47283 Carry = Carry.getOperand(0);
47284 }
47285 return getBT(Carry, BitNo, DL, DAG);
47286 }
47287 }
47288 }
47289
47290 return SDValue();
47291}
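A minimal standalone sketch (editorial illustration, not part of X86ISelLowering.cpp) of the identity combineCarryThroughADD relies on: ADD(X, -1) sets CF exactly when X != 0, so when X is itself a 0/1 carry bit the produced CF equals X. The helper name is hypothetical.

#include <cstdint>

// Scalar stand-in for the CF produced by "add X, -1": the unsigned addition
// wraps exactly when X != 0.
constexpr bool carryOfAddAllOnes(uint32_t X) {
  return static_cast<uint64_t>(X) + 0xFFFFFFFFull > 0xFFFFFFFFull;
}

// For a 0/1 carry bit, the CF of the ADD is the carry bit itself, which is
// why the combine can return the original carry source directly.
static_assert(!carryOfAddAllOnes(0) && carryOfAddAllOnes(1),
              "CF(add X, -1) == X for X in {0, 1}");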
47292
47293/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47294/// to avoid the inversion.
47295static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47296 SelectionDAG &DAG,
47297 const X86Subtarget &Subtarget) {
47298 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47299 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47300 EFLAGS.getOpcode() != X86ISD::TESTP)
47301 return SDValue();
47302
47303 // PTEST/TESTP sets EFLAGS as:
47304 // TESTZ: ZF = (Op0 & Op1) == 0
47305 // TESTC: CF = (~Op0 & Op1) == 0
47306 // TESTNZC: ZF == 0 && CF == 0
47307 EVT VT = EFLAGS.getValueType();
47308 SDValue Op0 = EFLAGS.getOperand(0);
47309 SDValue Op1 = EFLAGS.getOperand(1);
47310 EVT OpVT = Op0.getValueType();
47311
47312 // TEST*(~X,Y) == TEST*(X,Y)
47313 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47314 X86::CondCode InvCC;
47315 switch (CC) {
47316 case X86::COND_B:
47317 // testc -> testz.
47318 InvCC = X86::COND_E;
47319 break;
47320 case X86::COND_AE:
47321 // !testc -> !testz.
47322 InvCC = X86::COND_NE;
47323 break;
47324 case X86::COND_E:
47325 // testz -> testc.
47326 InvCC = X86::COND_B;
47327 break;
47328 case X86::COND_NE:
47329 // !testz -> !testc.
47330 InvCC = X86::COND_AE;
47331 break;
47332 case X86::COND_A:
47333 case X86::COND_BE:
47334 // testnzc -> testnzc (no change).
47335 InvCC = CC;
47336 break;
47337 default:
47338 InvCC = X86::COND_INVALID;
47339 break;
47340 }
47341
47342 if (InvCC != X86::COND_INVALID) {
47343 CC = InvCC;
47344 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47345 DAG.getBitcast(OpVT, NotOp0), Op1);
47346 }
47347 }
47348
47349 if (CC == X86::COND_B || CC == X86::COND_AE) {
47350 // TESTC(X,~X) == TESTC(X,-1)
47351 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47352 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47353 SDLoc DL(EFLAGS);
47354 return DAG.getNode(EFLAGS.getOpcode(), DL, VT,
47355 DAG.getBitcast(OpVT, NotOp1),
47356 DAG.getAllOnesConstant(DL, OpVT));
47357 }
47358 }
47359 }
47360
47361 if (CC == X86::COND_E || CC == X86::COND_NE) {
47362 // TESTZ(X,~Y) == TESTC(Y,X)
47363 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47364 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47365 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47366 DAG.getBitcast(OpVT, NotOp1), Op0);
47367 }
47368
47369 if (Op0 == Op1) {
47370 SDValue BC = peekThroughBitcasts(Op0);
47371 EVT BCVT = BC.getValueType();
47372
47373 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47374 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47375 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47376 DAG.getBitcast(OpVT, BC.getOperand(0)),
47377 DAG.getBitcast(OpVT, BC.getOperand(1)));
47378 }
47379
47380 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47381 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47382 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47383 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47384 DAG.getBitcast(OpVT, BC.getOperand(0)),
47385 DAG.getBitcast(OpVT, BC.getOperand(1)));
47386 }
47387
47388 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47389 // to more efficiently extract the sign bits and compare that.
47390 // TODO: Handle TESTC with comparison inversion.
47391 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47392 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
47393 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47394 unsigned EltBits = BCVT.getScalarSizeInBits();
47395 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47396 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47397 APInt SignMask = APInt::getSignMask(EltBits);
47398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47399 if (SDValue Res =
47400 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47401 // For vXi16 cases we need to use pmovmskb and extract every other
47402 // sign bit.
47403 SDLoc DL(EFLAGS);
47404 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
47405 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47406 MVT FloatVT =
47407 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47408 Res = DAG.getBitcast(FloatVT, Res);
47409 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47410 } else if (EltBits == 16) {
47411 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47412 Res = DAG.getBitcast(MovmskVT, Res);
47413 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47414 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47415 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47416 } else {
47417 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47418 }
47419 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47420 DAG.getConstant(0, DL, MVT::i32));
47421 }
47422 }
47423 }
47424 }
47425
47426 // TESTZ(-1,X) == TESTZ(X,X)
47427 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47428 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47429
47430 // TESTZ(X,-1) == TESTZ(X,X)
47431 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47432 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47433
47434 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47435 // TODO: Add COND_NE handling?
47436 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47437 SDValue Src0 = peekThroughBitcasts(Op0);
47438 SDValue Src1 = peekThroughBitcasts(Op1);
47439 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47440 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47441 peekThroughBitcasts(Src0.getOperand(1)), true);
47442 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47443 peekThroughBitcasts(Src1.getOperand(1)), true);
47444 if (Src0 && Src1)
47445 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47446 DAG.getBitcast(MVT::v4i64, Src0),
47447 DAG.getBitcast(MVT::v4i64, Src1));
47448 }
47449 }
47450 }
47451
47452 return SDValue();
47453}
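A scalar sketch (editorial illustration, not part of the original file) of the CC adjustment performed by combinePTESTCC above, using the TESTZ/TESTC flag definitions quoted in its comment; the helper names are hypothetical.

#include <cstdint>

// PTEST/TESTP flag model on a scalar mask word:
//   TESTZ: ZF = (Op0 & Op1) == 0
//   TESTC: CF = (~Op0 & Op1) == 0
constexpr bool testZ(uint32_t Op0, uint32_t Op1) { return (Op0 & Op1) == 0; }
constexpr bool testC(uint32_t Op0, uint32_t Op1) { return (~Op0 & Op1) == 0; }

// TEST*(~X, Y): checking CF of (~X, Y) is the same as checking ZF of (X, Y)
// and vice versa, which is the COND_B <-> COND_E swap performed above.
static_assert(testC(~0x0F00u, 0x00FFu) == testZ(0x0F00u, 0x00FFu), "disjoint masks");
static_assert(testC(~0x0F0Fu, 0x00FFu) == testZ(0x0F0Fu, 0x00FFu), "overlapping masks");
static_assert(testZ(~0x0F0Fu, 0x00FFu) == testC(0x0F0Fu, 0x00FFu), "inverse direction");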
47454
47455// Attempt to simplify the MOVMSK input based on the comparison type.
47456static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47457 SelectionDAG &DAG,
47458 const X86Subtarget &Subtarget) {
47459 // Handle eq/ne against zero (any_of).
47460 // Handle eq/ne against -1 (all_of).
47461 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47462 return SDValue();
47463 if (EFLAGS.getValueType() != MVT::i32)
47464 return SDValue();
47465 unsigned CmpOpcode = EFLAGS.getOpcode();
47466 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47467 return SDValue();
47468 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47469 if (!CmpConstant)
47470 return SDValue();
47471 const APInt &CmpVal = CmpConstant->getAPIntValue();
47472
47473 SDValue CmpOp = EFLAGS.getOperand(0);
47474 unsigned CmpBits = CmpOp.getValueSizeInBits();
47475 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47476
47477 // Peek through any truncate.
47478 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47479 CmpOp = CmpOp.getOperand(0);
47480
47481 // Bail if we don't find a MOVMSK.
47482 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47483 return SDValue();
47484
47485 SDValue Vec = CmpOp.getOperand(0);
47486 MVT VecVT = Vec.getSimpleValueType();
47487 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47488        "Unexpected MOVMSK operand");
47489 unsigned NumElts = VecVT.getVectorNumElements();
47490 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47491
47492 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47493 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47494 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47495 if (!IsAnyOf && !IsAllOf)
47496 return SDValue();
47497
47498 // TODO: Check more combining cases.
47499 // Here we check the CMP's use count to decide whether to combine or not.
47500 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
47501 // combines below are restricted by this one-use constraint.
47502 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47503
47504 // See if we can peek through to a vector with a wider element type, if the
47505 // signbits extend down to all the sub-elements as well.
47506 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47507 // potential SimplifyDemandedBits/Elts cases.
47508 // If we looked through a truncate that discards bits, we can't do this
47509 // transform.
47510 // FIXME: We could do this transform for truncates that discarded bits by
47511 // inserting an AND mask between the new MOVMSK and the CMP.
47512 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47513 SDValue BC = peekThroughBitcasts(Vec);
47514 MVT BCVT = BC.getSimpleValueType();
47515 unsigned BCNumElts = BCVT.getVectorNumElements();
47516 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47517 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47518 BCNumEltBits > NumEltBits &&
47519 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47520 SDLoc DL(EFLAGS);
47521 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47522 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47523 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47524 DAG.getConstant(CmpMask, DL, MVT::i32));
47525 }
47526 }
47527
47528 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47529 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47530 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47531 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47532 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47533 SmallVector<SDValue> Ops;
47534 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47535 Ops.size() == 2) {
47536 SDLoc DL(EFLAGS);
47537 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47538 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47539 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47540 DAG.getBitcast(SubVT, Ops[0]),
47541 DAG.getBitcast(SubVT, Ops[1]));
47542 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47543 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47544 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47545 DAG.getConstant(CmpMask, DL, MVT::i32));
47546 }
47547 }
47548
47549 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47550 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47551 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47552 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
47553 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47554 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47555 SDValue BC = peekThroughBitcasts(Vec);
47556 // Ensure MOVMSK was testing every signbit of BC.
47557 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47558 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47559 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47560 BC.getOperand(0), BC.getOperand(1));
47561 V = DAG.getBitcast(TestVT, V);
47562 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47563 }
47564 // Check for 256-bit split vector cases.
47565 if (BC.getOpcode() == ISD::AND &&
47566 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47567 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47568 SDValue LHS = BC.getOperand(0);
47569 SDValue RHS = BC.getOperand(1);
47570 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47571 LHS.getOperand(0), LHS.getOperand(1));
47572 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47573 RHS.getOperand(0), RHS.getOperand(1));
47574 LHS = DAG.getBitcast(TestVT, LHS);
47575 RHS = DAG.getBitcast(TestVT, RHS);
47576 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47577 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47578 }
47579 }
47580 }
47581
47582 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47583 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47584 // sign bits prior to the comparison with zero unless we know that
47585 // the vXi16 splats the sign bit down to the lower i8 half.
47586 // TODO: Handle all_of patterns.
47587 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47588 SDValue VecOp0 = Vec.getOperand(0);
47589 SDValue VecOp1 = Vec.getOperand(1);
47590 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47591 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47592 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47593 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47594 SDLoc DL(EFLAGS);
47595 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47596 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47597 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47598 if (!SignExt0) {
47599 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47600 DAG.getConstant(0xAAAA, DL, MVT::i16));
47601 }
47602 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47603 DAG.getConstant(0, DL, MVT::i16));
47604 }
47605 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47606 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47607 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47608 (IsAnyOf || (SignExt0 && SignExt1))) {
47609 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47610 SDLoc DL(EFLAGS);
47611 SDValue Result = peekThroughBitcasts(Src);
47612 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47613 Result.getValueType().getVectorNumElements() <= NumElts) {
47614 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47615 Result.getOperand(0), Result.getOperand(1));
47616 V = DAG.getBitcast(MVT::v4i64, V);
47617 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47618 }
47619 Result = DAG.getBitcast(MVT::v32i8, Result);
47620 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47621 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47622 if (!SignExt0 || !SignExt1) {
47623 assert(IsAnyOf &&
47624        "Only perform v16i16 signmasks for any_of patterns");
47625 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47626 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47627 }
47628 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47629 DAG.getConstant(CmpMask, DL, MVT::i32));
47630 }
47631 }
47632 }
47633
47634 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47635 SmallVector<int, 32> ShuffleMask;
47636 SmallVector<SDValue, 2> ShuffleInputs;
47637 if (NumElts <= CmpBits &&
47638 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47639 ShuffleMask, DAG) &&
47640 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47641 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47642 unsigned NumShuffleElts = ShuffleMask.size();
47643 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47644 for (int M : ShuffleMask) {
47645 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47646 DemandedElts.setBit(M);
47647 }
47648 if (DemandedElts.isAllOnes()) {
47649 SDLoc DL(EFLAGS);
47650 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47651 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47652 Result =
47653 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47654 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47655 EFLAGS.getOperand(1));
47656 }
47657 }
47658
47659 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47660 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47661 // iff every element is referenced.
47662 if (NumElts <= CmpBits && IsAnyOf && Subtarget.hasAVX() && IsOneUse &&
47663 (NumEltBits == 32 || NumEltBits == 64)) {
47664 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47665 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47666 SDValue V = DAG.getBitcast(FloatVT, Vec);
47667 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), MVT::i32, V, V);
47668 }
47669
47670 return SDValue();
47671}
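An editorial sketch (not part of the original file) of the any_of/all_of CONCAT folds in combineSetCCMOVMSK above, modelling MOVMSK results as scalar mask words; Half and the helper are hypothetical, with a 4-lane half width chosen for illustration.

#include <cstdint>

// Model MOVMSK(CONCAT(X, Y)) as the low-half mask ORed with the high-half
// mask shifted up.
constexpr unsigned Half = 4;
constexpr uint32_t HalfMask = (1u << Half) - 1;
constexpr uint32_t FullMask = (1u << (2 * Half)) - 1;
constexpr uint32_t concatMask(uint32_t Lo, uint32_t Hi) {
  return (Lo & HalfMask) | ((Hi & HalfMask) << Half);
}

// any_of: MOVMSK(CONCAT(X,Y)) == 0 iff MOVMSK(OR(X,Y)) == 0.
static_assert((concatMask(0b0000, 0b0000) == 0) == ((0b0000 | 0b0000) == 0), "");
static_assert((concatMask(0b0010, 0b0000) == 0) == ((0b0010 | 0b0000) == 0), "");
// all_of: MOVMSK(CONCAT(X,Y)) == -1 iff MOVMSK(AND(X,Y)) == -1.
static_assert((concatMask(0b1111, 0b1111) == FullMask) ==
                  ((0b1111 & 0b1111) == HalfMask), "");
static_assert((concatMask(0b1111, 0b1101) == FullMask) ==
                  ((0b1111 & 0b1101) == HalfMask), "");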
47672
47673/// Optimize an EFLAGS definition used according to the condition code \p CC
47674/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47675/// uses of chain values.
47676static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47677 SelectionDAG &DAG,
47678 const X86Subtarget &Subtarget) {
47679 if (CC == X86::COND_B)
47680 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47681 return Flags;
47682
47683 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47684 return R;
47685
47686 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47687 return R;
47688
47689 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47690 return R;
47691
47692 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47693}
47694
47695/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47696static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47697 TargetLowering::DAGCombinerInfo &DCI,
47698 const X86Subtarget &Subtarget) {
47699 SDLoc DL(N);
47700
47701 SDValue FalseOp = N->getOperand(0);
47702 SDValue TrueOp = N->getOperand(1);
47703 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47704 SDValue Cond = N->getOperand(3);
47705
47706 // cmov X, X, ?, ? --> X
47707 if (TrueOp == FalseOp)
47708 return TrueOp;
47709
47710 // Try to simplify the EFLAGS and condition code operands.
47711 // We can't always do this as FCMOV only supports a subset of X86 cond.
47712 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47713 if (!(FalseOp.getValueType() == MVT::f80 ||
47714 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47715 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47716 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47717 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47718 Flags};
47719 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47720 }
47721 }
47722
47723 // If this is a select between two integer constants, try to do some
47724 // optimizations. Note that the operands are ordered the opposite of SELECT
47725 // operands.
47726 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47727 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47728 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47729 // larger than FalseC (the false value).
47730 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47731 CC = X86::GetOppositeBranchCondition(CC);
47732 std::swap(TrueC, FalseC);
47733 std::swap(TrueOp, FalseOp);
47734 }
47735
47736 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47737 // This is efficient for any integer data type (including i8/i16) and
47738 // shift amount.
47739 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47740 Cond = getSETCC(CC, Cond, DL, DAG);
47741
47742 // Zero extend the condition if needed.
47743 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47744
47745 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47746 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47747 DAG.getConstant(ShAmt, DL, MVT::i8));
47748 return Cond;
47749 }
47750
47751 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
47752 // for any integer data type, including i8/i16.
47753 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47754 Cond = getSETCC(CC, Cond, DL, DAG);
47755
47756 // Zero extend the condition if needed.
47757 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47758 FalseC->getValueType(0), Cond);
47759 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47760 SDValue(FalseC, 0));
47761 return Cond;
47762 }
47763
47764 // Optimize cases that will turn into an LEA instruction. This requires
47765 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47766 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47767 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47768 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47769        "Implicit constant truncation");
47770
47771 bool isFastMultiplier = false;
47772 if (Diff.ult(10)) {
47773 switch (Diff.getZExtValue()) {
47774 default: break;
47775 case 1: // result = add base, cond
47776 case 2: // result = lea base( , cond*2)
47777 case 3: // result = lea base(cond, cond*2)
47778 case 4: // result = lea base( , cond*4)
47779 case 5: // result = lea base(cond, cond*4)
47780 case 8: // result = lea base( , cond*8)
47781 case 9: // result = lea base(cond, cond*8)
47782 isFastMultiplier = true;
47783 break;
47784 }
47785 }
47786
47787 if (isFastMultiplier) {
47788 Cond = getSETCC(CC, Cond, DL ,DAG);
47789 // Zero extend the condition if needed.
47790 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47791 Cond);
47792 // Scale the condition by the difference.
47793 if (Diff != 1)
47794 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47795 DAG.getConstant(Diff, DL, Cond.getValueType()));
47796
47797 // Add the base if non-zero.
47798 if (FalseC->getAPIntValue() != 0)
47799 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47800 SDValue(FalseC, 0));
47801 return Cond;
47802 }
47803 }
47804 }
47805 }
47806
47807 // Handle these cases:
47808 // (select (x != c), e, c) -> (select (x != c), e, x),
47809 // (select (x == c), c, e) -> (select (x == c), x, e)
47810 // where c is an integer constant, and the "select" is the combination
47811 // of CMOV and CMP.
47812 //
47813 // The rationale for this change is that a conditional move from a constant
47814 // needs two instructions, whereas a conditional move from a register needs
47815 // only one.
47816 //
47817 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47818 // some instruction-combining opportunities. This opt needs to be
47819 // postponed as late as possible.
47820 //
47821 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47822 // the DCI.xxxx conditions are provided to postpone the optimization as
47823 // late as possible.
47824
47825 ConstantSDNode *CmpAgainst = nullptr;
47826 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47827 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47828 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47829
47830 if (CC == X86::COND_NE &&
47831 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47832 CC = X86::GetOppositeBranchCondition(CC);
47833 std::swap(TrueOp, FalseOp);
47834 }
47835
47836 if (CC == X86::COND_E &&
47837 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47838 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47839 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47840 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47841 }
47842 }
47843 }
47844
47845 // Transform:
47846 //
47847 // (cmov 1 T (uge T 2))
47848 //
47849 // to:
47850 //
47851 // (adc T 0 (sub T 1))
47852 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
47853 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47854 SDValue Cond0 = Cond.getOperand(0);
47855 if (Cond0.getOpcode() == ISD::TRUNCATE)
47856 Cond0 = Cond0.getOperand(0);
47857 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
47858 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47859 EVT CondVT = Cond->getValueType(0);
47860 EVT OuterVT = N->getValueType(0);
47861 // Subtract 1 and generate a carry.
47862 SDValue NewSub =
47863 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47864 DAG.getConstant(1, DL, CondVT));
47865 SDValue EFLAGS(NewSub.getNode(), 1);
47866 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
47867 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
47868 }
47869 }
47870
47871 // Fold and/or of setcc's to double CMOV:
47872 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47873 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47874 //
47875 // This combine lets us generate:
47876 // cmovcc1 (jcc1 if we don't have CMOV)
47877 // cmovcc2 (same)
47878 // instead of:
47879 // setcc1
47880 // setcc2
47881 // and/or
47882 // cmovne (jne if we don't have CMOV)
47883 // When we can't use the CMOV instruction, it might increase branch
47884 // mispredicts.
47885 // When we can use CMOV, or when there is no mispredict, this improves
47886 // throughput and reduces register pressure.
47887 //
47888 if (CC == X86::COND_NE) {
47889 SDValue Flags;
47890 X86::CondCode CC0, CC1;
47891 bool isAndSetCC;
47892 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47893 if (isAndSetCC) {
47894 std::swap(FalseOp, TrueOp);
47895 CC0 = X86::GetOppositeBranchCondition(CC0);
47896 CC1 = X86::GetOppositeBranchCondition(CC1);
47897 }
47898
47899 SDValue LOps[] = {FalseOp, TrueOp,
47900 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47901 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47902 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47903 Flags};
47904 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47905 return CMOV;
47906 }
47907 }
47908
47909 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47910 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47911 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47912 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47913 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47914 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47915 SDValue Add = TrueOp;
47916 SDValue Const = FalseOp;
47917 // Canonicalize the condition code for easier matching and output.
47918 if (CC == X86::COND_E)
47919 std::swap(Add, Const);
47920
47921 // We might have replaced the constant in the cmov with the LHS of the
47922 // compare. If so change it to the RHS of the compare.
47923 if (Const == Cond.getOperand(0))
47924 Const = Cond.getOperand(1);
47925
47926 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47927 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47928 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47929 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47930 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47931 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47932 EVT VT = N->getValueType(0);
47933 // This should constant fold.
47934 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47935 SDValue CMov =
47936 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47937 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47938 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47939 }
47940 }
47941
47942 return SDValue();
47943}
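An editorial sketch (not part of the original file) of the constant-select arithmetic used by combineCMov above: once TrueC is canonicalized to be the larger constant, the select becomes base-plus-scaled-condition, which maps onto SETCC plus SHL/LEA. The helper name is hypothetical.

#include <cstdint>

// CMOV(FalseC, TrueC, Cond) as arithmetic on the zero-extended condition.
constexpr uint64_t cmovConst(uint64_t FalseC, uint64_t TrueC, bool Cond) {
  return FalseC + (Cond ? 1u : 0u) * (TrueC - FalseC);
}

static_assert(cmovConst(0, 8, true) == 8 && cmovConst(0, 8, false) == 0,
              "C ? 8 : 0  ->  zext(setcc(C)) << 3");
static_assert(cmovConst(5, 6, true) == 6 && cmovConst(5, 6, false) == 5,
              "Cond ? cst+1 : cst  ->  zext(setcc(Cond)) + cst");
static_assert(cmovConst(7, 7 + 5, true) == 12,
              "a diff of 5 maps onto lea base(cond, cond*4)");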
47944
47945/// Different mul shrinking modes.
47946enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47947
47948static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47949 EVT VT = N->getOperand(0).getValueType();
47950 if (VT.getScalarSizeInBits() != 32)
47951 return false;
47952
47953 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47954 unsigned SignBits[2] = {1, 1};
47955 bool IsPositive[2] = {false, false};
47956 for (unsigned i = 0; i < 2; i++) {
47957 SDValue Opd = N->getOperand(i);
47958
47959 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47960 IsPositive[i] = DAG.SignBitIsZero(Opd);
47961 }
47962
47963 bool AllPositive = IsPositive[0] && IsPositive[1];
47964 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47965 // When ranges are from -128 ~ 127, use MULS8 mode.
47966 if (MinSignBits >= 25)
47967 Mode = ShrinkMode::MULS8;
47968 // When ranges are from 0 ~ 255, use MULU8 mode.
47969 else if (AllPositive && MinSignBits >= 24)
47970 Mode = ShrinkMode::MULU8;
47971 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47972 else if (MinSignBits >= 17)
47973 Mode = ShrinkMode::MULS16;
47974 // When ranges are from 0 ~ 65535, use MULU16 mode.
47975 else if (AllPositive && MinSignBits >= 16)
47976 Mode = ShrinkMode::MULU16;
47977 else
47978 return false;
47979 return true;
47980}
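An editorial sketch (not part of the original file) illustrating the sign-bit thresholds used by canReduceVMulWidth above: an i32 with at least 25 sign bits fits in i8, and with at least 17 sign bits fits in i16. numSignBits is a hypothetical scalar model of ComputeNumSignBits.

#include <cstdint>

// Count the leading bits that equal the sign bit, including the sign bit.
constexpr unsigned numSignBits(int32_t V) {
  uint32_t U = static_cast<uint32_t>(V);
  uint32_t SignBit = U >> 31;
  unsigned N = 0;
  for (int Bit = 31; Bit >= 0 && ((U >> Bit) & 1u) == SignBit; --Bit)
    ++N;
  return N;
}

static_assert(numSignBits(127) == 25 && numSignBits(-128) == 25,
              ">= 25 sign bits <=> value fits in i8 (MULS8)");
static_assert(numSignBits(32767) == 17 && numSignBits(-32768) == 17,
              ">= 17 sign bits <=> value fits in i16 (MULS16)");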
47981
47982/// When the operands of vector mul are extended from smaller size values,
47983 /// like i8 and i16, the type of mul may be shrunk to generate more
47984/// efficient code. Two typical patterns are handled:
47985/// Pattern1:
47986/// %2 = sext/zext <N x i8> %1 to <N x i32>
47987/// %4 = sext/zext <N x i8> %3 to <N x i32>
47988 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47989/// %5 = mul <N x i32> %2, %4
47990///
47991/// Pattern2:
47992/// %2 = zext/sext <N x i16> %1 to <N x i32>
47993/// %4 = zext/sext <N x i16> %3 to <N x i32>
47994/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47995/// %5 = mul <N x i32> %2, %4
47996///
47997/// There are four mul shrinking modes:
47998/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47999 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48000/// generate pmullw+sext32 for it (MULS8 mode).
48001/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48002/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48003/// generate pmullw+zext32 for it (MULU8 mode).
48004/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48005/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48006/// generate pmullw+pmulhw for it (MULS16 mode).
48007/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48008/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48009/// generate pmullw+pmulhuw for it (MULU16 mode).
48010static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
48011 const X86Subtarget &Subtarget) {
48012 // Check for legality
48013 // pmullw/pmulhw are not supported by SSE.
48014 if (!Subtarget.hasSSE2())
48015 return SDValue();
48016
48017 // Check for profitability
48018 // pmulld is supported since SSE41. It is better to use pmulld
48019 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48020 // the expansion.
48021 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48022 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48023 return SDValue();
48024
48025 ShrinkMode Mode;
48026 if (!canReduceVMulWidth(N, DAG, Mode))
48027 return SDValue();
48028
48029 SDLoc DL(N);
48030 SDValue N0 = N->getOperand(0);
48031 SDValue N1 = N->getOperand(1);
48032 EVT VT = N->getOperand(0).getValueType();
48033 unsigned NumElts = VT.getVectorNumElements();
48034 if ((NumElts % 2) != 0)
48035 return SDValue();
48036
48037 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48038
48039 // Shrink the operands of mul.
48040 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48041 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48042
48043 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48044 // lower part is needed.
48045 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48046 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48047 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48048 : ISD::SIGN_EXTEND,
48049 DL, VT, MulLo);
48050
48051 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48052 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48053 // the higher part is also needed.
48054 SDValue MulHi =
48055 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48056 ReducedVT, NewN0, NewN1);
48057
48058 // Repack the lower part and higher part result of mul into a wider
48059 // result.
48060 // Generate shuffle functioning as punpcklwd.
48061 SmallVector<int, 16> ShuffleMask(NumElts);
48062 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48063 ShuffleMask[2 * i] = i;
48064 ShuffleMask[2 * i + 1] = i + NumElts;
48065 }
48066 SDValue ResLo =
48067 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48068 ResLo = DAG.getBitcast(ResVT, ResLo);
48069 // Generate shuffle functioning as punpckhwd.
48070 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48071 ShuffleMask[2 * i] = i + NumElts / 2;
48072 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48073 }
48074 SDValue ResHi =
48075 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48076 ResHi = DAG.getBitcast(ResVT, ResHi);
48077 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48078}
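An editorial sketch (not part of the original file) of the MULU16 repack performed by reduceVMULWidth above: pmullw produces the low 16 bits of each product, pmulhuw the high 16 bits, and interleaving the two lanes (punpcklwd/punpckhwd) rebuilds the full 32-bit product. The helper names are hypothetical scalar models.

#include <cstdint>

constexpr uint16_t pmullwLane(uint16_t A, uint16_t B) {   // low 16 bits
  return static_cast<uint16_t>(static_cast<uint32_t>(A) * B);
}
constexpr uint16_t pmulhuwLane(uint16_t A, uint16_t B) {  // high 16 bits
  return static_cast<uint16_t>((static_cast<uint32_t>(A) * B) >> 16);
}
constexpr uint32_t repack(uint16_t A, uint16_t B) {       // interleaved lane pair
  return pmullwLane(A, B) | (static_cast<uint32_t>(pmulhuwLane(A, B)) << 16);
}

static_assert(repack(0xFFFF, 0xFFFF) == 0xFFFFu * 0xFFFFu, "");
static_assert(repack(1234, 5678) == 1234u * 5678u, "");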
48079
48080static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48081 EVT VT, const SDLoc &DL) {
48082
48083 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48084 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48085 DAG.getConstant(Mult, DL, VT));
48086 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48087 DAG.getConstant(Shift, DL, MVT::i8));
48088 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48089 N->getOperand(0));
48090 return Result;
48091 };
48092
48093 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48094 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48095 DAG.getConstant(Mul1, DL, VT));
48096 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48097 DAG.getConstant(Mul2, DL, VT));
48098 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48099 N->getOperand(0));
48100 return Result;
48101 };
48102
48103 switch (MulAmt) {
48104 default:
48105 break;
48106 case 11:
48107 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48108 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48109 case 21:
48110 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48111 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48112 case 41:
48113 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48114 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48115 case 22:
48116 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48117 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48118 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48119 case 19:
48120 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48121 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48122 case 37:
48123 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48124 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48125 case 73:
48126 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48127 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48128 case 13:
48129 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48130 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48131 case 23:
48132 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48133 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48134 case 26:
48135 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48136 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48137 case 28:
48138 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48139 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48140 case 29:
48141 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48142 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48143 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48144 }
48145
48146 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
48147 // by a single LEA.
48148 // First check if this is a sum of two powers of 2 because that's easy. Then
48149 // count how many zero bits there are up to the first set bit.
48150 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48151 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48152 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48153 if (ScaleShift >= 1 && ScaleShift < 4) {
48154 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48155 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48156 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48157 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48158 DAG.getConstant(ScaleShift, DL, MVT::i8));
48159 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48160 }
48161 }
48162
48163 return SDValue();
48164}
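An editorial sketch (not part of the original file) double-checking a few of the decompositions in combineMulSpecial above as plain integer identities; the MUL_IMM by 3/5/9 and the shifts correspond to LEA scale factors.

#include <cstdint>

constexpr uint64_t mulShlAdd(uint64_t X, uint64_t Mult, unsigned Shift) {
  return ((X * Mult) << Shift) + X;   // add ((shl (mul x, Mult), Shift), x)
}
constexpr uint64_t mulShlSub(uint64_t X, uint64_t Mult, unsigned Shift) {
  return ((X * Mult) << Shift) - X;   // sub ((shl (mul x, Mult), Shift), x)
}

static_assert(mulShlAdd(7, 5, 1) == 7 * 11, "mul x, 11");
static_assert(mulShlAdd(7, 9, 2) == 7 * 37, "mul x, 37");
static_assert(mulShlSub(7, 3, 3) == 7 * 23, "mul x, 23");
static_assert(7 + 7 * 9 * 3 == 7 * 28, "mul x, 28 => add ((mul (mul x, 9), 3), x)");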
48165
48166 // If the upper 17 bits of either element are zero and the upper bits of the
48167 // other element are all zero/sign bits, then we can use PMADDWD, which is
48168 // always at least as quick as PMULLD, except on KNL.
48169static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48170 const X86Subtarget &Subtarget) {
48171 if (!Subtarget.hasSSE2())
48172 return SDValue();
48173
48174 if (Subtarget.isPMADDWDSlow())
48175 return SDValue();
48176
48177 EVT VT = N->getValueType(0);
48178
48179 // Only support vXi32 vectors.
48180 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48181 return SDValue();
48182
48183 // Make sure the type is legal or can split/widen to a legal type.
48184 // With AVX512 but without BWI, we would need to split v32i16.
48185 unsigned NumElts = VT.getVectorNumElements();
48186 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48187 return SDValue();
48188
48189 // With AVX512 but without BWI, we would need to split v32i16.
48190 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48191 return SDValue();
48192
48193 SDValue N0 = N->getOperand(0);
48194 SDValue N1 = N->getOperand(1);
48195
48196 // If we are zero/sign extending two steps without SSE4.1, it's better to
48197 // reduce the vmul width instead.
48198 if (!Subtarget.hasSSE41() &&
48199 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48200 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48201 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48202 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48203 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48204 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48205 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48206 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48207 return SDValue();
48208
48209 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48210 // the vmul width instead.
48211 if (!Subtarget.hasSSE41() &&
48212 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48213 N0.getOperand(0).getValueSizeInBits() > 128) &&
48214 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48215 N1.getOperand(0).getValueSizeInBits() > 128))
48216 return SDValue();
48217
48218 // Sign bits must extend down to the lowest i16.
48219 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48220 DAG.ComputeMaxSignificantBits(N0) > 16)
48221 return SDValue();
48222
48223 // At least one of the elements must be zero in the upper 17 bits, or can be
48224 // safely made zero without altering the final result.
48225 auto GetZeroableOp = [&](SDValue Op) {
48226 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48227 if (DAG.MaskedValueIsZero(Op, Mask17))
48228 return Op;
48229 // Mask off upper 16-bits of sign-extended constants.
48230 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48231 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48232 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48233 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48234 SDValue Src = Op.getOperand(0);
48235 // Convert sext(vXi16) to zext(vXi16).
48236 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48237 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48238 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48239 // which will expand the extension.
48240 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48241 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48242 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48243 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48244 }
48245 }
48246 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
48247 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48248 N->isOnlyUserOf(Op.getNode())) {
48249 SDValue Src = Op.getOperand(0);
48250 if (Src.getScalarValueSizeInBits() == 16)
48251 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48252 }
48253 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48254 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48255 N->isOnlyUserOf(Op.getNode())) {
48256 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48257 Op.getOperand(1));
48258 }
48259 return SDValue();
48260 };
48261 SDValue ZeroN0 = GetZeroableOp(N0);
48262 SDValue ZeroN1 = GetZeroableOp(N1);
48263 if (!ZeroN0 && !ZeroN1)
48264 return SDValue();
48265 N0 = ZeroN0 ? ZeroN0 : N0;
48266 N1 = ZeroN1 ? ZeroN1 : N1;
48267
48268 // Use SplitOpsAndApply to handle AVX splitting.
48269 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48270 ArrayRef<SDValue> Ops) {
48271 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48272 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48273 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48274 DAG.getBitcast(OpVT, Ops[0]),
48275 DAG.getBitcast(OpVT, Ops[1]));
48276 };
48277 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48278 PMADDWDBuilder);
48279}
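An editorial sketch (not part of the original file) of one PMADDWD lane, modelled on scalars: the i32 lane is split into its two i16 halves and the adjacent signed products are summed. Under the zeroable/sign-bit conditions checked by combineMulToPMADDWD above, this equals the truncated i32 multiply. sext16 and pmaddwdLane are hypothetical.

#include <cstdint>

// Portable sign extension of the low 16 bits to i32.
constexpr int32_t sext16(uint32_t V) {
  return (V & 0xFFFF) < 0x8000 ? static_cast<int32_t>(V & 0xFFFF)
                               : static_cast<int32_t>(V & 0xFFFF) - 0x10000;
}
// One PMADDWD lane: a[0]*b[0] + a[1]*b[1] over the i16 halves of an i32 lane.
constexpr int32_t pmaddwdLane(uint32_t A, uint32_t B) {
  return sext16(A) * sext16(B) + sext16(A >> 16) * sext16(B >> 16);
}

// A = 1000 has its upper 17 bits clear; B = 0xFFFFFFF0 sign-extends i16 -16,
// so the PMADDWD lane equals the (truncated) i32 product 1000 * -16.
static_assert(pmaddwdLane(1000, 0xFFFFFFF0u) == 1000 * -16,
              "PMADDWD lane matches the i32 multiply when the conditions hold");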
48280
48281static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48282 const X86Subtarget &Subtarget) {
48283 if (!Subtarget.hasSSE2())
48284 return SDValue();
48285
48286 EVT VT = N->getValueType(0);
48287
48288 // Only support vXi64 vectors.
48289 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48290 VT.getVectorNumElements() < 2 ||
48291 !isPowerOf2_32(VT.getVectorNumElements()))
48292 return SDValue();
48293
48294 SDValue N0 = N->getOperand(0);
48295 SDValue N1 = N->getOperand(1);
48296
48297 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48298 // 32 bits. We can lower with this if the sign bits stretch that far.
48299 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48300 DAG.ComputeNumSignBits(N1) > 32) {
48301 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48302 ArrayRef<SDValue> Ops) {
48303 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48304 };
48305 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48306 PMULDQBuilder, /*CheckBWI*/false);
48307 }
48308
48309 // If the upper bits are zero we can use a single pmuludq.
48310 APInt Mask = APInt::getHighBitsSet(64, 32);
48311 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48312 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48313 ArrayRef<SDValue> Ops) {
48314 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48315 };
48316 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48317 PMULUDQBuilder, /*CheckBWI*/false);
48318 }
48319
48320 return SDValue();
48321}
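An editorial sketch (not part of the original file) of the PMULUDQ fold in combineMulToPMULDQ above: PMULUDQ multiplies only the zero-extended low 32 bits of each i64 lane, so when the upper halves are known zero it already computes the full i64 product. The helper name is hypothetical.

#include <cstdint>

constexpr uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return (A & 0xFFFFFFFFull) * (B & 0xFFFFFFFFull);  // low 32 x low 32 -> 64
}

static_assert(pmuludqLane(0x12345678ull, 0x9ABCDEF0ull) ==
                  0x12345678ull * 0x9ABCDEF0ull,
              "i64 multiply with zero upper halves is a single PMULUDQ");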
48322
48323static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48324 TargetLowering::DAGCombinerInfo &DCI,
48325 const X86Subtarget &Subtarget) {
48326 EVT VT = N->getValueType(0);
48327
48328 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48329 return V;
48330
48331 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48332 return V;
48333
48334 if (DCI.isBeforeLegalize() && VT.isVector())
48335 return reduceVMULWidth(N, DAG, Subtarget);
48336
48337 // Optimize a single multiply with constant into two operations in order to
48338 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
48339 if (!MulConstantOptimization)
48340 return SDValue();
48341
48342 // An imul is usually smaller than the alternative sequence.
48343 if (DAG.getMachineFunction().getFunction().hasMinSize())
48344 return SDValue();
48345
48346 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48347 return SDValue();
48348
48349 if (VT != MVT::i64 && VT != MVT::i32)
48350 return SDValue();
48351
48352 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48353 if (!C)
48354 return SDValue();
48355 if (isPowerOf2_64(C->getZExtValue()))
48356 return SDValue();
48357
48358 int64_t SignMulAmt = C->getSExtValue();
48359 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48360 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48361
48362 SDLoc DL(N);
48363 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48364 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48365 DAG.getConstant(AbsMulAmt, DL, VT));
48366 if (SignMulAmt < 0)
48367 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48368 NewMul);
48369
48370 return NewMul;
48371 }
48372
48373 uint64_t MulAmt1 = 0;
48374 uint64_t MulAmt2 = 0;
48375 if ((AbsMulAmt % 9) == 0) {
48376 MulAmt1 = 9;
48377 MulAmt2 = AbsMulAmt / 9;
48378 } else if ((AbsMulAmt % 5) == 0) {
48379 MulAmt1 = 5;
48380 MulAmt2 = AbsMulAmt / 5;
48381 } else if ((AbsMulAmt % 3) == 0) {
48382 MulAmt1 = 3;
48383 MulAmt2 = AbsMulAmt / 3;
48384 }
48385
48386 SDValue NewMul;
48387 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48388 if (MulAmt2 &&
48389 (isPowerOf2_64(MulAmt2) ||
48390 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48391
48392 if (isPowerOf2_64(MulAmt2) &&
48393 !(SignMulAmt >= 0 && N->hasOneUse() &&
48394 N->use_begin()->getOpcode() == ISD::ADD))
48395 // If the second multiplier is pow2, issue it first. We want the multiply by
48396 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
48397 // is an add. Only do this for positive multiply amounts since the
48398 // negate would prevent it from being used as an address mode anyway.
48399 std::swap(MulAmt1, MulAmt2);
48400
48401 if (isPowerOf2_64(MulAmt1))
48402 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48403 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48404 else
48405 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48406 DAG.getConstant(MulAmt1, DL, VT));
48407
48408 if (isPowerOf2_64(MulAmt2))
48409 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48410 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48411 else
48412 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48413 DAG.getConstant(MulAmt2, DL, VT));
48414
48415 // Negate the result.
48416 if (SignMulAmt < 0)
48417 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48418 NewMul);
48419 } else if (!Subtarget.slowLEA())
48420 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48421
48422 if (!NewMul) {
48423 assert(C->getZExtValue() != 0 &&
48424        C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48425        "Both cases that could cause potential overflows should have "
48426        "already been handled.");
48427 if (isPowerOf2_64(AbsMulAmt - 1)) {
48428 // (mul x, 2^N + 1) => (add (shl x, N), x)
48429 NewMul = DAG.getNode(
48430 ISD::ADD, DL, VT, N->getOperand(0),
48431 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48432 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48433 MVT::i8)));
48434 // To negate, subtract the number from zero
48435 if (SignMulAmt < 0)
48436 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48437 DAG.getConstant(0, DL, VT), NewMul);
48438 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48439 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48440 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48441 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48442 DL, MVT::i8));
48443 // To negate, reverse the operands of the subtract.
48444 if (SignMulAmt < 0)
48445 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48446 else
48447 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48448 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48449 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48450 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48451 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48452 DL, MVT::i8));
48453 NewMul = DAG.getNode(
48454 ISD::ADD, DL, VT, NewMul,
48455 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48456 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48457 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48458 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48459 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48460 DL, MVT::i8));
48461 NewMul = DAG.getNode(
48462 ISD::SUB, DL, VT, NewMul,
48463 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48464 }
48465 }
48466
48467 return NewMul;
48468}
48469
48470// Try to form a MULHU or MULHS node by looking for
48471// (srl (mul ext, ext), 16)
48472// TODO: This is X86 specific because we want to be able to handle wide types
48473// before type legalization. But we can only do it if the vector will be
48474// legalized via widening/splitting. Type legalization can't handle promotion
48475// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48476// combiner.
48477static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48478 const X86Subtarget &Subtarget) {
48479 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48480        "SRL or SRA node is required here!");
48481 SDLoc DL(N);
48482
48483 if (!Subtarget.hasSSE2())
48484 return SDValue();
48485
48486 // The operation feeding into the shift must be a multiply.
48487 SDValue ShiftOperand = N->getOperand(0);
48488 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48489 return SDValue();
48490
48491 // Input type should be at least vXi32.
48492 EVT VT = N->getValueType(0);
48493 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48494 return SDValue();
48495
48496 // Need a shift by 16.
48497 APInt ShiftAmt;
48498 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48499 ShiftAmt != 16)
48500 return SDValue();
48501
48502 SDValue LHS = ShiftOperand.getOperand(0);
48503 SDValue RHS = ShiftOperand.getOperand(1);
48504
48505 unsigned ExtOpc = LHS.getOpcode();
48506 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48507 RHS.getOpcode() != ExtOpc)
48508 return SDValue();
48509
48510 // Peek through the extends.
48511 LHS = LHS.getOperand(0);
48512 RHS = RHS.getOperand(0);
48513
48514 // Ensure the input types match.
48515 EVT MulVT = LHS.getValueType();
48516 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48517 return SDValue();
48518
48519 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48520 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48521
48522 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48523 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48524}
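
Illustrative only (not LLVM code): the fold above is justified by the scalar fact that the top 16 bits of a widened i16 x i16 product are exactly what MULHS/MULHU return per element. mulhs16/mulhu16 below are hypothetical helpers; arithmetic right shift of negative values is assumed to behave as on x86.

  #include <cassert>
  #include <cstdint>

  // Hypothetical scalar models of the per-element MULHS/MULHU results.
  static int16_t mulhs16(int16_t a, int16_t b) {
    return static_cast<int16_t>((int32_t(a) * int32_t(b)) >> 16);
  }
  static uint16_t mulhu16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
  }

  int main() {
    const int16_t Samples[] = {-32768, -1234, -1, 0, 1, 1234, 32767};
    for (int16_t a : Samples)
      for (int16_t b : Samples) {
        // sra (mul (sext a), (sext b)), 16 keeps only the signed high half.
        assert(((int32_t(a) * int32_t(b)) >> 16) == int32_t(mulhs16(a, b)));
        // srl (mul (zext a), (zext b)), 16 keeps only the unsigned high half.
        uint32_t Wide = uint32_t(uint16_t(a)) * uint32_t(uint16_t(b));
        assert((Wide >> 16) == uint32_t(mulhu16(a, b)));
      }
    return 0;
  }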
48525
48526static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48527 SDValue N0 = N->getOperand(0);
48528 SDValue N1 = N->getOperand(1);
48529 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48530 EVT VT = N0.getValueType();
48531
48532 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48533 // since the result of setcc_c is all zeros or all ones.
48534 if (VT.isInteger() && !VT.isVector() &&
48535 N1C && N0.getOpcode() == ISD::AND &&
48536 N0.getOperand(1).getOpcode() == ISD::Constant) {
48537 SDValue N00 = N0.getOperand(0);
48538 APInt Mask = N0.getConstantOperandAPInt(1);
48539 Mask <<= N1C->getAPIntValue();
48540 bool MaskOK = false;
48541 // We can handle cases concerning bit-widening nodes containing setcc_c if
48542 // we carefully interrogate the mask to make sure the transform is
48543 // semantics-preserving.
48544 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48545 // of the underlying setcc_c operation if the setcc_c was zero extended.
48546 // Consider the following example:
48547 // zext(setcc_c) -> i32 0x0000FFFF
48548 // c1 -> i32 0x0000FFFF
48549 // c2 -> i32 0x00000001
48550 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48551 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48552 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48553 MaskOK = true;
48554 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48555 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48556 MaskOK = true;
48557 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48558 N00.getOpcode() == ISD::ANY_EXTEND) &&
48559 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48560 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48561 }
48562 if (MaskOK && Mask != 0) {
48563 SDLoc DL(N);
48564 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48565 }
48566 }
48567
48568 return SDValue();
48569}
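
Illustrative only: the fold above is valid because setcc_c materializes either all-zeros or all-ones, in which case masking before or after the shift gives the same result (the MaskOK logic guards the zero-extended narrow case called out in the comment).

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t SetccValues[] = {0u, ~0u};  // setcc_c is all-zeros or all-ones
    const uint32_t C1 = 0x0000FFFFu;
    const unsigned C2 = 4;
    for (uint32_t X : SetccValues)
      // (shl (and X, C1), C2) == (and X, C1 << C2) for X in {0, ~0}.
      assert(((X & C1) << C2) == (X & (C1 << C2)));
    return 0;
  }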
48570
48571static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48572 const X86Subtarget &Subtarget) {
48573 SDValue N0 = N->getOperand(0);
48574 SDValue N1 = N->getOperand(1);
48575 EVT VT = N0.getValueType();
48576 unsigned Size = VT.getSizeInBits();
48577
48578 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48579 return V;
48580
48581 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
48582 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
48583 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
48584 // depending on sign of (SarConst - [56,48,32,24,16])
48585
48586 // sexts in X86 are MOVs. The MOVs have the same code size
48587 // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
48588 // However, the MOVs have 2 advantages over a SHIFT:
48589 // 1. MOVs can write to a register that differs from the source.
48590 // 2. MOVs accept memory operands.
48591
48592 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48593 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48594 N0.getOperand(1).getOpcode() != ISD::Constant)
48595 return SDValue();
48596
48597 SDValue N00 = N0.getOperand(0);
48598 SDValue N01 = N0.getOperand(1);
48599 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48600 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48601 EVT CVT = N1.getValueType();
48602
48603 if (SarConst.isNegative())
48604 return SDValue();
48605
48606 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48607 unsigned ShiftSize = SVT.getSizeInBits();
48608 // Skip types without a corresponding sext/zext and
48609 // ShlConst values that are not one of [56,48,32,24,16].
48610 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48611 continue;
48612 SDLoc DL(N);
48613 SDValue NN =
48614 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48615 SarConst = SarConst - (Size - ShiftSize);
48616 if (SarConst == 0)
48617 return NN;
48618 if (SarConst.isNegative())
48619 return DAG.getNode(ISD::SHL, DL, VT, NN,
48620 DAG.getConstant(-SarConst, DL, CVT));
48621 return DAG.getNode(ISD::SRA, DL, VT, NN,
48622 DAG.getConstant(SarConst, DL, CVT));
48623 }
48624 return SDValue();
48625}
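
Illustrative only, assuming two's-complement conversions and arithmetic right shifts on signed integers (as on x86): the rewrite above treats (sra (shl x, 56), 58) as a sign-extension of the low byte followed by a shift by the difference of the two amounts.

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t Samples[] = {0, 1, 0x7f, 0x80, 0xff, 0x1234, -1, -0x80};
    for (int64_t x : Samples) {
      // shl by 56 keeps only the low byte; sra by 58 sign-extends that byte
      // and then shifts it right by 58 - 56 = 2.
      int64_t ViaShifts = int64_t(uint64_t(x) << 56) >> 58;
      int64_t ViaSext = int64_t(int8_t(uint8_t(x))) >> 2;
      assert(ViaShifts == ViaSext);
    }
    return 0;
  }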
48626
48627static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48628 TargetLowering::DAGCombinerInfo &DCI,
48629 const X86Subtarget &Subtarget) {
48630 SDValue N0 = N->getOperand(0);
48631 SDValue N1 = N->getOperand(1);
48632 EVT VT = N0.getValueType();
48633
48634 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48635 return V;
48636
48637 // Only do this on the last DAG combine as it can interfere with other
48638 // combines.
48639 if (!DCI.isAfterLegalizeDAG())
48640 return SDValue();
48641
48642 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48643 // TODO: This is a generic DAG combine that became an x86-only combine to
48644 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48645 // and-not ('andn').
48646 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48647 return SDValue();
48648
48649 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48650 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48651 if (!ShiftC || !AndC)
48652 return SDValue();
48653
48654 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48655 // transform should reduce code size. It may also enable secondary transforms
48656 // from improved known-bits analysis or instruction selection.
48657 APInt MaskVal = AndC->getAPIntValue();
48658
48659 // If this can be matched by a zero extend, don't optimize.
48660 if (MaskVal.isMask()) {
48661 unsigned TO = MaskVal.countr_one();
48662 if (TO >= 8 && isPowerOf2_32(TO))
48663 return SDValue();
48664 }
48665
48666 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48667 unsigned OldMaskSize = MaskVal.getSignificantBits();
48668 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48669 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48670 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48671 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48672 SDLoc DL(N);
48673 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48674 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48675 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48676 }
48677 return SDValue();
48678}
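
Illustrative only: the reordering above rests on the scalar identity below; with Mask = 0xFF00 and Shift = 8 the shifted mask fits in 8 bits, which is the code-size win described in the comment. Sample values are arbitrary.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Mask = 0xFF00;  // 16 significant bits before the transform
    const unsigned Shift = 8;      // Mask >> Shift fits in 8 bits afterwards
    for (uint64_t x = 0; x < (1u << 20); x += 977)
      // srl (and X, AndC), ShiftC == and (srl X, ShiftC), (AndC >> ShiftC)
      assert(((x & Mask) >> Shift) == ((x >> Shift) & (Mask >> Shift)));
    return 0;
  }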
48679
48680static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48681 const X86Subtarget &Subtarget) {
48682 unsigned Opcode = N->getOpcode();
48683 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48684
48685 SDLoc DL(N);
48686 EVT VT = N->getValueType(0);
48687 SDValue N0 = N->getOperand(0);
48688 SDValue N1 = N->getOperand(1);
48689 EVT SrcVT = N0.getValueType();
48690
48691 SDValue BC0 =
48692 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48693 SDValue BC1 =
48694 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48695
48696 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48697 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48698 // truncation trees that help us avoid lane crossing shuffles.
48699 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48700 // TODO: We don't handle vXf64 shuffles yet.
48701 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48702 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48703 SmallVector<SDValue> ShuffleOps;
48704 SmallVector<int> ShuffleMask, ScaledMask;
48705 SDValue Vec = peekThroughBitcasts(BCSrc);
48706 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48707 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48708 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48709 // shuffle to a v4X64 width - we can probably relax this in the future.
48710 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48711 ShuffleOps[0].getValueType().is256BitVector() &&
48712 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48713 SDValue Lo, Hi;
48714 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48715 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48716 Lo = DAG.getBitcast(SrcVT, Lo);
48717 Hi = DAG.getBitcast(SrcVT, Hi);
48718 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48719 Res = DAG.getBitcast(ShufVT, Res);
48720 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48721 return DAG.getBitcast(VT, Res);
48722 }
48723 }
48724 }
48725 }
48726
48727 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48728 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48729 // If either/both ops are a shuffle that can scale to v2x64,
48730 // then see if we can perform this as a v4x32 post shuffle.
48731 SmallVector<SDValue> Ops0, Ops1;
48732 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48733 bool IsShuf0 =
48734 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48735 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48736 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48737 bool IsShuf1 =
48738 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48739 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48740 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48741 if (IsShuf0 || IsShuf1) {
48742 if (!IsShuf0) {
48743 Ops0.assign({BC0});
48744 ScaledMask0.assign({0, 1});
48745 }
48746 if (!IsShuf1) {
48747 Ops1.assign({BC1});
48748 ScaledMask1.assign({0, 1});
48749 }
48750
48751 SDValue LHS, RHS;
48752 int PostShuffle[4] = {-1, -1, -1, -1};
48753 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48754 if (M < 0)
48755 return true;
48756 Idx = M % 2;
48757 SDValue Src = Ops[M / 2];
48758 if (!LHS || LHS == Src) {
48759 LHS = Src;
48760 return true;
48761 }
48762 if (!RHS || RHS == Src) {
48763 Idx += 2;
48764 RHS = Src;
48765 return true;
48766 }
48767 return false;
48768 };
48769 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48770 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48771 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48772 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48773 LHS = DAG.getBitcast(SrcVT, LHS);
48774 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48775 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48776 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48777 Res = DAG.getBitcast(ShufVT, Res);
48778 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48779 return DAG.getBitcast(VT, Res);
48780 }
48781 }
48782 }
48783
48784 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48785 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48786 SmallVector<int> Mask0, Mask1;
48787 SmallVector<SDValue> Ops0, Ops1;
48788 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48789 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48790 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48791 !Ops0.empty() && !Ops1.empty() &&
48792 all_of(Ops0,
48793 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48794 all_of(Ops1,
48795 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48796 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48797 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48798 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48799 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48800 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48801 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48802 if ((Op00 == Op11) && (Op01 == Op10)) {
48803 std::swap(Op10, Op11);
48804 ShuffleVectorSDNode::commuteMask(ScaledMask1);
48805 }
48806 if ((Op00 == Op10) && (Op01 == Op11)) {
48807 const int Map[4] = {0, 2, 1, 3};
48808 SmallVector<int, 4> ShuffleMask(
48809 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48810 Map[ScaledMask1[1]]});
48811 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48812 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48813 DAG.getBitcast(SrcVT, Op01));
48814 Res = DAG.getBitcast(ShufVT, Res);
48815 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48816 return DAG.getBitcast(VT, Res);
48817 }
48818 }
48819 }
48820
48821 return SDValue();
48822}
48823
48824static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48825 TargetLowering::DAGCombinerInfo &DCI,
48826 const X86Subtarget &Subtarget) {
48827 unsigned Opcode = N->getOpcode();
48828 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48829        "Unexpected pack opcode");
48830
48831 EVT VT = N->getValueType(0);
48832 SDValue N0 = N->getOperand(0);
48833 SDValue N1 = N->getOperand(1);
48834 unsigned NumDstElts = VT.getVectorNumElements();
48835 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48836 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48837 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48838        N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48839        "Unexpected PACKSS/PACKUS input type");
48840
48841 bool IsSigned = (X86ISD::PACKSS == Opcode);
48842
48843 // Constant Folding.
48844 APInt UndefElts0, UndefElts1;
48845 SmallVector<APInt, 32> EltBits0, EltBits1;
48846 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48847 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48848 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48849 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48850 unsigned NumLanes = VT.getSizeInBits() / 128;
48851 unsigned NumSrcElts = NumDstElts / 2;
48852 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48853 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48854
48855 APInt Undefs(NumDstElts, 0);
48856 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48857 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48858 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48859 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48860 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48861 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48862
48863 if (UndefElts[SrcIdx]) {
48864 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48865 continue;
48866 }
48867
48868 APInt &Val = EltBits[SrcIdx];
48869 if (IsSigned) {
48870 // PACKSS: Truncate signed value with signed saturation.
48871 // Source values less than dst minint are saturated to minint.
48872 // Source values greater than dst maxint are saturated to maxint.
48873 if (Val.isSignedIntN(DstBitsPerElt))
48874 Val = Val.trunc(DstBitsPerElt);
48875 else if (Val.isNegative())
48876 Val = APInt::getSignedMinValue(DstBitsPerElt);
48877 else
48878 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48879 } else {
48880 // PACKUS: Truncate signed value with unsigned saturation.
48881 // Source values less than zero are saturated to zero.
48882 // Source values greater than dst maxuint are saturated to maxuint.
48883 if (Val.isIntN(DstBitsPerElt))
48884 Val = Val.trunc(DstBitsPerElt);
48885 else if (Val.isNegative())
48886 Val = APInt::getZero(DstBitsPerElt);
48887 else
48888 Val = APInt::getAllOnes(DstBitsPerElt);
48889 }
48890 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48891 }
48892 }
48893
48894 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48895 }
48896
48897 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48898 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48899 return V;
48900
48901 // Try to combine a PACKUSWB/PACKSSWB-implemented truncate with a regular
48902 // truncate to create a larger truncate.
48903 if (Subtarget.hasAVX512() &&
48904 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48905 N0.getOperand(0).getValueType() == MVT::v8i32) {
48906 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48907 (!IsSigned &&
48908 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48909 if (Subtarget.hasVLX())
48910 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48911
48912 // Widen input to v16i32 so we can truncate that.
48913 SDLoc dl(N);
48914 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48915 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48916 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48917 }
48918 }
48919
48920 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48921 if (VT.is128BitVector()) {
48922 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48923 SDValue Src0, Src1;
48924 if (N0.getOpcode() == ExtOpc &&
48925 N0.getOperand(0).getValueType().is64BitVector() &&
48926 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48927 Src0 = N0.getOperand(0);
48928 }
48929 if (N1.getOpcode() == ExtOpc &&
48930 N1.getOperand(0).getValueType().is64BitVector() &&
48931 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48932 Src1 = N1.getOperand(0);
48933 }
48934 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48935 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48936 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48937 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48938 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48939 }
48940
48941 // Try again with pack(*_extend_vector_inreg, undef).
48942 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48943 : ISD::ZERO_EXTEND_VECTOR_INREG;
48944 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48945 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48946 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48947 DAG);
48948 }
48949
48950 // Attempt to combine as shuffle.
48951 SDValue Op(N, 0);
48952 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48953 return Res;
48954
48955 return SDValue();
48956}
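
Illustrative only: the constant-folding loop above implements the per-element PACKSS/PACKUS saturation rules; packss/packus below are hypothetical scalar helpers mirroring that behaviour for an i16 -> i8 pack.

  #include <cassert>
  #include <cstdint>

  // Hypothetical scalar models of PACKSSWB / PACKUSWB on one i16 element.
  static int8_t packss(int16_t v) {
    if (v > 127) return 127;    // above dst maxint: saturate to maxint
    if (v < -128) return -128;  // below dst minint: saturate to minint
    return static_cast<int8_t>(v);
  }
  static uint8_t packus(int16_t v) {
    if (v > 255) return 255;    // above dst maxuint: saturate to maxuint
    if (v < 0) return 0;        // negative: saturate to zero
    return static_cast<uint8_t>(v);
  }

  int main() {
    assert(packss(42) == 42 && packss(300) == 127 && packss(-300) == -128);
    assert(packus(42) == 42 && packus(300) == 255 && packus(-5) == 0);
    return 0;
  }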
48957
48958static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48959 TargetLowering::DAGCombinerInfo &DCI,
48960 const X86Subtarget &Subtarget) {
48961 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48962         X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48963        "Unexpected horizontal add/sub opcode");
48964
48965 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48966 MVT VT = N->getSimpleValueType(0);
48967 SDValue LHS = N->getOperand(0);
48968 SDValue RHS = N->getOperand(1);
48969
48970 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
48971 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48972 LHS.getOpcode() == RHS.getOpcode() &&
48973 LHS.getValueType() == RHS.getValueType() &&
48974 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48975 SDValue LHS0 = LHS.getOperand(0);
48976 SDValue LHS1 = LHS.getOperand(1);
48977 SDValue RHS0 = RHS.getOperand(0);
48978 SDValue RHS1 = RHS.getOperand(1);
48979 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48980 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48981 SDLoc DL(N);
48982 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48983 LHS0.isUndef() ? LHS1 : LHS0,
48984 RHS0.isUndef() ? RHS1 : RHS0);
48985 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48986 Res = DAG.getBitcast(ShufVT, Res);
48987 SDValue NewLHS =
48988 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48989 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48990 SDValue NewRHS =
48991 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48992 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48993 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48994 DAG.getBitcast(VT, NewRHS));
48995 }
48996 }
48997 }
48998
48999 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49000 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49001 return V;
49002
49003 return SDValue();
49004}
49005
49006static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49007 TargetLowering::DAGCombinerInfo &DCI,
49008 const X86Subtarget &Subtarget) {
49009 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49010         X86ISD::VSRL == N->getOpcode()) &&
49011        "Unexpected shift opcode");
49012 EVT VT = N->getValueType(0);
49013 SDValue N0 = N->getOperand(0);
49014 SDValue N1 = N->getOperand(1);
49015
49016 // Shift zero -> zero.
49017 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49018 return DAG.getConstant(0, SDLoc(N), VT);
49019
49020 // Detect constant shift amounts.
49021 APInt UndefElts;
49022 SmallVector<APInt, 32> EltBits;
49023 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
49024 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49025 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49026 EltBits[0].getZExtValue(), DAG);
49027 }
49028
49029 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49030 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49031 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49032 return SDValue(N, 0);
49033
49034 return SDValue();
49035}
49036
49037static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49038 TargetLowering::DAGCombinerInfo &DCI,
49039 const X86Subtarget &Subtarget) {
49040 unsigned Opcode = N->getOpcode();
49041 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49042         X86ISD::VSRLI == Opcode) &&
49043        "Unexpected shift opcode");
49044 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49045 EVT VT = N->getValueType(0);
49046 SDValue N0 = N->getOperand(0);
49047 SDValue N1 = N->getOperand(1);
49048 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49049 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49050        "Unexpected value type");
49051 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49052
49053 // (shift undef, X) -> 0
49054 if (N0.isUndef())
49055 return DAG.getConstant(0, SDLoc(N), VT);
49056
49057 // Out of range logical bit shifts are guaranteed to be zero.
49058 // Out of range arithmetic bit shifts splat the sign bit.
49059 unsigned ShiftVal = N->getConstantOperandVal(1);
49060 if (ShiftVal >= NumBitsPerElt) {
49061 if (LogicalShift)
49062 return DAG.getConstant(0, SDLoc(N), VT);
49063 ShiftVal = NumBitsPerElt - 1;
49064 }
49065
49066 // (shift X, 0) -> X
49067 if (!ShiftVal)
49068 return N0;
49069
49070 // (shift 0, C) -> 0
49071 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49072 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49073 // result are all zeros, not undef.
49074 return DAG.getConstant(0, SDLoc(N), VT);
49075
49076 // (VSRAI -1, C) -> -1
49077 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49078 // N0 is all ones or undef. We guarantee that the bits shifted into the
49079 // result are all ones, not undef.
49080 return DAG.getConstant(-1, SDLoc(N), VT);
49081
49082 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49083 unsigned NewShiftVal = Amt0 + Amt1;
49084 if (NewShiftVal >= NumBitsPerElt) {
49085 // Out of range logical bit shifts are guaranteed to be zero.
49086 // Out of range arithmetic bit shifts splat the sign bit.
49087 if (LogicalShift)
49088 return DAG.getConstant(0, SDLoc(N), VT);
49089 NewShiftVal = NumBitsPerElt - 1;
49090 }
49091 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49092 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49093 };
49094
49095 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49096 if (Opcode == N0.getOpcode())
49097 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49098
49099 // (shl (add X, X), C) -> (shl X, (C + 1))
49100 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49101 N0.getOperand(0) == N0.getOperand(1))
49102 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49103
49104 // We can decode 'whole byte' logical bit shifts as shuffles.
49105 if (LogicalShift && (ShiftVal % 8) == 0) {
49106 SDValue Op(N, 0);
49107 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49108 return Res;
49109 }
49110
49111 auto TryConstantFold = [&](SDValue V) {
49112 APInt UndefElts;
49113 SmallVector<APInt, 32> EltBits;
49114 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49115 return SDValue();
49116   assert(EltBits.size() == VT.getVectorNumElements() &&
49117          "Unexpected shift value type");
49118 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49119 // created an undef input due to no input bits being demanded, but the user
49120 // still expects 0 in the other bits.
49121 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49122 APInt &Elt = EltBits[i];
49123 if (UndefElts[i])
49124 Elt = 0;
49125 else if (X86ISD::VSHLI == Opcode)
49126 Elt <<= ShiftVal;
49127 else if (X86ISD::VSRAI == Opcode)
49128 Elt.ashrInPlace(ShiftVal);
49129 else
49130 Elt.lshrInPlace(ShiftVal);
49131 }
49132 // Reset undef elements since they were zeroed above.
49133 UndefElts = 0;
49134 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49135 };
49136
49137 // Constant Folding.
49138 if (N->isOnlyUserOf(N0.getNode())) {
49139 if (SDValue C = TryConstantFold(N0))
49140 return C;
49141
49142 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49143 // Don't break NOT patterns.
49144 SDValue BC = peekThroughOneUseBitcasts(N0);
49145 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49146 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49147 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49148 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49149 SDLoc DL(N);
49150 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49151 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49152 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49153 }
49154 }
49155 }
49156
49157 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49158 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49159 DCI))
49160 return SDValue(N, 0);
49161
49162 return SDValue();
49163}
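
Illustrative only: scalar versions of the shift-merging identities used above; the last check models the clamp-to-zero behaviour of out-of-range logical shifts.

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t x = 0; x < 100000; x += 773) {
      assert(((x >> 3) >> 2) == (x >> 5));  // (shift (shift X, C2), C1) -> (shift X, C1 + C2)
      assert(((x << 3) << 2) == (x << 5));
      assert(((x + x) << 4) == (x << 5));   // (shl (add X, X), C) -> (shl X, C + 1)
      assert(((x >> 20) >> 20) == 0u);      // merged logical amount >= width: zero
    }
    return 0;
  }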
49164
49165static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49166 TargetLowering::DAGCombinerInfo &DCI,
49167 const X86Subtarget &Subtarget) {
49168 EVT VT = N->getValueType(0);
49169 unsigned Opcode = N->getOpcode();
49170 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49171         (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49172         Opcode == ISD::INSERT_VECTOR_ELT) &&
49173        "Unexpected vector insertion");
49174
49175 SDValue Vec = N->getOperand(0);
49176 SDValue Scl = N->getOperand(1);
49177 SDValue Idx = N->getOperand(2);
49178
49179 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49180 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49181 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49182
49183 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49184 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49186 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49187 APInt::getAllOnes(NumBitsPerElt), DCI))
49188 return SDValue(N, 0);
49189 }
49190
49191 // Attempt to combine insertion patterns to a shuffle.
49192 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49193 SDValue Op(N, 0);
49194 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49195 return Res;
49196 }
49197
49198 return SDValue();
49199}
49200
49201/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49202/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49203/// OR -> CMPNEQSS.
49204static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49205 TargetLowering::DAGCombinerInfo &DCI,
49206 const X86Subtarget &Subtarget) {
49207 unsigned opcode;
49208
49209 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49210 // we're requiring SSE2 for both.
49211 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49212 SDValue N0 = N->getOperand(0);
49213 SDValue N1 = N->getOperand(1);
49214 SDValue CMP0 = N0.getOperand(1);
49215 SDValue CMP1 = N1.getOperand(1);
49216 SDLoc DL(N);
49217
49218 // The SETCCs should both refer to the same CMP.
49219 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49220 return SDValue();
49221
49222 SDValue CMP00 = CMP0->getOperand(0);
49223 SDValue CMP01 = CMP0->getOperand(1);
49224 EVT VT = CMP00.getValueType();
49225
49226 if (VT == MVT::f32 || VT == MVT::f64 ||
49227 (VT == MVT::f16 && Subtarget.hasFP16())) {
49228 bool ExpectingFlags = false;
49229 // Check for any users that want flags:
49230 for (const SDNode *U : N->uses()) {
49231 if (ExpectingFlags)
49232 break;
49233
49234 switch (U->getOpcode()) {
49235 default:
49236 case ISD::BR_CC:
49237 case ISD::BRCOND:
49238 case ISD::SELECT:
49239 ExpectingFlags = true;
49240 break;
49241 case ISD::CopyToReg:
49242 case ISD::SIGN_EXTEND:
49243 case ISD::ZERO_EXTEND:
49244 case ISD::ANY_EXTEND:
49245 break;
49246 }
49247 }
49248
49249 if (!ExpectingFlags) {
49250 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49251 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49252
49253 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49254 X86::CondCode tmp = cc0;
49255 cc0 = cc1;
49256 cc1 = tmp;
49257 }
49258
49259 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49260 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49261 // FIXME: need symbolic constants for these magic numbers.
49262 // See X86ATTInstPrinter.cpp:printSSECC().
49263 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49264 if (Subtarget.hasAVX512()) {
49265 SDValue FSetCC =
49266 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49267 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49268 // Need to fill with zeros to ensure the bitcast will produce zeroes
49269 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49270 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49271 DAG.getConstant(0, DL, MVT::v16i1),
49272 FSetCC, DAG.getIntPtrConstant(0, DL));
49273 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49274 N->getSimpleValueType(0));
49275 }
49276 SDValue OnesOrZeroesF =
49277 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49278 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49279
49280 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49281 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49282
49283 if (is64BitFP && !Subtarget.is64Bit()) {
49284 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49285 // 64-bit integer, since that's not a legal type. Since
49286 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49287 // bits, but can do this little dance to extract the lowest 32 bits
49288 // and work with those going forward.
49289 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49290 OnesOrZeroesF);
49291 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49292 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49293 Vector32, DAG.getIntPtrConstant(0, DL));
49294 IntVT = MVT::i32;
49295 }
49296
49297 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49298 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49299 DAG.getConstant(1, DL, IntVT));
49300 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49301 ANDed);
49302 return OneBitOfTruth;
49303 }
49304 }
49305 }
49306 }
49307 return SDValue();
49308}
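
Illustrative only: the FSETCC result is an all-ones/all-zeros value, so ANDing with 1 and truncating reproduces the boolean the original SETCC pair computed; fsetcc_eq below is a hypothetical scalar stand-in, and the NaN case shows why the COND_E/COND_NP pairing maps to the ordered-equal predicate.

  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Hypothetical scalar model of FSETCC with the ordered-equal predicate:
  // all-ones when equal, all-zeros otherwise (including unordered/NaN).
  static uint32_t fsetcc_eq(float a, float b) { return a == b ? ~0u : 0u; }

  int main() {
    assert((fsetcc_eq(1.0f, 1.0f) & 1u) == 1u);
    assert((fsetcc_eq(1.0f, 2.0f) & 1u) == 0u);
    float NaN = std::nanf("");
    assert((fsetcc_eq(NaN, NaN) & 1u) == 0u);  // unordered compares as not-equal
    return 0;
  }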
49309
49310/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49311static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49312 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49313
49314 MVT VT = N->getSimpleValueType(0);
49315 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49316 return SDValue();
49317
49318 SDValue X, Y;
49319 SDValue N0 = N->getOperand(0);
49320 SDValue N1 = N->getOperand(1);
49321
49322 if (SDValue Not = IsNOT(N0, DAG)) {
49323 X = Not;
49324 Y = N1;
49325 } else if (SDValue Not = IsNOT(N1, DAG)) {
49326 X = Not;
49327 Y = N0;
49328 } else
49329 return SDValue();
49330
49331 X = DAG.getBitcast(VT, X);
49332 Y = DAG.getBitcast(VT, Y);
49333 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49334}
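
Illustrative only: ANDNP computes ~X & Y per element, which is exactly the (and (xor X, -1), Y) pattern matched above; andnp below is a hypothetical scalar helper.

  #include <cassert>
  #include <cstdint>

  static uint32_t andnp(uint32_t x, uint32_t y) { return ~x & y; }  // scalar ANDNP

  int main() {
    const uint32_t Samples[] = {0u, 0x0F0F0F0Fu, 0x12345678u, 0xFFFFFFFFu};
    for (uint32_t x : Samples)
      for (uint32_t y : Samples)
        // (and (xor X, -1), Y) == andnp(X, Y)
        assert(((x ^ 0xFFFFFFFFu) & y) == andnp(x, y));
    return 0;
  }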
49335
49336/// Try to fold:
49337/// and (vector_shuffle<Z,...,Z>
49338/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49339/// ->
49340/// andnp (vector_shuffle<Z,...,Z>
49341/// (insert_vector_elt undef, X, Z), undef), Y
49342static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49343 const X86Subtarget &Subtarget) {
49344 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49345
49346 EVT VT = N->getValueType(0);
49347 // Do not split 256- and 512-bit vectors with SSE2, as they overwrite the
49348 // original value and require extra moves.
49349 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49350 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49351 return SDValue();
49352
49353 auto GetNot = [&DAG](SDValue V) {
49354 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49355 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49356 // end-users are ISD::AND including cases
49357 // (and(extract_vector_element(SVN), Y)).
49358 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49359 !SVN->getOperand(1).isUndef()) {
49360 return SDValue();
49361 }
49362 SDValue IVEN = SVN->getOperand(0);
49363 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49364 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49365 return SDValue();
49366 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49367 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49368 return SDValue();
49369 SDValue Src = IVEN.getOperand(1);
49370 if (SDValue Not = IsNOT(Src, DAG)) {
49371 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49372 SDValue NotIVEN =
49373 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49374 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49375 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49376 SVN->getOperand(1), SVN->getMask());
49377 }
49378 return SDValue();
49379 };
49380
49381 SDValue X, Y;
49382 SDValue N0 = N->getOperand(0);
49383 SDValue N1 = N->getOperand(1);
49384
49385 if (SDValue Not = GetNot(N0)) {
49386 X = Not;
49387 Y = N1;
49388 } else if (SDValue Not = GetNot(N1)) {
49389 X = Not;
49390 Y = N0;
49391 } else
49392 return SDValue();
49393
49394 X = DAG.getBitcast(VT, X);
49395 Y = DAG.getBitcast(VT, Y);
49396 SDLoc DL(N);
49397 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49398 // AVX2.
49399 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49400 SDValue LoX, HiX;
49401 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49402 SDValue LoY, HiY;
49403 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49404 EVT SplitVT = LoX.getValueType();
49405 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49406 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49407 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49408 }
49409 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49410}
49411
49412// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49413// logical operations, like in the example below.
49414// or (and (truncate x, truncate y)),
49415// (xor (truncate z, build_vector (constants)))
49416// Given a target type \p VT, we generate
49417// or (and x, y), (xor z, zext(build_vector (constants)))
49418 // given that x, y and z are of type \p VT. We can do so if each operand is
49419 // either a truncate from VT, a vector of constants (second operand only), or
49420 // something that can itself be recursively promoted.
49421static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49422 unsigned Depth) {
49423 // Limit recursion to avoid excessive compile times.
49424 if (Depth >= SelectionDAG::MaxRecursionDepth)
49425 return SDValue();
49426
49427 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49428 N->getOpcode() != ISD::OR)
49429 return SDValue();
49430
49431 SDValue N0 = N->getOperand(0);
49432 SDValue N1 = N->getOperand(1);
49433 SDLoc DL(N);
49434
49435 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49436 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49437 return SDValue();
49438
49439 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49440 N0 = NN0;
49441 else {
49442 // The Left side has to be a trunc.
49443 if (N0.getOpcode() != ISD::TRUNCATE)
49444 return SDValue();
49445
49446 // The type of the truncated inputs.
49447 if (N0.getOperand(0).getValueType() != VT)
49448 return SDValue();
49449
49450 N0 = N0.getOperand(0);
49451 }
49452
49453 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49454 N1 = NN1;
49455 else {
49456 // The right side has to be a 'trunc' or a constant vector.
49457 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49458 N1.getOperand(0).getValueType() == VT;
49459 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49460 return SDValue();
49461
49462 if (RHSTrunc)
49463 N1 = N1.getOperand(0);
49464 else
49465 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49466 }
49467
49468 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49469}
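
Illustrative only: the promotion above is sound because AND/OR/XOR commute with truncation, so the logic can run at the wide type and be truncated once.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Samples[] = {0ull, 0x123456789ABCDEF0ull, ~0ull};
    for (uint64_t x : Samples)
      for (uint64_t y : Samples) {
        // trunc(x) BITOP trunc(y) == trunc(x BITOP y)
        assert((uint32_t(x) & uint32_t(y)) == uint32_t(x & y));
        assert((uint32_t(x) | uint32_t(y)) == uint32_t(x | y));
        assert((uint32_t(x) ^ uint32_t(y)) == uint32_t(x ^ y));
      }
    return 0;
  }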
49470
49471// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
49472// register. In most cases we actually compare or select YMM-sized registers
49473// and mixing the two types creates horrible code. This method optimizes
49474// some of the transition sequences.
49475// Even with AVX-512 this is still useful for removing casts around logical
49476// operations on vXi1 mask types.
49477static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49478 const X86Subtarget &Subtarget) {
49479 EVT VT = N->getValueType(0);
49480 assert(VT.isVector() && "Expected vector type");
49481
49482 SDLoc DL(N);
49483 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49484         N->getOpcode() == ISD::ZERO_EXTEND ||
49485         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49486
49487 SDValue Narrow = N->getOperand(0);
49488 EVT NarrowVT = Narrow.getValueType();
49489
49490 // Generate the wide operation.
49491 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49492 if (!Op)
49493 return SDValue();
49494 switch (N->getOpcode()) {
49495 default: llvm_unreachable("Unexpected opcode");
49496 case ISD::ANY_EXTEND:
49497 return Op;
49498 case ISD::ZERO_EXTEND:
49499 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49500 case ISD::SIGN_EXTEND:
49501 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49502 Op, DAG.getValueType(NarrowVT));
49503 }
49504}
49505
49506static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49507 unsigned FPOpcode;
49508 switch (Opcode) {
49509 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49510 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49511 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49512 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49513 }
49514 return FPOpcode;
49515}
49516
49517/// If both input operands of a logic op are being cast from floating-point
49518/// types or FP compares, try to convert this into a floating-point logic node
49519/// to avoid unnecessary moves from SSE to integer registers.
49520static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49521 TargetLowering::DAGCombinerInfo &DCI,
49522 const X86Subtarget &Subtarget) {
49523 EVT VT = N->getValueType(0);
49524 SDValue N0 = N->getOperand(0);
49525 SDValue N1 = N->getOperand(1);
49526 SDLoc DL(N);
49527
49528 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49529 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49530 return SDValue();
49531
49532 SDValue N00 = N0.getOperand(0);
49533 SDValue N10 = N1.getOperand(0);
49534 EVT N00Type = N00.getValueType();
49535 EVT N10Type = N10.getValueType();
49536
49537 // Ensure that both types are the same and are legal scalar fp types.
49538 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49539 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49540 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49541 return SDValue();
49542
49543 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49544 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49545 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49546 return DAG.getBitcast(VT, FPLogic);
49547 }
49548
49549 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49550 !N1.hasOneUse())
49551 return SDValue();
49552
49553 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49554 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49555
49556 // The vector ISA for FP predicates is incomplete before AVX, so converting
49557 // COMIS* to CMPS* may not be a win before AVX.
49558 if (!Subtarget.hasAVX() &&
49559 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49560 return SDValue();
49561
49562 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49563 // and vector logic:
49564 // logic (setcc N00, N01), (setcc N10, N11) -->
49565 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
49566 unsigned NumElts = 128 / N00Type.getSizeInBits();
49567 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49568 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49569 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49570 SDValue N01 = N0.getOperand(1);
49571 SDValue N11 = N1.getOperand(1);
49572 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49573 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49574 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49575 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49576 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49577 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49578 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49579 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49580}
49581
49582// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49583// to reduce XMM->GPR traffic.
49584static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49585 unsigned Opc = N->getOpcode();
49586 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49587        "Unexpected bit opcode");
49588
49589 SDValue N0 = N->getOperand(0);
49590 SDValue N1 = N->getOperand(1);
49591
49592 // Both operands must be single use MOVMSK.
49593 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49594 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49595 return SDValue();
49596
49597 SDValue Vec0 = N0.getOperand(0);
49598 SDValue Vec1 = N1.getOperand(0);
49599 EVT VecVT0 = Vec0.getValueType();
49600 EVT VecVT1 = Vec1.getValueType();
49601
49602 // Both MOVMSK operands must be from vectors of the same size and same element
49603 // size, but it's OK for an fp/int diff.
49604 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49605 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49606 return SDValue();
49607
49608 SDLoc DL(N);
49609 unsigned VecOpc =
49610 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49611 SDValue Result =
49612 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49613 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49614}
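
Illustrative aside: a minimal standalone sketch of the source-level shape this MOVMSK fold targets, written with SSE intrinsics. The function names and test values are invented for illustration and are not part of the LLVM source; it assumes an SSE-capable x86-64 host.

    #include <immintrin.h>
    #include <cassert>

    // BITOP(MOVMSK(X),MOVMSK(Y)) and MOVMSK(BITOP(X,Y)) yield the same mask,
    // so the combine keeps the OR in the XMM domain and emits a single MOVMSK.
    int movmsk_then_or(__m128 a, __m128 b) {
      return _mm_movemask_ps(a) | _mm_movemask_ps(b); // two MOVMSKs + GPR OR
    }
    int or_then_movmsk(__m128 a, __m128 b) {
      return _mm_movemask_ps(_mm_or_ps(a, b));        // vector OR + one MOVMSK
    }

    int main() {
      __m128 a = _mm_set_ps(-1.0f, 2.0f, -3.0f, 4.0f);
      __m128 b = _mm_set_ps(5.0f, -6.0f, 7.0f, -8.0f);
      assert(movmsk_then_or(a, b) == or_then_movmsk(a, b));
      return 0;
    }
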
49615
49616// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49617// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49618// handles in InstCombine.
49619static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49620 unsigned Opc = N->getOpcode();
49621 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49622 "Unexpected bit opcode");
49623
49624 SDValue N0 = N->getOperand(0);
49625 SDValue N1 = N->getOperand(1);
49626 EVT VT = N->getValueType(0);
49627
49628 // Both operands must be single use.
49629 if (!N0.hasOneUse() || !N1.hasOneUse())
49630 return SDValue();
49631
49632 // Search for matching shifts.
49633 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49634 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49635
49636 unsigned BCOpc = BC0.getOpcode();
49637 EVT BCVT = BC0.getValueType();
49638 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49639 return SDValue();
49640
49641 switch (BCOpc) {
49642 case X86ISD::VSHLI:
49643 case X86ISD::VSRLI:
49644 case X86ISD::VSRAI: {
49645 if (BC0.getOperand(1) != BC1.getOperand(1))
49646 return SDValue();
49647
49648 SDLoc DL(N);
49649 SDValue BitOp =
49650 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49651 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49652 return DAG.getBitcast(VT, Shift);
49653 }
49654 }
49655
49656 return SDValue();
49657}
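
Illustrative aside: the scalar analogue of this shift/bit-op distributive fold, as a minimal sketch. The helper names are invented for illustration.

    #include <cassert>
    #include <cstdint>

    // (x << s) | (y << s) == (x | y) << s for a common shift amount; the same
    // holds for AND/XOR and for the right shifts handled by the fold above.
    uint32_t bitop_of_shifts(uint32_t x, uint32_t y, unsigned s) {
      return (x << s) | (y << s);
    }
    uint32_t shift_of_bitop(uint32_t x, uint32_t y, unsigned s) {
      return (x | y) << s;
    }

    int main() {
      assert(bitop_of_shifts(0x1234u, 0xABCDu, 5) == shift_of_bitop(0x1234u, 0xABCDu, 5));
      return 0;
    }
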
49658
49659/// If this is a zero/all-bits result that is bitwise-anded with a low bits
49660 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49661/// with a shift-right to eliminate loading the vector constant mask value.
49662static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49663 const X86Subtarget &Subtarget) {
49664 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49665 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49666 EVT VT = Op0.getValueType();
49667 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49668 return SDValue();
49669
49670 // Try to convert an "is positive" signbit masking operation into arithmetic
49671 // shift and "andn". This saves a materialization of a -1 vector constant.
49672 // The "is negative" variant should be handled more generally because it only
49673 // requires "and" rather than "andn":
49674 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49675 //
49676 // This is limited to the original type to avoid producing even more bitcasts.
49677 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49678 // will be profitable.
49679 if (N->getValueType(0) == VT &&
49680 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
49681 SDValue X, Y;
49682 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49683 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49684 X = Op1.getOperand(0);
49685 Y = Op0;
49686 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49687 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49688 X = Op0.getOperand(0);
49689 Y = Op1;
49690 }
49691 if (X && Y) {
49692 SDLoc DL(N);
49693 SDValue Sra =
49694 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49695 VT.getScalarSizeInBits() - 1, DAG);
49696 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49697 }
49698 }
49699
49700 APInt SplatVal;
49701 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49702 !SplatVal.isMask())
49703 return SDValue();
49704
49705 // Don't prevent creation of ANDN.
49706 if (isBitwiseNot(Op0))
49707 return SDValue();
49708
49709 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
49710 return SDValue();
49711
49712 unsigned EltBitWidth = VT.getScalarSizeInBits();
49713 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49714 return SDValue();
49715
49716 SDLoc DL(N);
49717 unsigned ShiftVal = SplatVal.countr_one();
49718 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49719 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49720 return DAG.getBitcast(N->getValueType(0), Shift);
49721}
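
Illustrative aside: a scalar model of the mask-to-shift idea, assuming the left operand is known to be all-zeros or all-ones (as a compare result would be). The names and the k = 3 mask are illustrative only.

    #include <cassert>
    #include <cstdint>

    // Assume 'v' is known to be all-zeros or all-ones (e.g. a compare result).
    // AND-ing with the low-bits mask 2^k - 1 then equals a logical shift right
    // by (BitWidth - k), so no mask constant has to be materialized.
    uint32_t and_with_mask(int32_t v) { return (uint32_t)v & 0x7u; }      // k = 3
    uint32_t srl_instead(int32_t v)   { return (uint32_t)v >> (32 - 3); }

    int main() {
      assert(and_with_mask(0) == srl_instead(0));
      assert(and_with_mask(-1) == srl_instead(-1));
      return 0;
    }
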
49722
49723// Get the index node from the lowered DAG of a GEP IR instruction with one
49724// indexing dimension.
49725static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49726 if (Ld->isIndexed())
49727 return SDValue();
49728
49729 SDValue Base = Ld->getBasePtr();
49730
49731 if (Base.getOpcode() != ISD::ADD)
49732 return SDValue();
49733
49734 SDValue ShiftedIndex = Base.getOperand(0);
49735
49736 if (ShiftedIndex.getOpcode() != ISD::SHL)
49737 return SDValue();
49738
49739 return ShiftedIndex.getOperand(0);
49740
49741}
49742
49743static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49744 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
49745 switch (VT.getSizeInBits()) {
49746 default: return false;
49747 case 64: return Subtarget.is64Bit();
49748 case 32: return true;
49749 }
49750 }
49751 return false;
49752}
49753
49754 // This function recognizes cases where the X86 bzhi instruction can replace an
49755 // 'and-load' sequence.
49756 // When loading an integer value from an array of constants defined as
49757 // follows:
49758 //
49759 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49760 //
49761 // and then applying a bitwise AND on the result with another input, this is
49762 // equivalent to performing bzhi (zero high bits) on the input, using the
49763 // same index as the load.
49764static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49765 const X86Subtarget &Subtarget) {
49766 MVT VT = Node->getSimpleValueType(0);
49767 SDLoc dl(Node);
49768
49769 // Check if subtarget has BZHI instruction for the node's type
49770 if (!hasBZHI(Subtarget, VT))
49771 return SDValue();
49772
49773 // Try matching the pattern for both operands.
49774 for (unsigned i = 0; i < 2; i++) {
49775 SDValue N = Node->getOperand(i);
49776 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49777
49778 // Bail out if the operand is not a load instruction.
49779 if (!Ld)
49780 return SDValue();
49781
49782 const Value *MemOp = Ld->getMemOperand()->getValue();
49783
49784 if (!MemOp)
49785 return SDValue();
49786
49787 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49788 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49789 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49790
49791 Constant *Init = GV->getInitializer();
49792 Type *Ty = Init->getType();
49793 if (!isa<ConstantDataArray>(Init) ||
49794 !Ty->getArrayElementType()->isIntegerTy() ||
49795 Ty->getArrayElementType()->getScalarSizeInBits() !=
49796 VT.getSizeInBits() ||
49797 Ty->getArrayNumElements() >
49798 Ty->getArrayElementType()->getScalarSizeInBits())
49799 continue;
49800
49801 // Check if the array's constant elements are suitable to our case.
49802 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49803 bool ConstantsMatch = true;
49804 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49805 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49806 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49807 ConstantsMatch = false;
49808 break;
49809 }
49810 }
49811 if (!ConstantsMatch)
49812 continue;
49813
49814 // Do the transformation (for a 32-bit type):
49815 // -> (and (load arr[idx]), inp)
49816 // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
49817 // which will be replaced with one bzhi instruction.
49818 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49819 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49820
49821 // Get the Node which indexes into the array.
49822 SDValue Index = getIndexFromUnindexedLoad(Ld);
49823 if (!Index)
49824 return SDValue();
49825 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49826
49827 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49828 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49829
49830 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49831 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49832
49833 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49834 }
49835 }
49836 }
49837 }
49838 return SDValue();
49839}
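
Illustrative aside: a minimal sketch of the source pattern this BZHI combine recognizes, using the BMI2 intrinsic. It assumes a BMI2-capable target (e.g. compiled with -mbmi2) and the mask-table shape described above; the table and function names are invented.

    #include <immintrin.h>
    #include <cassert>
    #include <cstdint>

    // Constant table of low-bit masks, table[i] == (1u << i) - 1, which is the
    // initializer shape the combine checks for.
    static const uint32_t table[8] = {0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F};

    // 'x & table[idx]' clears the bits of x from position idx upwards; BZHI does
    // the same directly from the index, removing the memory access.
    uint32_t and_load(uint32_t x, unsigned idx)   { return x & table[idx]; }
    uint32_t bzhi_equiv(uint32_t x, unsigned idx) { return _bzhi_u32(x, idx); }

    int main() {
      for (unsigned i = 0; i < 8; ++i)
        assert(and_load(0xDEADBEEFu, i) == bzhi_equiv(0xDEADBEEFu, i));
      return 0;
    }
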
49840
49841// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
49842 // Where C is a mask containing the same number of bits as the setcc and
49843 // where the setcc will zero the upper bits of the k-register for free. We can
49844 // replace the undef in the concat with 0s and remove the AND. This mainly
49845 // helps with v2i1/v4i1 setcc being cast to scalar.
49846static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49847 const X86Subtarget &Subtarget) {
49848 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49849
49850 EVT VT = N->getValueType(0);
49851
49852 // Make sure this is an AND with constant. We will check the value of the
49853 // constant later.
49854 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49855 if (!C1)
49856 return SDValue();
49857
49858 // This is implied by the ConstantSDNode.
49859 assert(!VT.isVector() && "Expected scalar VT!");
49860
49861 SDValue Src = N->getOperand(0);
49862 if (!Src.hasOneUse())
49863 return SDValue();
49864
49865 // (Optionally) peek through any_extend().
49866 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49867 if (!Src.getOperand(0).hasOneUse())
49868 return SDValue();
49869 Src = Src.getOperand(0);
49870 }
49871
49872 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49873 return SDValue();
49874
49875 Src = Src.getOperand(0);
49876 EVT SrcVT = Src.getValueType();
49877
49878 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49879 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49880 !TLI.isTypeLegal(SrcVT))
49881 return SDValue();
49882
49883 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49884 return SDValue();
49885
49886 // We only care about the first subvector of the concat; we expect the
49887 // other subvectors to be ignored due to the AND if we make the change.
49888 SDValue SubVec = Src.getOperand(0);
49889 EVT SubVecVT = SubVec.getValueType();
49890
49891 // The RHS of the AND should be a mask with as many bits as SubVec.
49892 if (!TLI.isTypeLegal(SubVecVT) ||
49893 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49894 return SDValue();
49895
49896 // First subvector should be a setcc with a legal result type or an
49897 // AND containing at least one setcc with a legal result type.
49898 auto IsLegalSetCC = [&](SDValue V) {
49899 if (V.getOpcode() != ISD::SETCC)
49900 return false;
49901 EVT SetccVT = V.getOperand(0).getValueType();
49902 if (!TLI.isTypeLegal(SetccVT) ||
49903 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49904 return false;
49905 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49906 return false;
49907 return true;
49908 };
49909 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49910 (IsLegalSetCC(SubVec.getOperand(0)) ||
49911 IsLegalSetCC(SubVec.getOperand(1))))))
49912 return SDValue();
49913
49914 // We passed all the checks. Rebuild the concat_vectors with zeroes
49915 // and cast it back to VT.
49916 SDLoc dl(N);
49917 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49918 DAG.getConstant(0, dl, SubVecVT));
49919 Ops[0] = SubVec;
49920 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49921 Ops);
49922 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49923 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49924}
49925
49926static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49927 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49928 // We don't want to go crazy with the recursion here. This isn't a super
49929 // important optimization.
49930 static constexpr unsigned kMaxDepth = 2;
49931
49932 // Only do this re-ordering if op has one use.
49933 if (!Op.hasOneUse())
49934 return SDValue();
49935
49936 SDLoc DL(Op);
49937 // If we hit another associative op, recurse further.
49938 if (Op.getOpcode() == Opc) {
49939 // Done recursing.
49940 if (Depth++ >= kMaxDepth)
49941 return SDValue();
49942
49943 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49944 if (SDValue R =
49945 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49946 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49947 Op.getOperand(1 - OpIdx));
49948
49949 } else if (Op.getOpcode() == ISD::SUB) {
49950 if (Opc == ISD::AND) {
49951 // BLSI: (and x, (sub 0, x))
49952 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49953 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49954 }
49955 // Opc must be ISD::AND or ISD::XOR
49956 // BLSR: (and x, (sub x, 1))
49957 // BLSMSK: (xor x, (sub x, 1))
49958 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49959 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49960
49961 } else if (Op.getOpcode() == ISD::ADD) {
49962 // Opc must be ISD::AND or ISD::XOR
49963 // BLSR: (and x, (add x, -1))
49964 // BLSMSK: (xor x, (add x, -1))
49965 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49966 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49967 }
49968 return SDValue();
49969}
49970
49971static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49972 const X86Subtarget &Subtarget) {
49973 EVT VT = N->getValueType(0);
49974 // Make sure this node is a candidate for BMI instructions.
49975 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49976 (VT != MVT::i32 && VT != MVT::i64))
49977 return SDValue();
49978
49979 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49980
49981 // Try and match LHS and RHS.
49982 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49983 if (SDValue OpMatch =
49984 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49985 N->getOperand(1 - OpIdx), 0))
49986 return OpMatch;
49987 return SDValue();
49988}
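
Illustrative aside: the classic BMI idioms matched by getBMIMatchingOp/combineBMILogicOp, written at the source level. This is only a sketch; whether BLSR/BLSMSK/BLSI are actually selected depends on -mbmi and the one-use checks above.

    #include <cassert>
    #include <cstdint>

    uint32_t blsr(uint32_t x)   { return x & (x - 1); }  // clear lowest set bit
    uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }  // mask up to lowest set bit
    uint32_t blsi(uint32_t x)   { return x & (0u - x); } // isolate lowest set bit

    int main() {
      assert(blsr(0b1011000u)   == 0b1010000u);
      assert(blsmsk(0b1011000u) == 0b0001111u);
      assert(blsi(0b1011000u)   == 0b0001000u);
      return 0;
    }
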
49989
49990static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49991 TargetLowering::DAGCombinerInfo &DCI,
49992 const X86Subtarget &Subtarget) {
49993 SDValue N0 = N->getOperand(0);
49994 SDValue N1 = N->getOperand(1);
49995 EVT VT = N->getValueType(0);
49996 SDLoc dl(N);
49997 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49998
49999 // If this is SSE1 only convert to FAND to avoid scalarization.
50000 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50001 return DAG.getBitcast(MVT::v4i32,
50002 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
50003 DAG.getBitcast(MVT::v4f32, N0),
50004 DAG.getBitcast(MVT::v4f32, N1)));
50005 }
50006
50007 // Use a 32-bit and+zext if upper bits known zero.
50008 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
50009 APInt HiMask = APInt::getHighBitsSet(64, 32);
50010 if (DAG.MaskedValueIsZero(N1, HiMask) ||
50011 DAG.MaskedValueIsZero(N0, HiMask)) {
50012 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
50013 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
50014 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
50015 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
50016 }
50017 }
50018
50019 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
50020 // TODO: Support multiple SrcOps.
50021 if (VT == MVT::i1) {
50022 SmallVector<SDValue, 2> SrcOps;
50023 SmallVector<APInt, 2> SrcPartials;
50024 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
50025 SrcOps.size() == 1) {
50026 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50027 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50028 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50029 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50030 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50031 if (Mask) {
50032 assert(SrcPartials[0].getBitWidth() == NumElts &&
50033 "Unexpected partial reduction mask");
50034 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50035 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50036 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
50037 }
50038 }
50039 }
50040
50041 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
50042 return V;
50043
50044 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50045 return R;
50046
50047 if (SDValue R = combineBitOpWithShift(N, DAG))
50048 return R;
50049
50050 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50051 return FPLogic;
50052
50053 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50054 return R;
50055
50056 if (DCI.isBeforeLegalizeOps())
50057 return SDValue();
50058
50059 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50060 return R;
50061
50062 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50063 return R;
50064
50065 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50066 return ShiftRight;
50067
50068 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50069 return R;
50070
50071 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50072 // iff c2 is an all/no-bits mask, i.e. a select-with-zero mask.
50073 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50074 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50075 unsigned Opc0 = N0.getOpcode();
50076 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50077 getTargetConstantFromNode(N0.getOperand(1)) &&
50078 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50079 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50080 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50081 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50082 }
50083 }
50084
50085 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant;
50086 // this avoids a slow variable shift (moving the shift amount to ECX etc.).
50087 if (isOneConstant(N1) && N0->hasOneUse()) {
50088 SDValue Src = N0;
50089 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50090 Src.getOpcode() == ISD::TRUNCATE) &&
50091 Src.getOperand(0)->hasOneUse())
50092 Src = Src.getOperand(0);
50093 bool ContainsNOT = false;
50094 X86::CondCode X86CC = X86::COND_B;
50095 // Peek through AND(NOT(SRL(X,Y)),1).
50096 if (isBitwiseNot(Src)) {
50097 Src = Src.getOperand(0);
50098 X86CC = X86::COND_AE;
50099 ContainsNOT = true;
50100 }
50101 if (Src.getOpcode() == ISD::SRL &&
50102 !isa<ConstantSDNode>(Src.getOperand(1))) {
50103 SDValue BitNo = Src.getOperand(1);
50104 Src = Src.getOperand(0);
50105 // Peek through AND(SRL(NOT(X),Y),1).
50106 if (isBitwiseNot(Src)) {
50107 Src = Src.getOperand(0);
50108 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50109 ContainsNOT = true;
50110 }
50111 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50112 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50113 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50114 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50115 }
50116 }
50117
50118 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50119 // Attempt to recursively combine a bitmask AND with shuffles.
50120 SDValue Op(N, 0);
50121 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50122 return Res;
50123
50124 // If either operand is a constant mask, then only the elements that aren't
50125 // zero are actually demanded by the other operand.
50126 auto GetDemandedMasks = [&](SDValue Op) {
50127 APInt UndefElts;
50128 SmallVector<APInt> EltBits;
50129 int NumElts = VT.getVectorNumElements();
50130 int EltSizeInBits = VT.getScalarSizeInBits();
50131 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50132 APInt DemandedElts = APInt::getAllOnes(NumElts);
50133 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50134 EltBits)) {
50135 DemandedBits.clearAllBits();
50136 DemandedElts.clearAllBits();
50137 for (int I = 0; I != NumElts; ++I) {
50138 if (UndefElts[I]) {
50139 // We can't assume an undef src element gives an undef dst - the
50140 // other src might be zero.
50141 DemandedBits.setAllBits();
50142 DemandedElts.setBit(I);
50143 } else if (!EltBits[I].isZero()) {
50144 DemandedBits |= EltBits[I];
50145 DemandedElts.setBit(I);
50146 }
50147 }
50148 }
50149 return std::make_pair(DemandedBits, DemandedElts);
50150 };
50151 APInt Bits0, Elts0;
50152 APInt Bits1, Elts1;
50153 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50154 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50155
50156 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50157 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50158 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50159 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50160 if (N->getOpcode() != ISD::DELETED_NODE)
50161 DCI.AddToWorklist(N);
50162 return SDValue(N, 0);
50163 }
50164
50165 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50166 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50167 if (NewN0 || NewN1)
50168 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50169 NewN1 ? NewN1 : N1);
50170 }
50171
50172 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50173 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50174 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50175 isa<ConstantSDNode>(N0.getOperand(1))) {
50176 SDValue BitMask = N1;
50177 SDValue SrcVec = N0.getOperand(0);
50178 EVT SrcVecVT = SrcVec.getValueType();
50179
50180 // Check that the constant bitmask masks whole bytes.
50181 APInt UndefElts;
50182 SmallVector<APInt, 64> EltBits;
50183 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50184 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50185 llvm::all_of(EltBits, [](const APInt &M) {
50186 return M.isZero() || M.isAllOnes();
50187 })) {
50188 unsigned NumElts = SrcVecVT.getVectorNumElements();
50189 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50190 unsigned Idx = N0.getConstantOperandVal(1);
50191
50192 // Create a root shuffle mask from the byte mask and the extracted index.
50193 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50194 for (unsigned i = 0; i != Scale; ++i) {
50195 if (UndefElts[i])
50196 continue;
50197 int VecIdx = Scale * Idx + i;
50198 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50199 }
50200
50201 if (SDValue Shuffle = combineX86ShufflesRecursively(
50202 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50203 X86::MaxShuffleCombineDepth,
50204 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50205 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50206 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50207 N0.getOperand(1));
50208 }
50209 }
50210
50211 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50212 return R;
50213
50214 return SDValue();
50215}
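
Illustrative aside: the source-level shape of the AND(SRL(X,Y),1) -> BT fold handled above, i.e. testing a single variable bit. A sketch with invented names; whether BT+SETCC is actually emitted depends on the subtarget checks in the code.

    #include <cassert>
    #include <cstdint>

    // Testing one variable bit: the backend can use BT instead of a variable
    // shift whose amount would have to be moved into ECX.
    bool test_bit(uint32_t x, unsigned y) { return (x >> y) & 1u; }

    int main() {
      assert(test_bit(0b1010u, 1) == true);
      assert(test_bit(0b1010u, 2) == false);
      return 0;
    }
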
50216
50217// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50218static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50219 const X86Subtarget &Subtarget) {
50220 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50221
50222 MVT VT = N->getSimpleValueType(0);
50223 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50224 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50225 return SDValue();
50226
50227 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50228 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50229 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50230 return SDValue();
50231
50232 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50233 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50234 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50235 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50236 return SDValue();
50237
50238 // Attempt to extract constant byte masks.
50239 APInt UndefElts0, UndefElts1;
50240 SmallVector<APInt, 32> EltBits0, EltBits1;
50241 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50242 false, false))
50243 return SDValue();
50244 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50245 false, false))
50246 return SDValue();
50247
50248 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50249 // TODO - add UNDEF elts support.
50250 if (UndefElts0[i] || UndefElts1[i])
50251 return SDValue();
50252 if (EltBits0[i] != ~EltBits1[i])
50253 return SDValue();
50254 }
50255
50256 SDLoc DL(N);
50257
50258 if (useVPTERNLOG(Subtarget, VT)) {
50259 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50260 // VPTERNLOG is only available for vXi32/vXi64 types.
50261 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50262 MVT OpVT =
50263 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50264 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50265 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50266 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50267 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50268 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50269 DAG, Subtarget);
50270 return DAG.getBitcast(VT, Res);
50271 }
50272
50273 SDValue X = N->getOperand(0);
50274 SDValue Y =
50275 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50276 DAG.getBitcast(VT, N1.getOperand(0)));
50277 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50278}
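
Illustrative aside: the VPTERNLOG immediate is an 8-entry truth table indexed by (a<<2)|(b<<1)|c. A small self-contained check that the bit-select function A?B:C really encodes as the 0xCA constant used above; the helper name is invented.

    #include <cassert>
    #include <cstdint>

    // Bit ((a << 2) | (b << 1) | c) of the VPTERNLOG immediate holds f(a,b,c).
    uint8_t ternlogImm(bool (*f)(bool, bool, bool)) {
      uint8_t imm = 0;
      for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
          for (int c = 0; c < 2; ++c)
            if (f(a, b, c))
              imm |= (uint8_t)(1u << ((a << 2) | (b << 1) | c));
      return imm;
    }

    int main() {
      assert(ternlogImm([](bool a, bool b, bool c) { return a ? b : c; }) == 0xCA);
      return 0;
    }
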
50279
50280// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50281static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50282 if (N->getOpcode() != ISD::OR)
50283 return false;
50284
50285 SDValue N0 = N->getOperand(0);
50286 SDValue N1 = N->getOperand(1);
50287
50288 // Canonicalize AND to LHS.
50289 if (N1.getOpcode() == ISD::AND)
50290 std::swap(N0, N1);
50291
50292 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50293 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50294 return false;
50295
50296 Mask = N1.getOperand(0);
50297 X = N1.getOperand(1);
50298
50299 // Check to see if the mask appeared in both the AND and ANDNP.
50300 if (N0.getOperand(0) == Mask)
50301 Y = N0.getOperand(1);
50302 else if (N0.getOperand(1) == Mask)
50303 Y = N0.getOperand(0);
50304 else
50305 return false;
50306
50307 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
50308 // ANDNP combine allows other combines to happen that prevent matching.
50309 return true;
50310}
50311
50312// Try to fold:
50313// (or (and (m, y), (pandn m, x)))
50314// into:
50315// (vselect m, x, y)
50316// As a special case, try to fold:
50317// (or (and (m, (sub 0, x)), (pandn m, x)))
50318// into:
50319// (sub (xor X, M), M)
50320static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50321 const X86Subtarget &Subtarget) {
50322 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50323
50324 EVT VT = N->getValueType(0);
50325 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50326 (VT.is256BitVector() && Subtarget.hasInt256())))
50327 return SDValue();
50328
50329 SDValue X, Y, Mask;
50330 if (!matchLogicBlend(N, X, Y, Mask))
50331 return SDValue();
50332
50333 // Validate that X, Y, and Mask are bitcasts, and see through them.
50334 Mask = peekThroughBitcasts(Mask);
50335 X = peekThroughBitcasts(X);
50336 Y = peekThroughBitcasts(Y);
50337
50338 EVT MaskVT = Mask.getValueType();
50339 unsigned EltBits = MaskVT.getScalarSizeInBits();
50340
50341 // TODO: Attempt to handle floating point cases as well?
50342 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50343 return SDValue();
50344
50345 SDLoc DL(N);
50346
50347 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50348 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50349 DAG, Subtarget))
50350 return Res;
50351
50352 // PBLENDVB is only available on SSE 4.1.
50353 if (!Subtarget.hasSSE41())
50354 return SDValue();
50355
50356 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50357 if (Subtarget.hasVLX())
50358 return SDValue();
50359
50360 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50361
50362 X = DAG.getBitcast(BlendVT, X);
50363 Y = DAG.getBitcast(BlendVT, Y);
50364 Mask = DAG.getBitcast(BlendVT, Mask);
50365 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50366 return DAG.getBitcast(VT, Mask);
50367}
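
Illustrative aside: a scalar model of the conditional-negate special case mentioned above, assuming the mask is all-zeros or all-ones per element. The function names are invented for illustration.

    #include <cassert>
    #include <cstdint>

    // With m being all-zeros or all-ones, (m & -x) | (~m & x) selects between
    // -x and x, and (x ^ m) - m computes the same thing without a blend.
    int32_t blend_form(int32_t m, int32_t x)   { return (m & -x) | (~m & x); }
    int32_t xor_sub_form(int32_t m, int32_t x) { return (x ^ m) - m; }

    int main() {
      for (int32_t x : {0, 1, -7, 12345}) {
        assert(blend_form(0, x) == xor_sub_form(0, x));
        assert(blend_form(-1, x) == xor_sub_form(-1, x));
      }
      return 0;
    }
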
50368
50369// Helper function for combineOrCmpEqZeroToCtlzSrl
50370// Transforms:
50371// seteq(cmp x, 0)
50372// into:
50373// srl(ctlz x), log2(bitsize(x))
50374// Input pattern is checked by caller.
50375static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50376 SDValue Cmp = Op.getOperand(1);
50377 EVT VT = Cmp.getOperand(0).getValueType();
50378 unsigned Log2b = Log2_32(VT.getSizeInBits());
50379 SDLoc dl(Op);
50380 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50381 // The result of the shift is true or false, and on X86, the 32-bit
50382 // encoding of shr and lzcnt is more desirable.
50383 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50384 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50385 DAG.getConstant(Log2b, dl, MVT::i8));
50386 return Scc;
50387}
50388
50389// Try to transform:
50390// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50391// into:
50392 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50393// Will also attempt to match more generic cases, eg:
50394// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50395// Only applies if the target supports the FastLZCNT feature.
50396static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50397 TargetLowering::DAGCombinerInfo &DCI,
50398 const X86Subtarget &Subtarget) {
50399 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50400 return SDValue();
50401
50402 auto isORCandidate = [](SDValue N) {
50403 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50404 };
50405
50406 // Check the zero extend is extending to 32-bit or more. The code generated by
50407 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50408 // instructions to clear the upper bits.
50409 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50410 !isORCandidate(N->getOperand(0)))
50411 return SDValue();
50412
50413 // Check the node matches: setcc(eq, cmp 0)
50414 auto isSetCCCandidate = [](SDValue N) {
50415 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50416 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50417 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50418 isNullConstant(N->getOperand(1).getOperand(1)) &&
50419 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50420 };
50421
50422 SDNode *OR = N->getOperand(0).getNode();
50423 SDValue LHS = OR->getOperand(0);
50424 SDValue RHS = OR->getOperand(1);
50425
50426 // Save nodes matching or(or, setcc(eq, cmp 0)).
50427 SmallVector<SDNode *, 2> ORNodes;
50428 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50429 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50430 ORNodes.push_back(OR);
50431 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50432 LHS = OR->getOperand(0);
50433 RHS = OR->getOperand(1);
50434 }
50435
50436 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50437 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50438 !isORCandidate(SDValue(OR, 0)))
50439 return SDValue();
50440
50441 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
50442 // to
50443 // or(srl(ctlz),srl(ctlz)).
50444 // The dag combiner can then fold it into:
50445 // srl(or(ctlz, ctlz)).
50446 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50447 SDValue Ret, NewRHS;
50448 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50449 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50450
50451 if (!Ret)
50452 return SDValue();
50453
50454 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50455 while (!ORNodes.empty()) {
50456 OR = ORNodes.pop_back_val();
50457 LHS = OR->getOperand(0);
50458 RHS = OR->getOperand(1);
50459 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50460 if (RHS->getOpcode() == ISD::OR)
50461 std::swap(LHS, RHS);
50462 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50463 if (!NewRHS)
50464 return SDValue();
50465 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50466 }
50467
50468 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50469}
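
Illustrative aside: why the ctlz/srl rewrite works for 32-bit values: lzcnt(x) == 32 exactly when x == 0, so bit 5 of the leading-zero count is the zero test, and OR-ing several counts lets the tests share one shift. A sketch using C++20's std::countl_zero; the real combine additionally requires a FastLZCNT subtarget, and the names below are invented.

    #include <bit>
    #include <cassert>
    #include <cstdint>

    bool either_is_zero_setcc(uint32_t x, uint32_t y) { return x == 0 || y == 0; }
    bool either_is_zero_lzcnt(uint32_t x, uint32_t y) {
      unsigned lx = std::countl_zero(x); // 32 iff x == 0
      unsigned ly = std::countl_zero(y); // 32 iff y == 0
      return ((lx | ly) >> 5) != 0;      // bit 5 set iff either count is 32
    }

    int main() {
      for (uint32_t x : {0u, 1u, 0x80000000u})
        for (uint32_t y : {0u, 7u})
          assert(either_is_zero_setcc(x, y) == either_is_zero_lzcnt(x, y));
      return 0;
    }
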
50470
50471static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50472 SDValue And1_L, SDValue And1_R,
50473 const SDLoc &DL, SelectionDAG &DAG) {
50474 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50475 return SDValue();
50476 SDValue NotOp = And0_L->getOperand(0);
50477 if (NotOp == And1_R)
50478 std::swap(And1_R, And1_L);
50479 if (NotOp != And1_L)
50480 return SDValue();
50481
50482 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50483 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50484 EVT VT = And1_L->getValueType(0);
50485 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50486 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50487 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50488 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50489 return Xor1;
50490}
50491
50492/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50493 /// equivalent `((x ^ y) & m) ^ y` pattern.
50494/// This is typically a better representation for targets without a fused
50495/// "and-not" operation. This function is intended to be called from a
50496/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
50497static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50498 // Note that masked-merge variants using XOR or ADD expressions are
50499 // normalized to OR by InstCombine so we only check for OR.
50500 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50501 SDValue N0 = Node->getOperand(0);
50502 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50503 return SDValue();
50504 SDValue N1 = Node->getOperand(1);
50505 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50506 return SDValue();
50507
50508 SDLoc DL(Node);
50509 SDValue N00 = N0->getOperand(0);
50510 SDValue N01 = N0->getOperand(1);
50511 SDValue N10 = N1->getOperand(0);
50512 SDValue N11 = N1->getOperand(1);
50513 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50514 return Result;
50515 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50516 return Result;
50517 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50518 return Result;
50519 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50520 return Result;
50521 return SDValue();
50522}
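
Illustrative aside: a quick check of the masked-merge identity used above, (m & x) | (~m & y) == ((x ^ y) & m) ^ y. The function names are invented.

    #include <cassert>
    #include <cstdint>

    uint32_t merge_andnot(uint32_t m, uint32_t x, uint32_t y) {
      return (m & x) | (~m & y);          // needs an and-not
    }
    uint32_t merge_xor(uint32_t m, uint32_t x, uint32_t y) {
      return ((x ^ y) & m) ^ y;           // plain AND/XOR only
    }

    int main() {
      assert(merge_andnot(0xF0F0F0F0u, 0x12345678u, 0x9ABCDEF0u) ==
             merge_xor(0xF0F0F0F0u, 0x12345678u, 0x9ABCDEF0u));
      return 0;
    }
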
50523
50524/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50525/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50526/// with CMP+{ADC, SBB}.
50527/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
50528static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50529 SDValue X, SDValue Y,
50530 SelectionDAG &DAG,
50531 bool ZeroSecondOpOnly = false) {
50532 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50533 return SDValue();
50534
50535 // Look through a one-use zext.
50536 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50537 Y = Y.getOperand(0);
50538
50539 X86::CondCode CC;
50540 SDValue EFLAGS;
50541 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50542 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50543 EFLAGS = Y.getOperand(1);
50544 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50545 Y.hasOneUse()) {
50546 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50547 }
50548
50549 if (!EFLAGS)
50550 return SDValue();
50551
50552 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50553 // the general case below.
50554 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50555 if (ConstantX && !ZeroSecondOpOnly) {
50556 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50557 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50558 // This is a complicated way to get -1 or 0 from the carry flag:
50559 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50560 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50561 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50562 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50563 EFLAGS);
50564 }
50565
50566 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50567 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50568 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50569 EFLAGS.getValueType().isInteger() &&
50570 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50571 // Swap the operands of a SUB, and we have the same pattern as above.
50572 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50573 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50574 SDValue NewSub = DAG.getNode(
50575 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50576 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50577 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50578 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50579 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50580 NewEFLAGS);
50581 }
50582 }
50583 }
50584
50585 if (CC == X86::COND_B) {
50586 // X + SETB Z --> adc X, 0
50587 // X - SETB Z --> sbb X, 0
50588 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50589 DAG.getVTList(VT, MVT::i32), X,
50590 DAG.getConstant(0, DL, VT), EFLAGS);
50591 }
50592
50593 if (ZeroSecondOpOnly)
50594 return SDValue();
50595
50596 if (CC == X86::COND_A) {
50597 // Try to convert COND_A into COND_B in an attempt to facilitate
50598 // materializing "setb reg".
50599 //
50600 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
50601 // cannot take an immediate as its first operand.
50602 //
50603 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50604 EFLAGS.getValueType().isInteger() &&
50605 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50606 SDValue NewSub =
50607 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50608 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50609 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50610 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50611 DAG.getVTList(VT, MVT::i32), X,
50612 DAG.getConstant(0, DL, VT), NewEFLAGS);
50613 }
50614 }
50615
50616 if (CC == X86::COND_AE) {
50617 // X + SETAE --> sbb X, -1
50618 // X - SETAE --> adc X, -1
50619 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50620 DAG.getVTList(VT, MVT::i32), X,
50621 DAG.getConstant(-1, DL, VT), EFLAGS);
50622 }
50623
50624 if (CC == X86::COND_BE) {
50625 // X + SETBE --> sbb X, -1
50626 // X - SETBE --> adc X, -1
50627 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50628 // materializing "setae reg".
50629 //
50630 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50631 // cannot take an immediate as its first operand.
50632 //
50633 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50634 EFLAGS.getValueType().isInteger() &&
50635 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50636 SDValue NewSub =
50637 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50638 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50639 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50640 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50641 DAG.getVTList(VT, MVT::i32), X,
50642 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50643 }
50644 }
50645
50646 if (CC != X86::COND_E && CC != X86::COND_NE)
50647 return SDValue();
50648
50649 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50650 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50651 !EFLAGS.getOperand(0).getValueType().isInteger())
50652 return SDValue();
50653
50654 SDValue Z = EFLAGS.getOperand(0);
50655 EVT ZVT = Z.getValueType();
50656
50657 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50658 // the general case below.
50659 if (ConstantX) {
50660 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50661 // fake operands:
50662 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50663 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50664 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50665 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50666 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50667 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50668 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50669 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50670 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50671 SDValue(Neg.getNode(), 1));
50672 }
50673
50674 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50675 // with fake operands:
50676 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50677 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50678 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50679 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50680 SDValue One = DAG.getConstant(1, DL, ZVT);
50681 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50682 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50683 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50684 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50685 Cmp1.getValue(1));
50686 }
50687 }
50688
50689 // (cmp Z, 1) sets the carry flag if Z is 0.
50690 SDValue One = DAG.getConstant(1, DL, ZVT);
50691 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50692 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50693
50694 // Add the flags type for ADC/SBB nodes.
50695 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50696
50697 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50698 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50699 if (CC == X86::COND_NE)
50700 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50701 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50702
50703 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50704 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50705 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50706 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50707}
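
Illustrative aside: source-level shapes that feed this ADC/SBB combine, as a sketch only; whether CMP+ADC/SBB or SETCC_CARRY is actually selected depends on the conditions above, and the names are invented.

    #include <cassert>
    #include <cstdint>

    // 'x + (a < b)' can become CMP + ADC x, 0 (the compare result is the carry),
    // and '-1 + (a >= b)' is the carry flag materialized as 0 or -1 (SBB).
    uint64_t add_carry(uint64_t x, uint32_t a, uint32_t b) { return x + (a < b); }
    int32_t carry_to_mask(uint32_t a, uint32_t b) { return -1 + (a >= b); }

    int main() {
      assert(add_carry(10, 1, 2) == 11);
      assert(add_carry(10, 2, 1) == 10);
      assert(carry_to_mask(1, 2) == -1); // carry set  -> -1
      assert(carry_to_mask(2, 1) == 0);  // carry clear -> 0
      return 0;
    }
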
50708
50709/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50710/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50711/// with CMP+{ADC, SBB}.
50712static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50713 bool IsSub = N->getOpcode() == ISD::SUB;
50714 SDValue X = N->getOperand(0);
50715 SDValue Y = N->getOperand(1);
50716 EVT VT = N->getValueType(0);
50717 SDLoc DL(N);
50718
50719 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50720 return ADCOrSBB;
50721
50722 // Commute and try again (negate the result for subtracts).
50723 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50724 if (IsSub)
50725 ADCOrSBB =
50726 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50727 return ADCOrSBB;
50728 }
50729
50730 return SDValue();
50731}
50732
50733static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50734 SelectionDAG &DAG) {
50735 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50736 "Unexpected opcode");
50737
50738 // Delegate to combineAddOrSubToADCOrSBB if we have:
50739 //
50740 // (xor/or (zero_extend (setcc)) imm)
50741 //
50742 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50743 // equivalent to a SUB/ADD, respectively.
50744 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50745 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50746 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50747 bool IsSub = N->getOpcode() == ISD::XOR;
50748 bool N1COdd = N1C->getZExtValue() & 1;
50749 if (IsSub ? N1COdd : !N1COdd) {
50750 SDLoc DL(N);
50751 EVT VT = N->getValueType(0);
50752 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50753 return R;
50754 }
50755 }
50756 }
50757
50758 return SDValue();
50759}
50760
50761static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50762 TargetLowering::DAGCombinerInfo &DCI,
50763 const X86Subtarget &Subtarget) {
50764 SDValue N0 = N->getOperand(0);
50765 SDValue N1 = N->getOperand(1);
50766 EVT VT = N->getValueType(0);
50767 SDLoc dl(N);
50768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50769
50770 // If this is SSE1 only convert to FOR to avoid scalarization.
50771 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50772 return DAG.getBitcast(MVT::v4i32,
50773 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50774 DAG.getBitcast(MVT::v4f32, N0),
50775 DAG.getBitcast(MVT::v4f32, N1)));
50776 }
50777
50778 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50779 // TODO: Support multiple SrcOps.
50780 if (VT == MVT::i1) {
50781 SmallVector<SDValue, 2> SrcOps;
50782 SmallVector<APInt, 2> SrcPartials;
50783 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50784 SrcOps.size() == 1) {
50785 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50786 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50787 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50788 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50789 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50790 if (Mask) {
50791 assert(SrcPartials[0].getBitWidth() == NumElts &&
50792 "Unexpected partial reduction mask");
50793 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50794 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50795 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50796 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50797 }
50798 }
50799 }
50800
50801 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50802 return R;
50803
50804 if (SDValue R = combineBitOpWithShift(N, DAG))
50805 return R;
50806
50807 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50808 return FPLogic;
50809
50810 if (DCI.isBeforeLegalizeOps())
50811 return SDValue();
50812
50813 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50814 return R;
50815
50816 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50817 return R;
50818
50819 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50820 return R;
50821
50822 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50823 if ((VT == MVT::i32 || VT == MVT::i64) &&
50824 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50825 isNullConstant(N0.getOperand(0))) {
50826 SDValue Cond = N0.getOperand(1);
50827 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50828 Cond = Cond.getOperand(0);
50829
50830 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50831 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50832 uint64_t Val = CN->getZExtValue();
50833 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50834 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50835 CCode = X86::GetOppositeBranchCondition(CCode);
50836 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50837
50838 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50839 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50840 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50841 return R;
50842 }
50843 }
50844 }
50845 }
50846
50847 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50848 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50849 // iff the upper elements of the non-shifted arg are zero.
50850 // KUNPCK requires 16+ bool vector elements.
50851 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50852 unsigned NumElts = VT.getVectorNumElements();
50853 unsigned HalfElts = NumElts / 2;
50854 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50855 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50856 N1.getConstantOperandAPInt(1) == HalfElts &&
50857 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50858 return DAG.getNode(
50859 ISD::CONCAT_VECTORS, dl, VT,
50860 extractSubVector(N0, 0, DAG, dl, HalfElts),
50861 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50862 }
50863 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50864 N0.getConstantOperandAPInt(1) == HalfElts &&
50865 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50866 return DAG.getNode(
50867 ISD::CONCAT_VECTORS, dl, VT,
50868 extractSubVector(N1, 0, DAG, dl, HalfElts),
50869 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50870 }
50871 }
50872
50873 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50874 // Attempt to recursively combine an OR of shuffles.
50875 SDValue Op(N, 0);
50876 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50877 return Res;
50878
50879 // If either operand is a constant mask, then only the elements that aren't
50880 // allones are actually demanded by the other operand.
50881 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50882 APInt UndefElts;
50883 SmallVector<APInt> EltBits;
50884 int NumElts = VT.getVectorNumElements();
50885 int EltSizeInBits = VT.getScalarSizeInBits();
50886 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50887 return false;
50888
50889 APInt DemandedElts = APInt::getZero(NumElts);
50890 for (int I = 0; I != NumElts; ++I)
50891 if (!EltBits[I].isAllOnes())
50892 DemandedElts.setBit(I);
50893
50894 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50895 };
50896 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50897 if (N->getOpcode() != ISD::DELETED_NODE)
50898 DCI.AddToWorklist(N);
50899 return SDValue(N, 0);
50900 }
50901 }
50902
50903 // We should fold "masked merge" patterns when `andn` is not available.
50904 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50905 if (SDValue R = foldMaskedMerge(N, DAG))
50906 return R;
50907
50908 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50909 return R;
50910
50911 return SDValue();
50912}
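
Illustrative aside: a worked instance of the (0 - SetCC) | C fold above with C = 4 (one of the allowed constants), showing the algebraic identity behind the LEA-friendly form. The function names are invented.

    #include <cassert>
    #include <cstdint>

    // With b = (a < 10) being 0 or 1: (0 - b) | 4 equals (!b) * 5 - 1, which the
    // backend can build with an LEA instead of an OR against a 0/-1 value.
    int64_t or_form(int64_t a)  { int64_t b = (a < 10); return (0 - b) | 4; }
    int64_t lea_form(int64_t a) { int64_t nb = !(a < 10); return nb * 5 - 1; }

    int main() {
      assert(or_form(3) == lea_form(3));   // both -1
      assert(or_form(42) == lea_form(42)); // both 4
      return 0;
    }
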
50913
50914/// Try to turn tests against the signbit in the form of:
50915/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50916/// into:
50917/// SETGT(X, -1)
50918static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50919 // This is only worth doing if the output type is i8 or i1.
50920 EVT ResultType = N->getValueType(0);
50921 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50922 return SDValue();
50923
50924 SDValue N0 = N->getOperand(0);
50925 SDValue N1 = N->getOperand(1);
50926
50927 // We should be performing an xor against a truncated shift.
50928 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50929 return SDValue();
50930
50931 // Make sure we are performing an xor against one.
50932 if (!isOneConstant(N1))
50933 return SDValue();
50934
50935 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50936 SDValue Shift = N0.getOperand(0);
50937 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50938 return SDValue();
50939
50940 // Make sure we are truncating from one of i16, i32 or i64.
50941 EVT ShiftTy = Shift.getValueType();
50942 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50943 return SDValue();
50944
50945 // Make sure the shift amount extracts the sign bit.
50946 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50947 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50948 return SDValue();
50949
50950 // Create a greater-than comparison against -1.
50951 // N.B. Using SETGE against 0 works but we want a canonical-looking
50952 // comparison; using SETGT matches up with what TranslateX86CC does.
50953 SDLoc DL(N);
50954 SDValue ShiftOp = Shift.getOperand(0);
50955 EVT ShiftOpTy = ShiftOp.getValueType();
50956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50957 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50958 *DAG.getContext(), ResultType);
50959 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50960 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50961 if (SetCCResultType != ResultType)
50962 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50963 return Cond;
50964}
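As a rough scalar model of the fold this helper performs (assuming a 32-bit X; function names are illustrative only), both forms below test that the sign bit is clear:

  #include <cstdint>

  // The matched pattern: logically shift the sign bit down, truncate, XOR with 1.
  bool signClearShiftXor(int32_t X) {
    return ((static_cast<uint32_t>(X) >> 31) ^ 1u) != 0;
  }

  // The folded form: a single signed greater-than compare against -1 (SETGT).
  bool signClearSetGT(int32_t X) {
    return X > -1;
  }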
50965
50966/// Turn vector tests of the signbit in the form of:
50967/// xor (sra X, elt_size(X)-1), -1
50968/// into:
50969/// pcmpgt X, -1
50970///
50971/// This should be called before type legalization because the pattern may not
50972/// persist after that.
50973static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50974 const X86Subtarget &Subtarget) {
50975 EVT VT = N->getValueType(0);
50976 if (!VT.isSimple())
50977 return SDValue();
50978
50979 switch (VT.getSimpleVT().SimpleTy) {
50980 default: return SDValue();
50981 case MVT::v16i8:
50982 case MVT::v8i16:
50983 case MVT::v4i32:
50984 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50985 case MVT::v32i8:
50986 case MVT::v16i16:
50987 case MVT::v8i32:
50988 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50989 }
50990
50991 // There must be a shift right algebraic before the xor, and the xor must be a
50992 // 'not' operation.
50993 SDValue Shift = N->getOperand(0);
50994 SDValue Ones = N->getOperand(1);
50995 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50996 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50997 return SDValue();
50998
50999 // The shift should be smearing the sign bit across each vector element.
51000 auto *ShiftAmt =
51001 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
51002 if (!ShiftAmt ||
51003 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
51004 return SDValue();
51005
51006 // Create a greater-than comparison against -1. We don't use the more obvious
51007 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
51008 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
51009}
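For the v4i32 case, a small SSE2 intrinsics sketch of the equivalence (illustrative only; the real combine rewrites SelectionDAG nodes, not intrinsics):

  #include <emmintrin.h>

  // xor(sra(X, 31), -1): all-ones in each lane whose element is non-negative.
  __m128i signTestSraNot(__m128i X) {
    return _mm_xor_si128(_mm_srai_epi32(X, 31), _mm_set1_epi32(-1));
  }

  // The folded form: pcmpgtd X, -1 produces the same per-lane mask directly.
  __m128i signTestPcmpgt(__m128i X) {
    return _mm_cmpgt_epi32(X, _mm_set1_epi32(-1));
  }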
51010
51011/// Detect patterns of truncation with unsigned saturation:
51012///
51013/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
51014/// Return the source value x to be truncated or SDValue() if the pattern was
51015/// not matched.
51016///
51017/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
51018/// where C1 >= 0 and C2 is unsigned max of destination type.
51019///
51020/// (truncate (smax (smin (x, C2), C1)) to dest_type)
51021/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
51022///
51023/// These two patterns are equivalent to:
51024/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
51025/// So return the smax(x, C1) value to be truncated or SDValue() if the
51026/// pattern was not matched.
51027static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51028 const SDLoc &DL) {
51029 EVT InVT = In.getValueType();
51030
51031 // Saturation with truncation. We truncate from InVT to VT.
51032 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
51033 "Unexpected types for truncate operation");
51034
51035 // Match min/max and return limit value as a parameter.
51036 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51037 if (V.getOpcode() == Opcode &&
51038 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
51039 return V.getOperand(0);
51040 return SDValue();
51041 };
51042
51043 APInt C1, C2;
51044 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51045 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
51046 // to the element size of the destination type.
51047 if (C2.isMask(VT.getScalarSizeInBits()))
51048 return UMin;
51049
51050 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51051 if (MatchMinMax(SMin, ISD::SMAX, C1))
51052 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51053 return SMin;
51054
51055 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51056 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51057 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51058 C2.uge(C1)) {
51059 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51060 }
51061
51062 return SDValue();
51063}
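A scalar sketch of the two shapes this matcher accepts, here for a 32-bit source and an 8-bit destination with C1 = 0 (names are illustrative):

  #include <algorithm>
  #include <cstdint>

  // Pattern 1: umin against the destination's unsigned max, then truncate.
  uint8_t truncUSatUMin(uint32_t x) {
    return static_cast<uint8_t>(std::min<uint32_t>(x, 255u));
  }

  // Pattern 2: smin/smax clamp of a signed value to [C1, 255] with C1 >= 0,
  // then truncate; equivalent to truncating umin(smax(x, C1), 255).
  uint8_t truncUSatSMinSMax(int32_t x) {
    return static_cast<uint8_t>(std::min(std::max(x, 0), 255));
  }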
51064
51065/// Detect patterns of truncation with signed saturation:
51066/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51067/// signed_max_of_dest_type)) to dest_type)
51068/// or:
51069/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51070/// signed_min_of_dest_type)) to dest_type).
51071/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51072/// Return the source value to be truncated or SDValue() if the pattern was not
51073/// matched.
51074static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51075 unsigned NumDstBits = VT.getScalarSizeInBits();
51076 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51077 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51078
51079 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51080 const APInt &Limit) -> SDValue {
51081 APInt C;
51082 if (V.getOpcode() == Opcode &&
51083 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51084 return V.getOperand(0);
51085 return SDValue();
51086 };
51087
51088 APInt SignedMax, SignedMin;
51089 if (MatchPackUS) {
51090 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51091 SignedMin = APInt(NumSrcBits, 0);
51092 } else {
51093 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51094 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51095 }
51096
51097 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51098 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51099 return SMax;
51100
51101 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51102 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51103 return SMin;
51104
51105 return SDValue();
51106}
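A scalar sketch of the same matching for a 32-bit source and an 8-bit destination (illustrative only, assuming C++17 for std::clamp); with MatchPackUS the clamp range becomes [0, 255] instead of [-128, 127]:

  #include <algorithm>
  #include <cstdint>

  // Signed saturation: clamp to the destination's signed range, then truncate.
  int8_t truncSSat(int32_t x) {
    return static_cast<int8_t>(std::clamp(x, -128, 127));
  }

  // The MatchPackUS variant: clamp to [0, unsigned max of the destination].
  uint8_t truncSSatPackUS(int32_t x) {
    return static_cast<uint8_t>(std::clamp(x, 0, 255));
  }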
51107
51108static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51109 SelectionDAG &DAG,
51110 const X86Subtarget &Subtarget) {
51111 if (!Subtarget.hasSSE2() || !VT.isVector())
51112 return SDValue();
51113
51114 EVT SVT = VT.getVectorElementType();
51115 EVT InVT = In.getValueType();
51116 EVT InSVT = InVT.getVectorElementType();
51117
51118 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector
51119 // is split across two registers, we can use a packusdw+perm to clamp to
51120 // 0-65535 and concatenate at the same time. Then we can use a final
51121 // vpmovuswb to clip to 0-255.
51122 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51123 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51124 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51125 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51126 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51127 DL, DAG, Subtarget);
51128 assert(Mid && "Failed to pack!");
51129 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51130 }
51131 }
51132
51133 // vXi32 truncate instructions are available with AVX512F.
51134 // vXi16 truncate instructions are only available with AVX512BW.
51135 // For 256-bit or smaller vectors, we require VLX.
51136 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51137 // If the result type is 256 bits or larger and we have disabled 512-bit
51138 // registers, we should go ahead and use the pack instructions if possible.
51139 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51140 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51141 (InVT.getSizeInBits() > 128) &&
51142 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51143 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51144
51145 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51146 VT.getSizeInBits() >= 64 &&
51147 (SVT == MVT::i8 || SVT == MVT::i16) &&
51148 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51149 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51150 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51151 // Only do this when the result is at least 64 bits or we'll leave
51152 // dangling PACKSSDW nodes.
51153 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51154 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51155 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51156 DAG, Subtarget);
51157 assert(Mid && "Failed to pack!");
51158 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51159 Subtarget);
51160 assert(V && "Failed to pack!");
51161 return V;
51162 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51163 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51164 Subtarget);
51165 }
51166 if (SDValue SSatVal = detectSSatPattern(In, VT))
51167 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51168 Subtarget);
51169 }
51170
51171 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51172 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51173 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51174 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51175 unsigned TruncOpc = 0;
51176 SDValue SatVal;
51177 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51178 SatVal = SSatVal;
51179 TruncOpc = X86ISD::VTRUNCS;
51180 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51181 SatVal = USatVal;
51182 TruncOpc = X86ISD::VTRUNCUS;
51183 }
51184 if (SatVal) {
51185 unsigned ResElts = VT.getVectorNumElements();
51186 // If the input type is less than 512 bits and we don't have VLX, we need
51187 // to widen to 512 bits.
51188 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51189 unsigned NumConcats = 512 / InVT.getSizeInBits();
51190 ResElts *= NumConcats;
51191 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51192 ConcatOps[0] = SatVal;
51193 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51194 NumConcats * InVT.getVectorNumElements());
51195 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51196 }
51197 // Widen the result if it's narrower than 128 bits.
51198 if (ResElts * SVT.getSizeInBits() < 128)
51199 ResElts = 128 / SVT.getSizeInBits();
51200 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51201 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51202 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51203 DAG.getIntPtrConstant(0, DL));
51204 }
51205 }
51206
51207 return SDValue();
51208}
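The PACKUSWB(PACKSSDW,...) route used above can be modelled per element as two saturating narrowing steps; since [0, 255] lies inside the signed i16 range, the composition clamps an i32 value to [0, 255] exactly as the matched pattern requires (a sketch with illustrative names, assuming C++17 for std::clamp):

  #include <algorithm>
  #include <cstdint>

  // PACKSSDW per element: i32 -> i16 with signed saturation.
  int16_t packssdwElt(int32_t x) {
    return static_cast<int16_t>(std::clamp(x, -32768, 32767));
  }

  // PACKUSWB per element: i16 -> i8 with unsigned saturation.
  uint8_t packuswbElt(int16_t x) {
    return static_cast<uint8_t>(std::clamp<int>(x, 0, 255));
  }

  // Composition: for any i32 input this equals std::clamp(x, 0, 255).
  uint8_t i32ToU8Sat(int32_t x) {
    return packuswbElt(packssdwElt(x));
  }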
51209
51210/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51211 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51212/// ISD::AVGCEILU (AVG) instruction.
51213static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51214 const X86Subtarget &Subtarget,
51215 const SDLoc &DL) {
51216 if (!VT.isVector())
51217 return SDValue();
51218 EVT InVT = In.getValueType();
51219 unsigned NumElems = VT.getVectorNumElements();
51220
51221 EVT ScalarVT = VT.getVectorElementType();
51222 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51223 return SDValue();
51224
51225 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
51226 // than the original input type (i8/i16).
51227 EVT InScalarVT = InVT.getVectorElementType();
51228 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51229 return SDValue();
51230
51231 if (!Subtarget.hasSSE2())
51232 return SDValue();
51233
51234 // Detect the following pattern:
51235 //
51236 // %1 = zext <N x i8> %a to <N x i32>
51237 // %2 = zext <N x i8> %b to <N x i32>
51238 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51239 // %4 = add nuw nsw <N x i32> %3, %2
51240 // %5 = lshr <N x i32> %4, <i32 1 x N>
51241 // %6 = trunc <N x i32> %5 to <N x i8>
51242 //
51243 // In AVX512, the last instruction can also be a trunc store.
51244 if (In.getOpcode() != ISD::SRL)
51245 return SDValue();
51246
51247 // A lambda checking the given SDValue is a constant vector and each element
51248 // is in the range [Min, Max].
51249 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51250 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51251 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51252 });
51253 };
51254
51255 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51256 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51257 return MaxActiveBits <= ScalarVT.getSizeInBits();
51258 };
51259
51260 // Check if each element of the vector is right-shifted by one.
51261 SDValue LHS = In.getOperand(0);
51262 SDValue RHS = In.getOperand(1);
51263 if (!IsConstVectorInRange(RHS, 1, 1))
51264 return SDValue();
51265 if (LHS.getOpcode() != ISD::ADD)
51266 return SDValue();
51267
51268 // Detect a pattern of a + b + 1 where the order doesn't matter.
51269 SDValue Operands[3];
51270 Operands[0] = LHS.getOperand(0);
51271 Operands[1] = LHS.getOperand(1);
51272
51273 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51274 ArrayRef<SDValue> Ops) {
51275 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51276 };
51277
51278 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51279 for (SDValue &Op : Ops)
51280 if (Op.getValueType() != VT)
51281 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51282 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51283 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51284 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51285 if (NumElemsPow2 != NumElems) {
51286 for (SDValue &Op : Ops) {
51287 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51288 for (unsigned i = 0; i != NumElems; ++i) {
51289 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51290 EltsOfOp[i] =
51291 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51292 }
51293 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51294 }
51295 }
51296 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51297 if (NumElemsPow2 == NumElems)
51298 return Res;
51299 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51300 DAG.getIntPtrConstant(0, DL));
51301 };
51302
51303 // Take care of the case when one of the operands is a constant vector whose
51304 // element is in the range [1, 256].
51305 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51306 IsZExtLike(Operands[0])) {
51307 // The pattern is detected. Subtract one from the constant vector, then
51308 // demote it and emit the AVG (ISD::AVGCEILU) node.
51309 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51310 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51311 return AVGSplitter({Operands[0], Operands[1]});
51312 }
51313
51314 // Matches 'add like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
51315 // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
51316 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51317 if (ISD::ADD == V.getOpcode()) {
51318 Op0 = V.getOperand(0);
51319 Op1 = V.getOperand(1);
51320 return true;
51321 }
51322 if (ISD::ZERO_EXTEND != V.getOpcode())
51323 return false;
51324 V = V.getOperand(0);
51325 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51326 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51327 return false;
51328 Op0 = V.getOperand(0);
51329 Op1 = V.getOperand(1);
51330 return true;
51331 };
51332
51333 SDValue Op0, Op1;
51334 if (FindAddLike(Operands[0], Op0, Op1))
51335 std::swap(Operands[0], Operands[1]);
51336 else if (!FindAddLike(Operands[1], Op0, Op1))
51337 return SDValue();
51338 Operands[2] = Op0;
51339 Operands[1] = Op1;
51340
51341 // Now we have three operands of two additions. Check that one of them is a
51342 // constant vector with ones, and the other two can be promoted from i8/i16.
51343 for (SDValue &Op : Operands) {
51344 if (!IsConstVectorInRange(Op, 1, 1))
51345 continue;
51346 std::swap(Op, Operands[2]);
51347
51348 // Check if Operands[0] and Operands[1] are results of type promotion.
51349 for (int j = 0; j < 2; ++j)
51350 if (Operands[j].getValueType() != VT)
51351 if (!IsZExtLike(Operands[j]))
51352 return SDValue();
51353
51354 // The pattern is detected, emit the AVG (ISD::AVGCEILU) node(s).
51355 return AVGSplitter({Operands[0], Operands[1]});
51356 }
51357
51358 return SDValue();
51359}
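A scalar i8 sketch of the rounding-average pattern the function recognizes (illustrative name); the widened add-one-and-shift form below is exactly what ISD::AVGCEILU computes in a single node:

  #include <cstdint>

  // The matched shape: zero-extend, add one, add the other operand, logical
  // shift right by one, truncate back to i8.
  uint8_t avgCeilWidened(uint8_t a, uint8_t b) {
    uint32_t wa = a, wb = b;                         // zext to i32
    return static_cast<uint8_t>((wa + wb + 1) >> 1); // add, add 1, lshr, trunc
  }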
51360
51361static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51362 TargetLowering::DAGCombinerInfo &DCI,
51363 const X86Subtarget &Subtarget) {
51364 LoadSDNode *Ld = cast<LoadSDNode>(N);
51365 EVT RegVT = Ld->getValueType(0);
51366 EVT MemVT = Ld->getMemoryVT();
51367 SDLoc dl(Ld);
51368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51369
51370 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51371 // into two 16-byte operations. Also split non-temporal aligned loads on
51372 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51373 ISD::LoadExtType Ext = Ld->getExtensionType();
51374 unsigned Fast;
51375 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51376 Ext == ISD::NON_EXTLOAD &&
51377 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51378 Ld->getAlign() >= Align(16)) ||
51379 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51380 *Ld->getMemOperand(), &Fast) &&
51381 !Fast))) {
51382 unsigned NumElems = RegVT.getVectorNumElements();
51383 if (NumElems < 2)
51384 return SDValue();
51385
51386 unsigned HalfOffset = 16;
51387 SDValue Ptr1 = Ld->getBasePtr();
51388 SDValue Ptr2 =
51389 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51390 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51391 NumElems / 2);
51392 SDValue Load1 =
51393 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51394 Ld->getOriginalAlign(),
51395 Ld->getMemOperand()->getFlags());
51396 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51397 Ld->getPointerInfo().getWithOffset(HalfOffset),
51398 Ld->getOriginalAlign(),
51399 Ld->getMemOperand()->getFlags());
51400 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51401 Load1.getValue(1), Load2.getValue(1));
51402
51403 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51404 return DCI.CombineTo(N, NewVec, TF, true);
51405 }
51406
51407 // Bool vector load - attempt to cast to an integer, as we have good
51408 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51409 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51410 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51411 unsigned NumElts = RegVT.getVectorNumElements();
51412 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51413 if (TLI.isTypeLegal(IntVT)) {
51414 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51415 Ld->getPointerInfo(),
51416 Ld->getOriginalAlign(),
51417 Ld->getMemOperand()->getFlags());
51418 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51419 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51420 }
51421 }
51422
51423 // If we also broadcast this as a subvector to a wider type, then just extract
51424 // the lowest subvector.
51425 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51426 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51427 SDValue Ptr = Ld->getBasePtr();
51428 SDValue Chain = Ld->getChain();
51429 for (SDNode *User : Ptr->uses()) {
51430 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51431 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51432 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51433 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51434 MemVT.getSizeInBits() &&
51435 !User->hasAnyUseOfValue(1) &&
51436 User->getValueSizeInBits(0).getFixedValue() >
51437 RegVT.getFixedSizeInBits()) {
51438 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51439 RegVT.getSizeInBits());
51440 Extract = DAG.getBitcast(RegVT, Extract);
51441 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51442 }
51443 }
51444 }
51445
51446 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51447 unsigned AddrSpace = Ld->getAddressSpace();
51448 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51449 AddrSpace == X86AS::PTR32_UPTR) {
51450 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51451 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51452 SDValue Cast =
51453 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51454 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51455 Ld->getOriginalAlign(),
51456 Ld->getMemOperand()->getFlags());
51457 }
51458 }
51459
51460 return SDValue();
51461}
51462
51463/// If V is a build vector of boolean constants and exactly one of those
51464/// constants is true, return the operand index of that true element.
51465/// Otherwise, return -1.
51466static int getOneTrueElt(SDValue V) {
51467 // This needs to be a build vector of booleans.
51468 // TODO: Checking for the i1 type matches the IR definition for the mask,
51469 // but the mask check could be loosened to i8 or other types. That might
51470 // also require checking more than 'allOnesValue'; eg, the x86 HW
51471 // instructions only require that the MSB is set for each mask element.
51472 // The ISD::MSTORE comments/definition do not specify how the mask operand
51473 // is formatted.
51474 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51475 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51476 return -1;
51477
51478 int TrueIndex = -1;
51479 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51480 for (unsigned i = 0; i < NumElts; ++i) {
51481 const SDValue &Op = BV->getOperand(i);
51482 if (Op.isUndef())
51483 continue;
51484 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51485 if (!ConstNode)
51486 return -1;
51487 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51488 // If we already found a one, this is too many.
51489 if (TrueIndex >= 0)
51490 return -1;
51491 TrueIndex = i;
51492 }
51493 }
51494 return TrueIndex;
51495}
51496
51497/// Given a masked memory load/store operation, return true if it has one mask
51498/// bit set. If it has one mask bit set, then also return the memory address of
51499/// the scalar element to load/store, the vector index to insert/extract that
51500/// scalar element, and the alignment for the scalar memory access.
51501static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51502 SelectionDAG &DAG, SDValue &Addr,
51503 SDValue &Index, Align &Alignment,
51504 unsigned &Offset) {
51505 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51506 if (TrueMaskElt < 0)
51507 return false;
51508
51509 // Get the address of the one scalar element that is specified by the mask
51510 // using the appropriate offset from the base pointer.
51511 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51512 Offset = 0;
51513 Addr = MaskedOp->getBasePtr();
51514 if (TrueMaskElt != 0) {
51515 Offset = TrueMaskElt * EltVT.getStoreSize();
51516 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51517 SDLoc(MaskedOp));
51518 }
51519
51520 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51521 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51522 EltVT.getStoreSize());
51523 return true;
51524}
51525
51526/// If exactly one element of the mask is set for a non-extending masked load,
51527 /// it can be reduced to a scalar load and a vector insert.
51528/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51529/// mask have already been optimized in IR, so we don't bother with those here.
51530static SDValue
51531reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51532 TargetLowering::DAGCombinerInfo &DCI,
51533 const X86Subtarget &Subtarget) {
51534 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51535 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51536 // However, some target hooks may need to be added to know when the transform
51537 // is profitable. Endianness would also have to be considered.
51538
51539 SDValue Addr, VecIndex;
51540 Align Alignment;
51541 unsigned Offset;
51542 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51543 return SDValue();
51544
51545 // Load the one scalar element that is specified by the mask using the
51546 // appropriate offset from the base pointer.
51547 SDLoc DL(ML);
51548 EVT VT = ML->getValueType(0);
51549 EVT EltVT = VT.getVectorElementType();
51550
51551 EVT CastVT = VT;
51552 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51553 EltVT = MVT::f64;
51554 CastVT = VT.changeVectorElementType(EltVT);
51555 }
51556
51557 SDValue Load =
51558 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51559 ML->getPointerInfo().getWithOffset(Offset),
51560 Alignment, ML->getMemOperand()->getFlags());
51561
51562 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51563
51564 // Insert the loaded element into the appropriate place in the vector.
51565 SDValue Insert =
51566 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51567 Insert = DAG.getBitcast(VT, Insert);
51568 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51569}
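A behavioural sketch of the reduction (illustrative template, not DAG code): with exactly one mask bit set at TrueIdx, the masked load result is the pass-through vector with that single element replaced by a scalar load at the corresponding offset:

  #include <cstddef>

  template <typename T, size_t N>
  void maskedLoadOneTrueElt(const T *Base, size_t TrueIdx,
                            const T (&PassThru)[N], T (&Result)[N]) {
    for (size_t I = 0; I != N; ++I)
      Result[I] = PassThru[I];       // start from the pass-through value
    Result[TrueIdx] = Base[TrueIdx]; // the single scalar load + insert
  }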
51570
51571static SDValue
51572combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51573 TargetLowering::DAGCombinerInfo &DCI) {
51574 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51575 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51576 return SDValue();
51577
51578 SDLoc DL(ML);
51579 EVT VT = ML->getValueType(0);
51580
51581 // If we are loading the first and last elements of a vector, it is safe and
51582 // always faster to load the whole vector. Replace the masked load with a
51583 // vector load and select.
51584 unsigned NumElts = VT.getVectorNumElements();
51585 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51586 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51587 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51588 if (LoadFirstElt && LoadLastElt) {
51589 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51590 ML->getMemOperand());
51591 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51592 ML->getPassThru());
51593 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51594 }
51595
51596 // Convert a masked load with a constant mask into a masked load and a select.
51597 // This allows the select operation to use a faster kind of select instruction
51598 // (for example, vblendvps -> vblendps).
51599
51600 // Don't try this if the pass-through operand is already undefined. That would
51601 // cause an infinite loop because that's what we're about to create.
51602 if (ML->getPassThru().isUndef())
51603 return SDValue();
51604
51605 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51606 return SDValue();
51607
51608 // The new masked load has an undef pass-through operand. The select uses the
51609 // original pass-through operand.
51610 SDValue NewML = DAG.getMaskedLoad(
51611 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51612 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51613 ML->getAddressingMode(), ML->getExtensionType());
51614 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51615 ML->getPassThru());
51616
51617 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51618}
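A behavioural sketch of the first rewrite above (illustrative template): when the first and last mask bits are known set, the whole vector may be loaded and then blended with the pass-through, which is what lets the lowering use a cheaper blend:

  #include <cstddef>

  template <typename T, size_t N>
  void maskedLoadAsLoadAndSelect(const T *Base, const bool (&Mask)[N],
                                 const T (&PassThru)[N], T (&Result)[N]) {
    T Loaded[N];
    for (size_t I = 0; I != N; ++I)
      Loaded[I] = Base[I];                            // plain vector load
    for (size_t I = 0; I != N; ++I)
      Result[I] = Mask[I] ? Loaded[I] : PassThru[I];  // select/blend
  }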
51619
51620static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51621 TargetLowering::DAGCombinerInfo &DCI,
51622 const X86Subtarget &Subtarget) {
51623 auto *Mld = cast<MaskedLoadSDNode>(N);
51624
51625 // TODO: Expanding load with constant mask may be optimized as well.
51626 if (Mld->isExpandingLoad())
51627 return SDValue();
51628
51629 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51630 if (SDValue ScalarLoad =
51631 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51632 return ScalarLoad;
51633
51634 // TODO: Do some AVX512 subsets benefit from this transform?
51635 if (!Subtarget.hasAVX512())
51636 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51637 return Blend;
51638 }
51639
51640 // If the mask value has been legalized to a non-boolean vector, try to
51641 // simplify ops leading up to it. We only demand the MSB of each lane.
51642 SDValue Mask = Mld->getMask();
51643 if (Mask.getScalarValueSizeInBits() != 1) {
51644 EVT VT = Mld->getValueType(0);
51645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51646 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51647 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51648 if (N->getOpcode() != ISD::DELETED_NODE)
51649 DCI.AddToWorklist(N);
51650 return SDValue(N, 0);
51651 }
51652 if (SDValue NewMask =
51653 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51654 return DAG.getMaskedLoad(
51655 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51656 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51657 Mld->getAddressingMode(), Mld->getExtensionType());
51658 }
51659
51660 return SDValue();
51661}
51662
51663/// If exactly one element of the mask is set for a non-truncating masked store,
51664 /// it can be reduced to a vector extract and a scalar store.
51665/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51666/// mask have already been optimized in IR, so we don't bother with those here.
51667static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51668 SelectionDAG &DAG,
51669 const X86Subtarget &Subtarget) {
51670 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51671 // However, some target hooks may need to be added to know when the transform
51672 // is profitable. Endianness would also have to be considered.
51673
51674 SDValue Addr, VecIndex;
51675 Align Alignment;
51676 unsigned Offset;
51677 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51678 return SDValue();
51679
51680 // Extract the one scalar element that is actually being stored.
51681 SDLoc DL(MS);
51682 SDValue Value = MS->getValue();
51683 EVT VT = Value.getValueType();
51684 EVT EltVT = VT.getVectorElementType();
51685 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51686 EltVT = MVT::f64;
51687 EVT CastVT = VT.changeVectorElementType(EltVT);
51688 Value = DAG.getBitcast(CastVT, Value);
51689 }
51690 SDValue Extract =
51691 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51692
51693 // Store that element at the appropriate offset from the base pointer.
51694 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51695 MS->getPointerInfo().getWithOffset(Offset),
51696 Alignment, MS->getMemOperand()->getFlags());
51697}
51698
51699static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51700 TargetLowering::DAGCombinerInfo &DCI,
51701 const X86Subtarget &Subtarget) {
51702 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51703 if (Mst->isCompressingStore())
51704 return SDValue();
51705
51706 EVT VT = Mst->getValue().getValueType();
51707 SDLoc dl(Mst);
51708 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51709
51710 if (Mst->isTruncatingStore())
51711 return SDValue();
51712
51713 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51714 return ScalarStore;
51715
51716 // If the mask value has been legalized to a non-boolean vector, try to
51717 // simplify ops leading up to it. We only demand the MSB of each lane.
51718 SDValue Mask = Mst->getMask();
51719 if (Mask.getScalarValueSizeInBits() != 1) {
51720 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51721 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51722 if (N->getOpcode() != ISD::DELETED_NODE)
51723 DCI.AddToWorklist(N);
51724 return SDValue(N, 0);
51725 }
51726 if (SDValue NewMask =
51727 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51728 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51729 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51730 Mst->getMemoryVT(), Mst->getMemOperand(),
51731 Mst->getAddressingMode());
51732 }
51733
51734 SDValue Value = Mst->getValue();
51735 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51736 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51737 Mst->getMemoryVT())) {
51738 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51739 Mst->getBasePtr(), Mst->getOffset(), Mask,
51740 Mst->getMemoryVT(), Mst->getMemOperand(),
51741 Mst->getAddressingMode(), true);
51742 }
51743
51744 return SDValue();
51745}
51746
51747static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51748 TargetLowering::DAGCombinerInfo &DCI,
51749 const X86Subtarget &Subtarget) {
51750 StoreSDNode *St = cast<StoreSDNode>(N);
51751 EVT StVT = St->getMemoryVT();
51752 SDLoc dl(St);
51753 SDValue StoredVal = St->getValue();
51754 EVT VT = StoredVal.getValueType();
51755 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51756
51757 // Convert a store of vXi1 into a store of iX and a bitcast.
51758 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51759 VT.getVectorElementType() == MVT::i1) {
51760
51761 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51762 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51763
51764 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51765 St->getPointerInfo(), St->getOriginalAlign(),
51766 St->getMemOperand()->getFlags());
51767 }
51768
51769 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51770 // This will avoid a copy to k-register.
51771 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51772 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51773 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51774 SDValue Val = StoredVal.getOperand(0);
51775 // We must store zeros to the unused bits.
51776 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51777 return DAG.getStore(St->getChain(), dl, Val,
51778 St->getBasePtr(), St->getPointerInfo(),
51779 St->getOriginalAlign(),
51780 St->getMemOperand()->getFlags());
51781 }
51782
51783 // Widen v2i1/v4i1 stores to v8i1.
51784 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51785 Subtarget.hasAVX512()) {
51786 unsigned NumConcats = 8 / VT.getVectorNumElements();
51787 // We must store zeros to the unused bits.
51788 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51789 Ops[0] = StoredVal;
51790 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51791 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51792 St->getPointerInfo(), St->getOriginalAlign(),
51793 St->getMemOperand()->getFlags());
51794 }
51795
51796 // Turn vXi1 stores of constants into a scalar store.
51797 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51798 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51799 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51800 // If it's a v64i1 store without 64-bit support, we need two stores.
51801 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51802 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51803 StoredVal->ops().slice(0, 32));
51804 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51805 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51806 StoredVal->ops().slice(32, 32));
51807 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51808
51809 SDValue Ptr0 = St->getBasePtr();
51810 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
51811
51812 SDValue Ch0 =
51813 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51814 St->getOriginalAlign(),
51815 St->getMemOperand()->getFlags());
51816 SDValue Ch1 =
51817 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51818 St->getPointerInfo().getWithOffset(4),
51819 St->getOriginalAlign(),
51820 St->getMemOperand()->getFlags());
51821 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51822 }
51823
51824 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51825 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51826 St->getPointerInfo(), St->getOriginalAlign(),
51827 St->getMemOperand()->getFlags());
51828 }
51829
51830 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51831 // Sandy Bridge, perform two 16-byte stores.
51832 unsigned Fast;
51833 if (VT.is256BitVector() && StVT == VT &&
51834 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51835 *St->getMemOperand(), &Fast) &&
51836 !Fast) {
51837 unsigned NumElems = VT.getVectorNumElements();
51838 if (NumElems < 2)
51839 return SDValue();
51840
51841 return splitVectorStore(St, DAG);
51842 }
51843
51844 // Split under-aligned vector non-temporal stores.
51845 if (St->isNonTemporal() && StVT == VT &&
51846 St->getAlign().value() < VT.getStoreSize()) {
51847 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51848 // vectors or the legalizer can scalarize it to use MOVNTI.
51849 if (VT.is256BitVector() || VT.is512BitVector()) {
51850 unsigned NumElems = VT.getVectorNumElements();
51851 if (NumElems < 2)
51852 return SDValue();
51853 return splitVectorStore(St, DAG);
51854 }
51855
51856 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51857 // to use MOVNTI.
51858 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51859 MVT NTVT = Subtarget.hasSSE4A()
51860 ? MVT::v2f64
51861 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51862 return scalarizeVectorStore(St, NTVT, DAG);
51863 }
51864 }
51865
51866 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51867 // supported but AVX512F is, by extending to v16i32 and truncating.
51868 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51869 St->getValue().getOpcode() == ISD::TRUNCATE &&
51870 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51871 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51872 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51873 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51874 St->getValue().getOperand(0));
51875 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51876 MVT::v16i8, St->getMemOperand());
51877 }
51878
51879 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51880 if (!St->isTruncatingStore() &&
51881 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51882 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51883 StoredVal.hasOneUse() &&
51884 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51885 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51886 return EmitTruncSStore(IsSigned, St->getChain(),
51887 dl, StoredVal.getOperand(0), St->getBasePtr(),
51888 VT, St->getMemOperand(), DAG);
51889 }
51890
51891 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51892 if (!St->isTruncatingStore()) {
51893 auto IsExtractedElement = [](SDValue V) {
51894 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51895 V = V.getOperand(0);
51896 unsigned Opc = V.getOpcode();
51897 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51898 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51899 V.getOperand(0).hasOneUse())
51900 return V.getOperand(0);
51901 return SDValue();
51902 };
51903 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51904 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51905 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51906 SDValue Src = Trunc.getOperand(0);
51907 MVT DstVT = Trunc.getSimpleValueType();
51908 MVT SrcVT = Src.getSimpleValueType();
51909 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51910 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51911 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51912 if (NumTruncBits == VT.getSizeInBits() &&
51913 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51914 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51915 TruncVT, St->getMemOperand());
51916 }
51917 }
51918 }
51919 }
51920
51921 // Optimize trunc store (of multiple scalars) to shuffle and store.
51922 // First, pack all of the elements in one place. Next, store to memory
51923 // in fewer chunks.
51924 if (St->isTruncatingStore() && VT.isVector()) {
51925 // Check if we can detect an AVG pattern from the truncation. If yes,
51926 // replace the trunc store by a normal store with the result of the AVG
51927 // (ISD::AVGCEILU) node.
51928 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51929 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51930 Subtarget, dl))
51931 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51932 St->getPointerInfo(), St->getOriginalAlign(),
51933 St->getMemOperand()->getFlags());
51934
51935 if (TLI.isTruncStoreLegal(VT, StVT)) {
51936 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51937 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51938 dl, Val, St->getBasePtr(),
51939 St->getMemoryVT(), St->getMemOperand(), DAG);
51940 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51941 DAG, dl))
51942 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51943 dl, Val, St->getBasePtr(),
51944 St->getMemoryVT(), St->getMemOperand(), DAG);
51945 }
51946
51947 return SDValue();
51948 }
51949
51950 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51951 unsigned AddrSpace = St->getAddressSpace();
51952 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51953 AddrSpace == X86AS::PTR32_UPTR) {
51954 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51955 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51956 SDValue Cast =
51957 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51958 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
51959 St->getPointerInfo(), St->getOriginalAlign(),
51960 St->getMemOperand()->getFlags(), St->getAAInfo());
51961 }
51962 }
51963
51964 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51965 // the FP state in cases where an emms may be missing.
51966 // A preferable solution to the general problem is to figure out the right
51967 // places to insert EMMS. This qualifies as a quick hack.
51968
51969 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51970 if (VT.getSizeInBits() != 64)
51971 return SDValue();
51972
51973 const Function &F = DAG.getMachineFunction().getFunction();
51974 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51975 bool F64IsLegal =
51976 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51977 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
51978 isa<LoadSDNode>(St->getValue()) &&
51979 cast<LoadSDNode>(St->getValue())->isSimple() &&
51980 St->getChain().hasOneUse() && St->isSimple()) {
51981 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
51982
51983 if (!ISD::isNormalLoad(Ld))
51984 return SDValue();
51985
51986 // Avoid the transformation if there are multiple uses of the loaded value.
51987 if (!Ld->hasNUsesOfValue(1, 0))
51988 return SDValue();
51989
51990 SDLoc LdDL(Ld);
51991 SDLoc StDL(N);
51992 // Lower to a single movq load/store pair.
51993 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51994 Ld->getBasePtr(), Ld->getMemOperand());
51995
51996 // Make sure new load is placed in same chain order.
51997 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51998 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51999 St->getMemOperand());
52000 }
52001
52002 // This is similar to the above case, but here we handle a scalar 64-bit
52003 // integer store that is extracted from a vector on a 32-bit target.
52004 // If we have SSE2, then we can treat it like a floating-point double
52005 // to get past legalization. The execution dependencies fixup pass will
52006 // choose the optimal machine instruction for the store if this really is
52007 // an integer or v2f32 rather than an f64.
52008 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
52009 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
52010 SDValue OldExtract = St->getOperand(1);
52011 SDValue ExtOp0 = OldExtract.getOperand(0);
52012 unsigned VecSize = ExtOp0.getValueSizeInBits();
52013 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
52014 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
52015 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
52016 BitCast, OldExtract.getOperand(1));
52017 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
52018 St->getPointerInfo(), St->getOriginalAlign(),
52019 St->getMemOperand()->getFlags());
52020 }
52021
52022 return SDValue();
52023}
52024
52025static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
52026 TargetLowering::DAGCombinerInfo &DCI,
52027 const X86Subtarget &Subtarget) {
52028 auto *St = cast<MemIntrinsicSDNode>(N);
52029
52030 SDValue StoredVal = N->getOperand(1);
52031 MVT VT = StoredVal.getSimpleValueType();
52032 EVT MemVT = St->getMemoryVT();
52033
52034 // Figure out which elements we demand.
52035 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
52036 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
52037
52038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52039 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
52040 if (N->getOpcode() != ISD::DELETED_NODE)
52041 DCI.AddToWorklist(N);
52042 return SDValue(N, 0);
52043 }
52044
52045 return SDValue();
52046}
52047
52048/// Return 'true' if this vector operation is "horizontal"
52049/// and return the operands for the horizontal operation in LHS and RHS. A
52050/// horizontal operation performs the binary operation on successive elements
52051/// of its first operand, then on successive elements of its second operand,
52052/// returning the resulting values in a vector. For example, if
52053/// A = < float a0, float a1, float a2, float a3 >
52054/// and
52055/// B = < float b0, float b1, float b2, float b3 >
52056/// then the result of doing a horizontal operation on A and B is
52057/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52058/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52059/// A horizontal-op B, for some already available A and B, and if so then LHS is
52060/// set to A, RHS to B, and the routine returns 'true'.
52061static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52062 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52063 bool IsCommutative,
52064 SmallVectorImpl<int> &PostShuffleMask) {
52065 // If either operand is undef, bail out. The binop should be simplified.
52066 if (LHS.isUndef() || RHS.isUndef())
52067 return false;
52068
52069 // Look for the following pattern:
52070 // A = < float a0, float a1, float a2, float a3 >
52071 // B = < float b0, float b1, float b2, float b3 >
52072 // and
52073 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52074 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52075 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52076 // which is A horizontal-op B.
52077
52078 MVT VT = LHS.getSimpleValueType();
52079 assert((VT.is128BitVector() || VT.is256BitVector()) &&
52080 "Unsupported vector type for horizontal add/sub");
52081 unsigned NumElts = VT.getVectorNumElements();
52082
52083 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52084 SmallVectorImpl<int> &ShuffleMask) {
52085 bool UseSubVector = false;
52086 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52087 Op.getOperand(0).getValueType().is256BitVector() &&
52088 llvm::isNullConstant(Op.getOperand(1))) {
52089 Op = Op.getOperand(0);
52090 UseSubVector = true;
52091 }
52092 SmallVector<SDValue, 2> SrcOps;
52093 SmallVector<int, 16> SrcMask, ScaledMask;
52094 SDValue BC = peekThroughBitcasts(Op);
52095 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52096 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52097 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52098 })) {
52099 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52100 if (!UseSubVector && SrcOps.size() <= 2 &&
52101 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52102 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52103 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52104 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52105 }
52106 if (UseSubVector && SrcOps.size() == 1 &&
52107 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52108 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52109 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52110 ShuffleMask.assign(Mask.begin(), Mask.end());
52111 }
52112 }
52113 };
52114
52115 // View LHS in the form
52116 // LHS = VECTOR_SHUFFLE A, B, LMask
52117 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52118 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52119 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52120 SDValue A, B;
52121 SmallVector<int, 16> LMask;
52122 GetShuffle(LHS, A, B, LMask);
52123
52124 // Likewise, view RHS in the form
52125 // RHS = VECTOR_SHUFFLE C, D, RMask
52126 SDValue C, D;
52127 SmallVector<int, 16> RMask;
52128 GetShuffle(RHS, C, D, RMask);
52129
52130 // At least one of the operands should be a vector shuffle.
52131 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52132 if (NumShuffles == 0)
52133 return false;
52134
52135 if (LMask.empty()) {
52136 A = LHS;
52137 for (unsigned i = 0; i != NumElts; ++i)
52138 LMask.push_back(i);
52139 }
52140
52141 if (RMask.empty()) {
52142 C = RHS;
52143 for (unsigned i = 0; i != NumElts; ++i)
52144 RMask.push_back(i);
52145 }
52146
52147 // If we have a unary mask, ensure the other op is set to null.
52148 if (isUndefOrInRange(LMask, 0, NumElts))
52149 B = SDValue();
52150 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52151 A = SDValue();
52152
52153 if (isUndefOrInRange(RMask, 0, NumElts))
52154 D = SDValue();
52155 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52156 C = SDValue();
52157
52158 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52159 // RHS operands and shuffle mask.
52160 if (A != C) {
52161 std::swap(C, D);
52162 ShuffleVectorSDNode::commuteMask(RMask);
52163 }
52164 // Check that the shuffles are both shuffling the same vectors.
52165 if (!(A == C && B == D))
52166 return false;
52167
52168 PostShuffleMask.clear();
52169 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52170
52171 // LHS and RHS are now:
52172 // LHS = shuffle A, B, LMask
52173 // RHS = shuffle A, B, RMask
52174 // Check that the masks correspond to performing a horizontal operation.
52175 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52176 // so we just repeat the inner loop if this is a 256-bit op.
52177 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52178 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52179 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52180 assert((NumEltsPer128BitChunk % 2 == 0) &&
52181 "Vector type should have an even number of elements in each lane");
52182 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52183 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52184 // Ignore undefined components.
52185 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52186 if (LIdx < 0 || RIdx < 0 ||
52187 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52188 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52189 continue;
52190
52191 // Check that successive odd/even elements are being operated on. If not,
52192 // this is not a horizontal operation.
52193 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52194 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52195 return false;
52196
52197 // Compute the post-shuffle mask index based on where the element
52198 // is stored in the HOP result, and where it needs to be moved to.
52199 int Base = LIdx & ~1u;
52200 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52201 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52202
52203 // The low half of the 128-bit result must choose from A.
52204 // The high half of the 128-bit result must choose from B,
52205 // unless B is undef. In that case, we are always choosing from A.
52206 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52207 Index += NumEltsPer64BitChunk;
52208 PostShuffleMask[i + j] = Index;
52209 }
52210 }
52211
52212 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52213 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52214
52215 bool IsIdentityPostShuffle =
52216 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52217 if (IsIdentityPostShuffle)
52218 PostShuffleMask.clear();
52219
52220 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52221 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52222 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52223 return false;
52224
52225 // If the source nodes are already used in HorizOps then always accept this.
52226 // Shuffle folding should merge these back together.
52227 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52228 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52229 });
52230 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52231 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52232 });
52233 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52234
52235 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52236 // shuffle the result.
52237 if (!ForceHorizOp &&
52238 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52239 (NumShuffles < 2 || !IsIdentityPostShuffle),
52240 DAG, Subtarget))
52241 return false;
52242
52243 LHS = DAG.getBitcast(VT, NewLHS);
52244 RHS = DAG.getBitcast(VT, NewRHS);
52245 return true;
52246}
52247
52248// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52249static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52250 const X86Subtarget &Subtarget) {
52251 EVT VT = N->getValueType(0);
52252 unsigned Opcode = N->getOpcode();
52253 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52254 SmallVector<int, 8> PostShuffleMask;
52255
52256 switch (Opcode) {
52257 case ISD::FADD:
52258 case ISD::FSUB:
52259 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52260 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52261 SDValue LHS = N->getOperand(0);
52262 SDValue RHS = N->getOperand(1);
52263 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52264 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52265 PostShuffleMask)) {
52266 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52267 if (!PostShuffleMask.empty())
52268 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52269 DAG.getUNDEF(VT), PostShuffleMask);
52270 return HorizBinOp;
52271 }
52272 }
52273 break;
52274 case ISD::ADD:
52275 case ISD::SUB:
52276 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52277 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52278 SDValue LHS = N->getOperand(0);
52279 SDValue RHS = N->getOperand(1);
52280 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52281 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52282 PostShuffleMask)) {
52283 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52284 ArrayRef<SDValue> Ops) {
52285 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52286 };
52287 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52288 {LHS, RHS}, HOpBuilder);
52289 if (!PostShuffleMask.empty())
52290 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52291 DAG.getUNDEF(VT), PostShuffleMask);
52292 return HorizBinOp;
52293 }
52294 }
52295 break;
52296 }
52297
52298 return SDValue();
52299}
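// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// A scalar model of the 128-bit horizontal add the combine above tries to
// form: each output element is the sum of an adjacent pair, low half taken
// from LHS and high half from RHS. Names and values are illustrative only.
#include <array>
#include <cassert>

static std::array<float, 4> haddps_model(const std::array<float, 4> &L,
                                         const std::array<float, 4> &R) {
  // Matches SSE3 HADDPS on a single 128-bit lane.
  return {L[0] + L[1], L[2] + L[3], R[0] + R[1], R[2] + R[3]};
}

static void haddps_model_example() {
  std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  std::array<float, 4> H = haddps_model(A, B);
  assert(H[0] == 3 && H[1] == 7 && H[2] == 30 && H[3] == 70);
}
// ---------------------------------------------------------------------------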
52300
52301// Try to combine the following nodes
52302// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52303// <i32 -2147483648[float -0.000000e+00]> 0
52304// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52305// <(load 4 from constant-pool)> t0, t29
52306// [t30: v16i32 = bitcast t27]
52307// t6: v16i32 = xor t7, t27[t30]
52308// t11: v16f32 = bitcast t6
52309// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52310// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52311// t22: v16f32 = bitcast t7
52312// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52313// t24: v32f16 = bitcast t23
52314static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52315 const X86Subtarget &Subtarget) {
52316 EVT VT = N->getValueType(0);
52317 SDValue LHS = N->getOperand(0);
52318 SDValue RHS = N->getOperand(1);
52319 int CombineOpcode =
52320 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52321 auto isConjugationConstant = [](const Constant *c) {
52322 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52323 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52324 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52325 switch (CI->getBitWidth()) {
52326 case 16:
52327 return false;
52328 case 32:
52329 return CI->getValue() == ConjugationInt32;
52330 case 64:
52331 return CI->getValue() == ConjugationInt64;
52332 default:
52333      llvm_unreachable("Unexpected bit width");
52334 }
52335 }
52336 if (const auto *CF = dyn_cast<ConstantFP>(c))
52337 return CF->isNegativeZeroValue();
52338 return false;
52339 };
52340 auto combineConjugation = [&](SDValue &r) {
52341 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52342 SDValue XOR = LHS.getOperand(0);
52343 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52344 SDValue XORRHS = XOR.getOperand(1);
52345 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52346 XORRHS = XORRHS.getOperand(0);
52347 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52348 XORRHS.getOperand(1).getNumOperands()) {
52349 ConstantPoolSDNode *CP =
52350 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52351 if (CP && isConjugationConstant(CP->getConstVal())) {
52352 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52353 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52354 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52355 r = DAG.getBitcast(VT, FCMulC);
52356 return true;
52357 }
52358 }
52359 }
52360 }
52361 return false;
52362 };
52363 SDValue Res;
52364 if (combineConjugation(Res))
52365 return Res;
52366 std::swap(LHS, RHS);
52367 if (combineConjugation(Res))
52368 return Res;
52369 return Res;
52370}
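// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// isConjugationConstant() above accepts 0x80000000 per 32-bit lane (or the
// FP constant -0.0). The sketch below shows why: 0x80000000 is exactly the
// IEEE-754 single-precision sign bit, so XOR-ing a lane with it flips only
// the sign. Under the complex-FP16 layout assumed here for illustration
// (imaginary half in the upper 16 bits of each 32-bit lane), flipping that
// bit negates the imaginary part, i.e. conjugates the value.
#include <cassert>
#include <cstdint>
#include <cstring>

static void conjugation_constant_example() {
  float NegZero = -0.0f;
  std::uint32_t SignMask;
  std::memcpy(&SignMask, &NegZero, sizeof(SignMask));
  assert(SignMask == 0x80000000u); // the 32-bit conjugation constant

  float X = 1.5f;
  std::uint32_t XBits;
  std::memcpy(&XBits, &X, sizeof(XBits));
  XBits ^= SignMask; // XOR with the sign mask negates the value bit-exactly
  float NegX;
  std::memcpy(&NegX, &XBits, sizeof(NegX));
  assert(NegX == -1.5f);
}
// ---------------------------------------------------------------------------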
52371
52372// Try to combine the following nodes:
52373// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52374static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52375 const X86Subtarget &Subtarget) {
52376 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52377 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52378 Flags.hasAllowContract();
52379 };
52380
52381 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52382 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52383 Flags.hasNoSignedZeros();
52384 };
52385 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52386 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52387 return false;
52388    assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52389           "Unexpected vector type!");
52390 if (ConstantPoolSDNode *CP =
52391 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52392 APInt AI = APInt(32, 0x80008000, true);
52393 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52394 return CI->getValue() == AI;
52395 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52396 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52397 }
52398 return false;
52399 };
52400
52401 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52402 !AllowContract(N->getFlags()))
52403 return SDValue();
52404
52405 EVT VT = N->getValueType(0);
52406 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52407 return SDValue();
52408
52409 SDValue LHS = N->getOperand(0);
52410 SDValue RHS = N->getOperand(1);
52411 bool IsConj;
52412 SDValue FAddOp1, MulOp0, MulOp1;
52413 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52414 &IsVectorAllNegativeZero,
52415 &HasNoSignedZero](SDValue N) -> bool {
52416 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52417 return false;
52418 SDValue Op0 = N.getOperand(0);
52419 unsigned Opcode = Op0.getOpcode();
52420 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52421 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52422 MulOp0 = Op0.getOperand(0);
52423 MulOp1 = Op0.getOperand(1);
52424 IsConj = Opcode == X86ISD::VFCMULC;
52425 return true;
52426 }
52427 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52428 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52429 HasNoSignedZero(Op0->getFlags())) ||
52430 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52431 MulOp0 = Op0.getOperand(0);
52432 MulOp1 = Op0.getOperand(1);
52433 IsConj = Opcode == X86ISD::VFCMADDC;
52434 return true;
52435 }
52436 }
52437 return false;
52438 };
52439
52440 if (GetCFmulFrom(LHS))
52441 FAddOp1 = RHS;
52442 else if (GetCFmulFrom(RHS))
52443 FAddOp1 = LHS;
52444 else
52445 return SDValue();
52446
52447 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52448 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52449 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52450 // FIXME: How do we handle when fast math flags of FADD are different from
52451 // CFMUL's?
52452 SDValue CFmul =
52453 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52454 return DAG.getBitcast(VT, CFmul);
52455}
52456
52457/// Do target-specific dag combines on floating-point adds/subs.
52458static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52459 const X86Subtarget &Subtarget) {
52460 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52461 return HOp;
52462
52463 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52464 return COp;
52465
52466 return SDValue();
52467}
52468
52469/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52470/// the codegen.
52471/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52472/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52473/// anything that is guaranteed to be transformed by DAGCombiner.
52474static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52475 const X86Subtarget &Subtarget,
52476 const SDLoc &DL) {
52477  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52478 SDValue Src = N->getOperand(0);
52479 unsigned SrcOpcode = Src.getOpcode();
52480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52481
52482 EVT VT = N->getValueType(0);
52483 EVT SrcVT = Src.getValueType();
52484
52485 auto IsFreeTruncation = [VT](SDValue Op) {
52486 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52487
52488 // See if this has been extended from a smaller/equal size to
52489 // the truncation size, allowing a truncation to combine with the extend.
52490 unsigned Opcode = Op.getOpcode();
52491 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52492 Opcode == ISD::ZERO_EXTEND) &&
52493 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52494 return true;
52495
52496 // See if this is a single use constant which can be constant folded.
52497    // NOTE: We don't peek through bitcasts here because there is currently
52498    // no support for constant folding truncate+bitcast+vector_of_constants. So
52499    // we'll just end up with a truncate on both operands which will
52500 // get turned back into (truncate (binop)) causing an infinite loop.
52501 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52502 };
52503
52504 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52505 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52506 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52507 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52508 };
52509
52510 // Don't combine if the operation has other uses.
52511 if (!Src.hasOneUse())
52512 return SDValue();
52513
52514 // Only support vector truncation for now.
52515 // TODO: i64 scalar math would benefit as well.
52516 if (!VT.isVector())
52517 return SDValue();
52518
52519  // In most cases it's only worth pre-truncating if we're only facing the cost
52520 // of one truncation.
52521 // i.e. if one of the inputs will constant fold or the input is repeated.
52522 switch (SrcOpcode) {
52523 case ISD::MUL:
52524    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52525    // better to truncate if we have the chance.
52526 if (SrcVT.getScalarType() == MVT::i64 &&
52527 TLI.isOperationLegal(SrcOpcode, VT) &&
52528 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52529 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52530 [[fallthrough]];
52531 case ISD::AND:
52532 case ISD::XOR:
52533 case ISD::OR:
52534 case ISD::ADD:
52535 case ISD::SUB: {
52536 SDValue Op0 = Src.getOperand(0);
52537 SDValue Op1 = Src.getOperand(1);
52538 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52539 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52540 return TruncateArithmetic(Op0, Op1);
52541 break;
52542 }
52543 }
52544
52545 return SDValue();
52546}
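// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Why pre-truncating the operands is sound for the opcodes handled above:
// truncation to N bits is reduction modulo 2^N, and ADD/SUB/MUL/AND/OR/XOR
// all commute with that reduction. Values below are illustrative only.
#include <cassert>
#include <cstdint>

static void truncate_commutes_example() {
  std::uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
  assert(std::uint16_t(X + Y) == std::uint16_t(std::uint16_t(X) + std::uint16_t(Y)));
  assert(std::uint16_t(X * Y) == std::uint16_t(std::uint16_t(X) * std::uint16_t(Y)));
  assert(std::uint16_t(X ^ Y) == std::uint16_t(std::uint16_t(X) ^ std::uint16_t(Y)));
  // Note this does NOT hold for e.g. division or right shifts, which is why
  // those opcodes are not in the switch above.
}
// ---------------------------------------------------------------------------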
52547
52548/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52549/// e.g. trunc <8 x i32> X to <8 x i16> -->
52550/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52551/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52552static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52553 const X86Subtarget &Subtarget,
52554 SelectionDAG &DAG) {
52555 SDValue In = N->getOperand(0);
52556 EVT InVT = In.getValueType();
52557 EVT OutVT = N->getValueType(0);
52558
52559 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52560 OutVT.getScalarSizeInBits());
52561 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52562 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52563}
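// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Per-element model of the PACKUS-based truncation above: PACKUSWB saturates
// a signed 16-bit input to the unsigned 8-bit range, so once the high bits
// have been masked off (value already in [0, 255]) the saturation is a no-op
// and the pack is a plain truncation. Names are illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::uint8_t packus_i16_to_u8(std::int16_t V) {
  return static_cast<std::uint8_t>(std::clamp<int>(V, 0, 255));
}

static void packus_truncation_example() {
  std::int16_t Wide = 0x1234;          // arbitrary i16 element
  std::int16_t Masked = Wide & 0x00FF; // the ISD::AND step above
  assert(packus_i16_to_u8(Masked) == std::uint8_t(Wide)); // pure truncation
}
// ---------------------------------------------------------------------------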
52564
52565/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
52566static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52567 const X86Subtarget &Subtarget,
52568 SelectionDAG &DAG) {
52569 SDValue In = N->getOperand(0);
52570 EVT InVT = In.getValueType();
52571 EVT OutVT = N->getValueType(0);
52572 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52573 DAG.getValueType(OutVT));
52574 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52575}
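// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Per-element model of the PACKSS-based truncation above: PACKSSDW saturates
// a signed 32-bit input to [-32768, 32767]. After the SIGN_EXTEND_INREG step
// the value is already in that range, so the saturating pack reduces to a
// truncation. Names are illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::int16_t packss_i32_to_i16(std::int32_t V) {
  return static_cast<std::int16_t>(std::clamp<std::int32_t>(V, -32768, 32767));
}

static void packss_truncation_example() {
  std::int32_t Wide = 0x0001F234;          // arbitrary i32 element
  std::int32_t InReg = std::int16_t(Wide); // SIGN_EXTEND_INREG to i16
  assert(packss_i32_to_i16(InReg) == std::int16_t(Wide)); // pure truncation
}
// ---------------------------------------------------------------------------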
52576
52577/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52578/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52579/// legalization the truncation will be translated into a BUILD_VECTOR with each
52580/// element that is extracted from a vector and then truncated, and it is
52581/// difficult to do this optimization based on them.
52582static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52583 const X86Subtarget &Subtarget) {
52584 EVT OutVT = N->getValueType(0);
52585 if (!OutVT.isVector())
52586 return SDValue();
52587
52588 SDValue In = N->getOperand(0);
52589 if (!In.getValueType().isSimple())
52590 return SDValue();
52591
52592 EVT InVT = In.getValueType();
52593 unsigned NumElems = OutVT.getVectorNumElements();
52594
52595 // AVX512 provides fast truncate ops.
52596 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52597 return SDValue();
52598
52599 EVT OutSVT = OutVT.getVectorElementType();
52600 EVT InSVT = InVT.getVectorElementType();
52601 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52602 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52603 NumElems >= 8))
52604 return SDValue();
52605
52606  // SSSE3's pshufb results in fewer instructions in the cases below.
52607 if (Subtarget.hasSSSE3() && NumElems == 8) {
52608 if (InSVT == MVT::i16)
52609 return SDValue();
52610 if (InSVT == MVT::i32 &&
52611 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52612 return SDValue();
52613 }
52614
52615 SDLoc DL(N);
52616 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52617 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52618 // truncate 2 x v4i32 to v8i16.
52619 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52620 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52621 if (InSVT == MVT::i32)
52622 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52623
52624 return SDValue();
52625}
52626
52627/// This function transforms vector truncation of 'extended sign-bits' or
52628/// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
52629/// into X86ISD::PACKSS/PACKUS operations.
52630static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52631 SelectionDAG &DAG,
52632 const X86Subtarget &Subtarget) {
52633 // Requires SSE2.
52634 if (!Subtarget.hasSSE2())
52635 return SDValue();
52636
52637 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52638 return SDValue();
52639
52640 SDValue In = N->getOperand(0);
52641 if (!In.getValueType().isSimple())
52642 return SDValue();
52643
52644 MVT VT = N->getValueType(0).getSimpleVT();
52645 MVT SVT = VT.getScalarType();
52646
52647 MVT InVT = In.getValueType().getSimpleVT();
52648 MVT InSVT = InVT.getScalarType();
52649
52650 // Check we have a truncation suited for PACKSS/PACKUS.
52651 if (!isPowerOf2_32(VT.getVectorNumElements()))
52652 return SDValue();
52653 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52654 return SDValue();
52655 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52656 return SDValue();
52657
52658 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52659 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52660 return SDValue();
52661
52662 // AVX512 has fast truncate, but if the input is already going to be split,
52663 // there's no harm in trying pack.
52664 if (Subtarget.hasAVX512() &&
52665 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52666 InVT.is512BitVector())) {
52667 // PACK should still be worth it for 128-bit vectors if the sources were
52668 // originally concatenated from subvectors.
52669 SmallVector<SDValue> ConcatOps;
52670 if (VT.getSizeInBits() > 128 ||
52671 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52672 return SDValue();
52673 }
52674
52675 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52676 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52677
52678 // Use PACKUS if the input has zero-bits that extend all the way to the
52679 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52680 KnownBits Known = DAG.computeKnownBits(In);
52681 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52682 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52683 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52684
52685 // Use PACKSS if the input has sign-bits that extend all the way to the
52686 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52687 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52688
52689 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52690 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52691 // on and combines/simplifications can't then use it.
52692 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52693 return SDValue();
52694
52695 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52696 if (NumSignBits > MinSignBits)
52697 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52698
52699 // If we have a srl that only generates signbits that we will discard in
52700 // the truncation then we can use PACKSS by converting the srl to a sra.
52701 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52702 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52703 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52704 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52705 if (*ShAmt == MinSignBits) {
52706 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52707 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52708 Subtarget);
52709 }
52710 }
52711
52712 return SDValue();
52713}
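// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The "NumSignBits > MinSignBits" test above is equivalent, per element, to
// saying the wide value already round-trips through the narrow signed type,
// in which case the saturating PACKSS is a faithful truncation. Values below
// are illustrative only.
#include <cassert>
#include <cstdint>

static bool fits_in_i16(std::int32_t V) {
  return V == std::int32_t(std::int16_t(V)); // i.e. at least 17 "sign bits"
}

static void packss_signbits_example() {
  assert(fits_in_i16(-1) && fits_in_i16(32767) && fits_in_i16(-32768));
  assert(!fits_in_i16(40000)); // would be saturated, not truncated, by PACKSS
}
// ---------------------------------------------------------------------------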
52714
52715// Try to form a MULHU or MULHS node by looking for
52716// (trunc (srl (mul ext, ext), 16))
52717// TODO: This is X86 specific because we want to be able to handle wide types
52718// before type legalization. But we can only do it if the vector will be
52719// legalized via widening/splitting. Type legalization can't handle promotion
52720// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52721// combiner.
52722static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52723 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52724 // First instruction should be a right shift of a multiply.
52725 if (Src.getOpcode() != ISD::SRL ||
52726 Src.getOperand(0).getOpcode() != ISD::MUL)
52727 return SDValue();
52728
52729 if (!Subtarget.hasSSE2())
52730 return SDValue();
52731
52732 // Only handle vXi16 types that are at least 128-bits unless they will be
52733 // widened.
52734 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52735 return SDValue();
52736
52737 // Input type should be at least vXi32.
52738 EVT InVT = Src.getValueType();
52739 if (InVT.getVectorElementType().getSizeInBits() < 32)
52740 return SDValue();
52741
52742 // Need a shift by 16.
52743 APInt ShiftAmt;
52744 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52745 ShiftAmt != 16)
52746 return SDValue();
52747
52748 SDValue LHS = Src.getOperand(0).getOperand(0);
52749 SDValue RHS = Src.getOperand(0).getOperand(1);
52750
52751 // Count leading sign/zero bits on both inputs - if there are enough then
52752 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52753 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52754 // truncations may actually be free by peeking through to the ext source.
52755 auto IsSext = [&DAG](SDValue V) {
52756 return DAG.ComputeMaxSignificantBits(V) <= 16;
52757 };
52758 auto IsZext = [&DAG](SDValue V) {
52759 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52760 };
52761
52762 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52763 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52764 if (!IsSigned && !IsUnsigned)
52765 return SDValue();
52766
52767 // Check if both inputs are extensions, which will be removed by truncation.
52768 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52769 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52770 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52771 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52772 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52773 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52774
52775 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52776 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52777 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52778 // will have to split anyway.
52779 unsigned InSizeInBits = InVT.getSizeInBits();
52780 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52781 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52782 (InSizeInBits % 16) == 0) {
52783 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52784 InVT.getSizeInBits() / 16);
52785 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52786 DAG.getBitcast(BCVT, RHS));
52787 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52788 }
52789
52790 // Truncate back to source type.
52791 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52792 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52793
52794 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52795 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52796}
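// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Scalar model of the MULHU/MULHS nodes formed above: the high 16 bits of
// the widened 16x16 product, which is exactly what
// (trunc (srl (mul ext, ext), 16)) computes. Names are illustrative only;
// the signed case assumes an arithmetic right shift of negative values.
#include <cassert>
#include <cstdint>

static std::uint16_t mulhu16(std::uint16_t A, std::uint16_t B) {
  return std::uint16_t((std::uint32_t(A) * std::uint32_t(B)) >> 16);
}

static std::int16_t mulhs16(std::int16_t A, std::int16_t B) {
  return std::int16_t((std::int32_t(A) * std::int32_t(B)) >> 16);
}

static void pmulh_example() {
  assert(mulhu16(0xFFFF, 0xFFFF) == 0xFFFE); // 0xFFFF * 0xFFFF = 0xFFFE0001
  assert(mulhs16(-2, 3) == -1);              // high half of -6 is all ones
}
// ---------------------------------------------------------------------------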
52797
52798// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52799// from one vector with signed bytes from another vector, adds together
52800// adjacent pairs of 16-bit products, and saturates the result before
52801// truncating to 16-bits.
52802//
52803// Which looks something like this:
52804// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52805// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52806static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52807 const X86Subtarget &Subtarget,
52808 const SDLoc &DL) {
52809 if (!VT.isVector() || !Subtarget.hasSSSE3())
52810 return SDValue();
52811
52812 unsigned NumElems = VT.getVectorNumElements();
52813 EVT ScalarVT = VT.getVectorElementType();
52814 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52815 return SDValue();
52816
52817 SDValue SSatVal = detectSSatPattern(In, VT);
52818 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52819 return SDValue();
52820
52821 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52822 // of multiplies from even/odd elements.
52823 SDValue N0 = SSatVal.getOperand(0);
52824 SDValue N1 = SSatVal.getOperand(1);
52825
52826 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52827 return SDValue();
52828
52829 SDValue N00 = N0.getOperand(0);
52830 SDValue N01 = N0.getOperand(1);
52831 SDValue N10 = N1.getOperand(0);
52832 SDValue N11 = N1.getOperand(1);
52833
52834 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52835 // Canonicalize zero_extend to LHS.
52836 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52837 std::swap(N00, N01);
52838 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52839 std::swap(N10, N11);
52840
52841 // Ensure we have a zero_extend and a sign_extend.
52842 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52843 N01.getOpcode() != ISD::SIGN_EXTEND ||
52844 N10.getOpcode() != ISD::ZERO_EXTEND ||
52845 N11.getOpcode() != ISD::SIGN_EXTEND)
52846 return SDValue();
52847
52848 // Peek through the extends.
52849 N00 = N00.getOperand(0);
52850 N01 = N01.getOperand(0);
52851 N10 = N10.getOperand(0);
52852 N11 = N11.getOperand(0);
52853
52854 // Ensure the extend is from vXi8.
52855 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52856 N01.getValueType().getVectorElementType() != MVT::i8 ||
52857 N10.getValueType().getVectorElementType() != MVT::i8 ||
52858 N11.getValueType().getVectorElementType() != MVT::i8)
52859 return SDValue();
52860
52861 // All inputs should be build_vectors.
52862 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52863 N01.getOpcode() != ISD::BUILD_VECTOR ||
52864 N10.getOpcode() != ISD::BUILD_VECTOR ||
52865 N11.getOpcode() != ISD::BUILD_VECTOR)
52866 return SDValue();
52867
52868 // N00/N10 are zero extended. N01/N11 are sign extended.
52869
52870 // For each element, we need to ensure we have an odd element from one vector
52871 // multiplied by the odd element of another vector and the even element from
52872 // one of the same vectors being multiplied by the even element from the
52873 // other vector. So we need to make sure for each element i, this operator
52874 // is being performed:
52875 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52876 SDValue ZExtIn, SExtIn;
52877 for (unsigned i = 0; i != NumElems; ++i) {
52878 SDValue N00Elt = N00.getOperand(i);
52879 SDValue N01Elt = N01.getOperand(i);
52880 SDValue N10Elt = N10.getOperand(i);
52881 SDValue N11Elt = N11.getOperand(i);
52882 // TODO: Be more tolerant to undefs.
52883 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52884 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52885 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52886 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52887 return SDValue();
52888 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52889 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52890 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52891 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52892 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52893 return SDValue();
52894 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52895 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52896 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52897 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52898 // Add is commutative so indices can be reordered.
52899 if (IdxN00 > IdxN10) {
52900 std::swap(IdxN00, IdxN10);
52901 std::swap(IdxN01, IdxN11);
52902 }
52903    // N0 indices must be the even element. N1 indices must be the next odd element.
52904 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52905 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52906 return SDValue();
52907 SDValue N00In = N00Elt.getOperand(0);
52908 SDValue N01In = N01Elt.getOperand(0);
52909 SDValue N10In = N10Elt.getOperand(0);
52910 SDValue N11In = N11Elt.getOperand(0);
52911 // First time we find an input capture it.
52912 if (!ZExtIn) {
52913 ZExtIn = N00In;
52914 SExtIn = N01In;
52915 }
52916 if (ZExtIn != N00In || SExtIn != N01In ||
52917 ZExtIn != N10In || SExtIn != N11In)
52918 return SDValue();
52919 }
52920
52921 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52922 ArrayRef<SDValue> Ops) {
52923 // Shrink by adding truncate nodes and let DAGCombine fold with the
52924 // sources.
52925 EVT InVT = Ops[0].getValueType();
52926    assert(InVT.getScalarType() == MVT::i8 &&
52927           "Unexpected scalar element type");
52928    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52929 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52930 InVT.getVectorNumElements() / 2);
52931 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52932 };
52933 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52934 PMADDBuilder);
52935}
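// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Per-pair scalar model of the PMADDUBSW node matched above: unsigned bytes
// from A times signed bytes from B, with the two adjacent 16-bit products
// added under signed saturation. Names are illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::int16_t pmaddubsw_pair(std::uint8_t A0, std::uint8_t A1,
                                   std::int8_t B0, std::int8_t B1) {
  int Sum = int(A0) * int(B0) + int(A1) * int(B1); // products fit easily in int
  return std::int16_t(std::clamp(Sum, -32768, 32767)); // ssat to i16
}

static void pmaddubsw_example() {
  assert(pmaddubsw_pair(10, 20, 3, -4) == 10 * 3 + 20 * -4); // -50, no saturation
  assert(pmaddubsw_pair(255, 255, 127, 127) == 32767);       // saturates
}
// ---------------------------------------------------------------------------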
52936
52937static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52938 const X86Subtarget &Subtarget) {
52939 EVT VT = N->getValueType(0);
52940 SDValue Src = N->getOperand(0);
52941 SDLoc DL(N);
52942
52943 // Attempt to pre-truncate inputs to arithmetic ops instead.
52944 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52945 return V;
52946
52947 // Try to detect AVG pattern first.
52948 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
52949 return Avg;
52950
52951 // Try to detect PMADD
52952 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52953 return PMAdd;
52954
52955 // Try to combine truncation with signed/unsigned saturation.
52956 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52957 return Val;
52958
52959 // Try to combine PMULHUW/PMULHW for vXi16.
52960 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52961 return V;
52962
52963 // The bitcast source is a direct mmx result.
52964 // Detect bitcasts between i32 to x86mmx
52965 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52966 SDValue BCSrc = Src.getOperand(0);
52967 if (BCSrc.getValueType() == MVT::x86mmx)
52968 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52969 }
52970
52971 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
52972 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
52973 return V;
52974
52975 return combineVectorTruncation(N, DAG, Subtarget);
52976}
52977
52978static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52979 TargetLowering::DAGCombinerInfo &DCI) {
52980 EVT VT = N->getValueType(0);
52981 SDValue In = N->getOperand(0);
52982 SDLoc DL(N);
52983
52984 if (SDValue SSatVal = detectSSatPattern(In, VT))
52985 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52986 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52987 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52988
52989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52990 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52991 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52992 return SDValue(N, 0);
52993
52994 return SDValue();
52995}
52996
52997/// Returns the negated value if the node \p N flips sign of FP value.
52998///
52999/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53000/// or FSUB(0, x)
53001/// AVX512F does not have FXOR, so FNEG is lowered as
53002/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53004/// In this case we go through all bitcasts.
53004/// This also recognizes splat of a negated value and returns the splat of that
53005/// value.
53006static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53007 if (N->getOpcode() == ISD::FNEG)
53008 return N->getOperand(0);
53009
53010 // Don't recurse exponentially.
53011 if (Depth > SelectionDAG::MaxRecursionDepth)
53012 return SDValue();
53013
53014 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53015
53016 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53017 EVT VT = Op->getValueType(0);
53018
53019 // Make sure the element size doesn't change.
53020 if (VT.getScalarSizeInBits() != ScalarSize)
53021 return SDValue();
53022
53023 unsigned Opc = Op.getOpcode();
53024 switch (Opc) {
53025 case ISD::VECTOR_SHUFFLE: {
53026 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53027 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53028 if (!Op.getOperand(1).isUndef())
53029 return SDValue();
53030 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53031 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53032 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53033 cast<ShuffleVectorSDNode>(Op)->getMask());
53034 break;
53035 }
53036 case ISD::INSERT_VECTOR_ELT: {
53037 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53038 // -V, INDEX).
53039 SDValue InsVector = Op.getOperand(0);
53040 SDValue InsVal = Op.getOperand(1);
53041 if (!InsVector.isUndef())
53042 return SDValue();
53043 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53044 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53045 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53046 NegInsVal, Op.getOperand(2));
53047 break;
53048 }
53049 case ISD::FSUB:
53050 case ISD::XOR:
53051 case X86ISD::FXOR: {
53052 SDValue Op1 = Op.getOperand(1);
53053 SDValue Op0 = Op.getOperand(0);
53054
53055 // For XOR and FXOR, we want to check if constant
53056 // bits of Op1 are sign bit masks. For FSUB, we
53057 // have to check if constant bits of Op0 are sign
53058 // bit masks and hence we swap the operands.
53059 if (Opc == ISD::FSUB)
53060 std::swap(Op0, Op1);
53061
53062 APInt UndefElts;
53063 SmallVector<APInt, 16> EltBits;
53064 // Extract constant bits and see if they are all
53065 // sign bit masks. Ignore the undef elements.
53066 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53067 /* AllowWholeUndefs */ true,
53068 /* AllowPartialUndefs */ false)) {
53069 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53070 if (!UndefElts[I] && !EltBits[I].isSignMask())
53071 return SDValue();
53072
53073 // Only allow bitcast from correctly-sized constant.
53074 Op0 = peekThroughBitcasts(Op0);
53075 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53076 return Op0;
53077 }
53078 break;
53079 } // case
53080 } // switch
53081
53082 return SDValue();
53083}
53084
53085static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53086 bool NegRes) {
53087 if (NegMul) {
53088 switch (Opcode) {
53089    default: llvm_unreachable("Unexpected opcode");
53090 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53091 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53092 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53093 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53094 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53095 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53096 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53097 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53098 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53099 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53100 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53101 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53102 }
53103 }
53104
53105 if (NegAcc) {
53106 switch (Opcode) {
53107    default: llvm_unreachable("Unexpected opcode");
53108 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53109 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53110 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53111 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53112 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53113 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53114 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53115 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53116 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53117 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53118 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53119 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53120 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53121 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53122 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53123 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53124 }
53125 }
53126
53127 if (NegRes) {
53128 switch (Opcode) {
53129    // For accuracy reasons, we never combine fneg and fma under strict FP.
53130    default: llvm_unreachable("Unexpected opcode");
53131 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53132 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53133 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53134 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53135 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53136 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53137 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53138 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53139 }
53140 }
53141
53142 return Opcode;
53143}
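// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The opcode table above encodes simple sign algebra over the x86 FMA
// family: FMADD = a*b+c, FMSUB = a*b-c, FNMADD = -(a*b)+c, FNMSUB = -(a*b)-c.
// For example, negating the multiplicand of FMADD yields FNMADD, and
// negating the whole FMADD result yields FNMSUB. The check below uses
// exactly representable values so FP rounding is not a concern.
#include <cassert>

static void fma_negation_example() {
  double A = 2.0, B = 3.0, C = 4.0;
  double FMADD = A * B + C, FNMADD = -(A * B) + C;
  double FMSUB = A * B - C, FNMSUB = -(A * B) - C;
  assert((-A) * B + C == FNMADD); // NegMul: FMADD -> FNMADD
  assert(-(FMADD) == FNMSUB);     // NegRes: FMADD -> FNMSUB
  assert(A * B + (-C) == FMSUB);  // NegAcc: FMADD -> FMSUB
}
// ---------------------------------------------------------------------------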
53144
53145/// Do target-specific dag combines on floating point negations.
53146static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53147 TargetLowering::DAGCombinerInfo &DCI,
53148 const X86Subtarget &Subtarget) {
53149 EVT OrigVT = N->getValueType(0);
53150 SDValue Arg = isFNEG(DAG, N);
53151 if (!Arg)
53152 return SDValue();
53153
53154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53155 EVT VT = Arg.getValueType();
53156 EVT SVT = VT.getScalarType();
53157 SDLoc DL(N);
53158
53159 // Let legalize expand this if it isn't a legal type yet.
53160 if (!TLI.isTypeLegal(VT))
53161 return SDValue();
53162
53163 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53164 // use of a constant by performing (-0 - A*B) instead.
53165 // FIXME: Check rounding control flags as well once it becomes available.
53166 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53167 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53168 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53169 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53170 Arg.getOperand(1), Zero);
53171 return DAG.getBitcast(OrigVT, NewNode);
53172 }
53173
53174 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53175 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53176 if (SDValue NegArg =
53177 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53178 return DAG.getBitcast(OrigVT, NegArg);
53179
53180 return SDValue();
53181}
53182
53183SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53184 bool LegalOperations,
53185 bool ForCodeSize,
53186 NegatibleCost &Cost,
53187 unsigned Depth) const {
53188 // fneg patterns are removable even if they have multiple uses.
53189 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53190 Cost = NegatibleCost::Cheaper;
53191 return DAG.getBitcast(Op.getValueType(), Arg);
53192 }
53193
53194 EVT VT = Op.getValueType();
53195 EVT SVT = VT.getScalarType();
53196 unsigned Opc = Op.getOpcode();
53197 SDNodeFlags Flags = Op.getNode()->getFlags();
53198 switch (Opc) {
53199 case ISD::FMA:
53200 case X86ISD::FMSUB:
53201 case X86ISD::FNMADD:
53202 case X86ISD::FNMSUB:
53203 case X86ISD::FMADD_RND:
53204 case X86ISD::FMSUB_RND:
53205 case X86ISD::FNMADD_RND:
53206 case X86ISD::FNMSUB_RND: {
53207 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53208 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53209 !isOperationLegal(ISD::FMA, VT))
53210 break;
53211
53212 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53213 // if it may have signed zeros.
53214 if (!Flags.hasNoSignedZeros())
53215 break;
53216
53217 // This is always negatible for free but we might be able to remove some
53218 // extra operand negations as well.
53219 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53220 for (int i = 0; i != 3; ++i)
53221 NewOps[i] = getCheaperNegatedExpression(
53222 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53223
53224 bool NegA = !!NewOps[0];
53225 bool NegB = !!NewOps[1];
53226 bool NegC = !!NewOps[2];
53227 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53228
53229 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53230 : NegatibleCost::Neutral;
53231
53232 // Fill in the non-negated ops with the original values.
53233 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53234 if (!NewOps[i])
53235 NewOps[i] = Op.getOperand(i);
53236 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53237 }
53238 case X86ISD::FRCP:
53239 if (SDValue NegOp0 =
53240 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53241 ForCodeSize, Cost, Depth + 1))
53242 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53243 break;
53244 }
53245
53246 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53247 ForCodeSize, Cost, Depth);
53248}
53249
53250static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53251 const X86Subtarget &Subtarget) {
53252 MVT VT = N->getSimpleValueType(0);
53253 // If we have integer vector types available, use the integer opcodes.
53254 if (!VT.isVector() || !Subtarget.hasSSE2())
53255 return SDValue();
53256
53257 SDLoc dl(N);
53258
53259 unsigned IntBits = VT.getScalarSizeInBits();
53260 MVT IntSVT = MVT::getIntegerVT(IntBits);
53261 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53262
53263 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53264 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53265 unsigned IntOpcode;
53266 switch (N->getOpcode()) {
53267  default: llvm_unreachable("Unexpected FP logic op");
53268 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53269 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53270 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53271 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53272 }
53273 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53274 return DAG.getBitcast(VT, IntOp);
53275}
53276
53277
53278/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
53279static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53280 if (N->getOpcode() != ISD::XOR)
53281 return SDValue();
53282
53283 SDValue LHS = N->getOperand(0);
53284 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53285 return SDValue();
53286
53287 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53288 X86::CondCode(LHS->getConstantOperandVal(0)));
53289 SDLoc DL(N);
53290 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53291}
53292
53293static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53294 const X86Subtarget &Subtarget) {
53295  assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53296         "Invalid opcode for combining with CTLZ");
53297 if (Subtarget.hasFastLZCNT())
53298 return SDValue();
53299
53300 EVT VT = N->getValueType(0);
53301 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53302 (VT != MVT::i64 || !Subtarget.is64Bit()))
53303 return SDValue();
53304
53305 SDValue N0 = N->getOperand(0);
53306 SDValue N1 = N->getOperand(1);
53307
53308 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53309 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53310 return SDValue();
53311
53312 SDValue OpCTLZ;
53313 SDValue OpSizeTM1;
53314
53315 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53316 OpCTLZ = N1;
53317 OpSizeTM1 = N0;
53318 } else if (N->getOpcode() == ISD::SUB) {
53319 return SDValue();
53320 } else {
53321 OpCTLZ = N0;
53322 OpSizeTM1 = N1;
53323 }
53324
53325 if (!OpCTLZ.hasOneUse())
53326 return SDValue();
53327 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53328 if (!C)
53329 return SDValue();
53330
53331 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53332 return SDValue();
53333 SDLoc DL(N);
53334 EVT OpVT = VT;
53335 SDValue Op = OpCTLZ.getOperand(0);
53336 if (VT == MVT::i8) {
53337 // Zero extend to i32 since there is not an i8 bsr.
53338 OpVT = MVT::i32;
53339 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53340 }
53341
53342 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53343 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53344 if (VT == MVT::i8)
53345 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53346
53347 return Op;
53348}
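// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The combine above relies on the identity BSR(x) == (BitWidth - 1) - CTLZ(x)
// for x != 0, and since CTLZ(x) <= BitWidth - 1 and BitWidth - 1 is all ones
// in its low bits, the subtraction can equally be written as an XOR with
// BitWidth - 1. The loop implementation below stands in for the CTLZ node.
#include <cassert>
#include <cstdint>

static unsigned ctlz32(std::uint32_t X) { // X != 0 assumed (ctlz_zero_undef)
  unsigned N = 0;
  while (!(X & 0x80000000u)) { X <<= 1; ++N; }
  return N;
}

static void ctlz_bsr_example() {
  for (std::uint32_t X : {1u, 2u, 0x40u, 0x12345678u, 0x80000000u}) {
    unsigned BSR = 31 - ctlz32(X);   // index of the highest set bit
    assert((31 ^ ctlz32(X)) == BSR); // the XOR form matched by the combine
  }
}
// ---------------------------------------------------------------------------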
53349
53350static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53351 TargetLowering::DAGCombinerInfo &DCI,
53352 const X86Subtarget &Subtarget) {
53353 SDValue N0 = N->getOperand(0);
53354 SDValue N1 = N->getOperand(1);
53355 EVT VT = N->getValueType(0);
53356
53357 // If this is SSE1 only convert to FXOR to avoid scalarization.
53358 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53359 return DAG.getBitcast(MVT::v4i32,
53360 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53361 DAG.getBitcast(MVT::v4f32, N0),
53362 DAG.getBitcast(MVT::v4f32, N1)));
53363 }
53364
53365 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53366 return Cmp;
53367
53368 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53369 return R;
53370
53371 if (SDValue R = combineBitOpWithShift(N, DAG))
53372 return R;
53373
53374 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53375 return FPLogic;
53376
53377 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53378 return R;
53379
53380 if (DCI.isBeforeLegalizeOps())
53381 return SDValue();
53382
53383 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53384 return SetCC;
53385
53386 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53387 return R;
53388
53389 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53390 return RV;
53391
53392 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53394 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53395 N0.getOperand(0).getValueType().isVector() &&
53396 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53397 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53398 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53399 N0.getOperand(0).getValueType()));
53400 }
53401
53402 // Handle AVX512 mask widening.
53403 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53404 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53405 VT.getVectorElementType() == MVT::i1 &&
53406 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53407 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53408 return DAG.getNode(
53409 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53410 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53411 N0.getOperand(2));
53412 }
53413
53414 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53415 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53416 // TODO: Under what circumstances could this be performed in DAGCombine?
53417 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53418 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53419 SDValue TruncExtSrc = N0.getOperand(0);
53420 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53421 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53422 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53423 SDLoc DL(N);
53424 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53425 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53426 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53427 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53428 }
53429 }
53430
53431 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53432 return R;
53433
53434 return combineFneg(N, DAG, DCI, Subtarget);
53435}
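// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The xor(zext(xor(x,c1)),c2) fold above is plain Boolean algebra: zext
// distributes over xor, so the two constants can be merged into one.
// Constants below are illustrative only.
#include <cassert>
#include <cstdint>

static void xor_zext_fold_example() {
  std::uint8_t X = 0x5A, C1 = 0x0F;
  std::uint32_t C2 = 0x12340088u;
  std::uint32_t Before = std::uint32_t(std::uint8_t(X ^ C1)) ^ C2;
  std::uint32_t After = std::uint32_t(X) ^ (std::uint32_t(C1) ^ C2);
  assert(Before == After);
}
// ---------------------------------------------------------------------------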
53436
53437static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53438 TargetLowering::DAGCombinerInfo &DCI,
53439 const X86Subtarget &Subtarget) {
53440 EVT VT = N->getValueType(0);
53441 unsigned NumBits = VT.getSizeInBits();
53442
53443 // TODO - Constant Folding.
53444
53445 // Simplify the inputs.
53446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53447 APInt DemandedMask(APInt::getAllOnes(NumBits));
53448 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53449 return SDValue(N, 0);
53450
53451 return SDValue();
53452}
53453
53454static bool isNullFPScalarOrVectorConst(SDValue V) {
53455 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53456}
53457
53458/// If a value is a scalar FP zero or a vector FP zero (potentially including
53459/// undefined elements), return a zero constant that may be used to fold away
53460/// that value. In the case of a vector, the returned constant will not contain
53461/// undefined elements even if the input parameter does. This makes it suitable
53462/// to be used as a replacement operand with operations (eg, bitwise-and) where
53463/// an undef should not propagate.
53464static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53465 const X86Subtarget &Subtarget) {
53466 if (!isNullFPScalarOrVectorConst(V))
53467 return SDValue();
53468
53469 if (V.getValueType().isVector())
53470 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53471
53472 return V;
53473}
53474
53475static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53476 const X86Subtarget &Subtarget) {
53477 SDValue N0 = N->getOperand(0);
53478 SDValue N1 = N->getOperand(1);
53479 EVT VT = N->getValueType(0);
53480 SDLoc DL(N);
53481
53482 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53483 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53484 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53485 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53486 return SDValue();
53487
53488 auto isAllOnesConstantFP = [](SDValue V) {
53489 if (V.getSimpleValueType().isVector())
53490 return ISD::isBuildVectorAllOnes(V.getNode());
53491 auto *C = dyn_cast<ConstantFPSDNode>(V);
53492 return C && C->getConstantFPValue()->isAllOnesValue();
53493 };
53494
53495 // fand (fxor X, -1), Y --> fandn X, Y
53496 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53497 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53498
53499 // fand X, (fxor Y, -1) --> fandn Y, X
53500 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53501 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53502
53503 return SDValue();
53504}
53505
53506/// Do target-specific dag combines on X86ISD::FAND nodes.
53507static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53508 const X86Subtarget &Subtarget) {
53509 // FAND(0.0, x) -> 0.0
53510 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53511 return V;
53512
53513 // FAND(x, 0.0) -> 0.0
53514 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53515 return V;
53516
53517 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53518 return V;
53519
53520 return lowerX86FPLogicOp(N, DAG, Subtarget);
53521}
53522
53523/// Do target-specific dag combines on X86ISD::FANDN nodes.
53524static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53525 const X86Subtarget &Subtarget) {
53526 // FANDN(0.0, x) -> x
53527 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53528 return N->getOperand(1);
53529
53530 // FANDN(x, 0.0) -> 0.0
53531 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53532 return V;
53533
53534 return lowerX86FPLogicOp(N, DAG, Subtarget);
53535}
53536
53537/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53538static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53539 TargetLowering::DAGCombinerInfo &DCI,
53540 const X86Subtarget &Subtarget) {
53541  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53542
53543 // F[X]OR(0.0, x) -> x
53544 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53545 return N->getOperand(1);
53546
53547 // F[X]OR(x, 0.0) -> x
53548 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53549 return N->getOperand(0);
53550
53551 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53552 return NewVal;
53553
53554 return lowerX86FPLogicOp(N, DAG, Subtarget);
53555}
53556
53557/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53558static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53559  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53560
53561 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53562 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53563 !DAG.getTarget().Options.NoSignedZerosFPMath)
53564 return SDValue();
53565
53566 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53567 // into FMINC and FMAXC, which are Commutative operations.
53568 unsigned NewOp = 0;
53569 switch (N->getOpcode()) {
53570  default: llvm_unreachable("unknown opcode");
53571 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53572 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53573 }
53574
53575 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53576 N->getOperand(0), N->getOperand(1));
53577}
53578
53579static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53580 const X86Subtarget &Subtarget) {
53581 EVT VT = N->getValueType(0);
53582 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53583 return SDValue();
53584
53585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53586
53587 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53588 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53589 (Subtarget.hasFP16() && VT == MVT::f16) ||
53590 (VT.isVector() && TLI.isTypeLegal(VT))))
53591 return SDValue();
53592
53593 SDValue Op0 = N->getOperand(0);
53594 SDValue Op1 = N->getOperand(1);
53595 SDLoc DL(N);
53596 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53597
53598 // If we don't have to respect NaN inputs, this is a direct translation to x86
53599 // min/max instructions.
53600 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53601 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53602
53603 // If one of the operands is known non-NaN use the native min/max instructions
53604 // with the non-NaN input as second operand.
53605 if (DAG.isKnownNeverNaN(Op1))
53606 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53607 if (DAG.isKnownNeverNaN(Op0))
53608 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53609
53610 // If we have to respect NaN inputs, this takes at least 3 instructions.
53611 // Favor a library call when operating on a scalar and minimizing code size.
53612 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53613 return SDValue();
53614
53615 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53616 VT);
53617
53618 // There are 4 possibilities involving NaN inputs, and these are the required
53619 // outputs:
53620 // Op1
53621 // Num NaN
53622 // ----------------
53623 // Num | Max | Op0 |
53624 // Op0 ----------------
53625 // NaN | Op1 | NaN |
53626 // ----------------
53627 //
53628 // The SSE FP max/min instructions were not designed for this case, but rather
53629 // to implement:
53630 // Min = Op1 < Op0 ? Op1 : Op0
53631 // Max = Op1 > Op0 ? Op1 : Op0
53632 //
53633 // So they always return Op0 if either input is a NaN. However, we can still
53634 // use those instructions for fmaxnum by selecting away a NaN input.
53635
53636 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53637 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53638 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53639
53640  // If Op0 is a NaN, select Op1. Otherwise, select the min/max result. If both
53641  // operands are NaN, the NaN value of Op1 is the result.
53642 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53643}
53644
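
A standalone model of the selection sequence above, assuming scalar doubles instead of DAG nodes: sseMax below is a simplified stand-in for MAXSS/MAXPS behavior (the second source operand is returned whenever the comparison is not true, including on NaN), and the final isnan select recovers fmaxnum semantics:

#include <cassert>
#include <cmath>
#include <limits>

// Simplified model of SSE max: NaN compares false, so the second source
// operand falls through whenever either input is a NaN.
static double sseMax(double a, double b) {
  return (a > b) ? a : b;
}

// The lowering above: max(Op1, Op0), then select Op1 if Op0 is NaN.
static double fmaxnumLowered(double op0, double op1) {
  double minOrMax = sseMax(op1, op0);
  return std::isnan(op0) ? op1 : minOrMax;
}

int main() {
  const double qnan = std::numeric_limits<double>::quiet_NaN();
  assert(fmaxnumLowered(1.0, 2.0) == 2.0);
  assert(fmaxnumLowered(qnan, 2.0) == 2.0);  // NaN in Op0 -> take Op1.
  assert(fmaxnumLowered(1.0, qnan) == 1.0);  // NaN in Op1 -> hw max returns Op0.
  assert(std::isnan(fmaxnumLowered(qnan, qnan)));
  return 0;
}
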
53645static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53646 TargetLowering::DAGCombinerInfo &DCI) {
53647 EVT VT = N->getValueType(0);
53648 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53649
53650 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53651 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53652 return SDValue(N, 0);
53653
53654 // Convert a full vector load into vzload when not all bits are needed.
53655 SDValue In = N->getOperand(0);
53656 MVT InVT = In.getSimpleValueType();
53657 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53658 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53659    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53660 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53661 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53662 MVT MemVT = MVT::getIntegerVT(NumBits);
53663 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53664 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53665 SDLoc dl(N);
53666 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53667 DAG.getBitcast(InVT, VZLoad));
53668 DCI.CombineTo(N, Convert);
53669 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53670 DCI.recursivelyDeleteUnusedNodes(LN);
53671 return SDValue(N, 0);
53672 }
53673 }
53674
53675 return SDValue();
53676}
53677
53678static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53679 TargetLowering::DAGCombinerInfo &DCI) {
53680 bool IsStrict = N->isTargetStrictFPOpcode();
53681 EVT VT = N->getValueType(0);
53682
53683 // Convert a full vector load into vzload when not all bits are needed.
53684 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53685 MVT InVT = In.getSimpleValueType();
53686 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53687 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53688    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53689 LoadSDNode *LN = cast<LoadSDNode>(In);
53690 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53691 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53692 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53693 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53694 SDLoc dl(N);
53695 if (IsStrict) {
53696 SDValue Convert =
53697 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53698 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53699 DCI.CombineTo(N, Convert, Convert.getValue(1));
53700 } else {
53701 SDValue Convert =
53702 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53703 DCI.CombineTo(N, Convert);
53704 }
53705 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53706 DCI.recursivelyDeleteUnusedNodes(LN);
53707 return SDValue(N, 0);
53708 }
53709 }
53710
53711 return SDValue();
53712}
53713
53714/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53715static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53716 TargetLowering::DAGCombinerInfo &DCI,
53717 const X86Subtarget &Subtarget) {
53718 SDValue N0 = N->getOperand(0);
53719 SDValue N1 = N->getOperand(1);
53720 MVT VT = N->getSimpleValueType(0);
53721 int NumElts = VT.getVectorNumElements();
53722 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53723
53724 // ANDNP(undef, x) -> 0
53725 // ANDNP(x, undef) -> 0
53726 if (N0.isUndef() || N1.isUndef())
53727 return DAG.getConstant(0, SDLoc(N), VT);
53728
53729 // ANDNP(0, x) -> x
53730 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53731 return N1;
53732
53733 // ANDNP(x, 0) -> 0
53734 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53735 return DAG.getConstant(0, SDLoc(N), VT);
53736
53737 // Turn ANDNP back to AND if input is inverted.
53738 if (SDValue Not = IsNOT(N0, DAG))
53739 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53740
53741 // Constant Folding
53742 APInt Undefs0, Undefs1;
53743 SmallVector<APInt> EltBits0, EltBits1;
53744 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
53745 SDLoc DL(N);
53746 APInt ResultUndefs = APInt::getZero(NumElts);
53747
53748 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
53749 SmallVector<APInt> ResultBits;
53750 for (int I = 0; I != NumElts; ++I)
53751 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53752 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
53753 }
53754
53755 // Constant fold NOT(N0) to allow us to use AND.
53756    // Ensure this is only performed if we can confirm that the bitcasted source
53757    // has one use to prevent an infinite loop with canonicalizeBitSelect.
53758 if (N0->hasOneUse()) {
53759 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53760 if (BC0.getOpcode() != ISD::BITCAST) {
53761 for (APInt &Elt : EltBits0)
53762 Elt = ~Elt;
53763 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
53764 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53765 }
53766 }
53767 }
53768
53769 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53770 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53771 SDValue Op(N, 0);
53772 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53773 return Res;
53774
53775 // If either operand is a constant mask, then only the elements that aren't
53776 // zero are actually demanded by the other operand.
53777 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53778 APInt UndefElts;
53779 SmallVector<APInt> EltBits;
53780 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53781 APInt DemandedElts = APInt::getAllOnes(NumElts);
53782 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53783 EltBits)) {
53784 DemandedBits.clearAllBits();
53785 DemandedElts.clearAllBits();
53786 for (int I = 0; I != NumElts; ++I) {
53787 if (UndefElts[I]) {
53788 // We can't assume an undef src element gives an undef dst - the
53789 // other src might be zero.
53790 DemandedBits.setAllBits();
53791 DemandedElts.setBit(I);
53792 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53793 (!Invert && !EltBits[I].isZero())) {
53794 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53795 DemandedElts.setBit(I);
53796 }
53797 }
53798 }
53799 return std::make_pair(DemandedBits, DemandedElts);
53800 };
53801 APInt Bits0, Elts0;
53802 APInt Bits1, Elts1;
53803 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53804 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53805
53806 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53807 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53808 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53809 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53810 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53811 if (N->getOpcode() != ISD::DELETED_NODE)
53812 DCI.AddToWorklist(N);
53813 return SDValue(N, 0);
53814 }
53815 }
53816
53817 return SDValue();
53818}
53819
53820static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53821 TargetLowering::DAGCombinerInfo &DCI) {
53822 SDValue N1 = N->getOperand(1);
53823
53824 // BT ignores high bits in the bit index operand.
53825 unsigned BitWidth = N1.getValueSizeInBits();
53826 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
53827 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53828 if (N->getOpcode() != ISD::DELETED_NODE)
53829 DCI.AddToWorklist(N);
53830 return SDValue(N, 0);
53831 }
53832
53833 return SDValue();
53834}
53835
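
A quick standalone check of the property the BT combine relies on: for a register operand, only the low log2(width) bits of the bit index affect the result, so the high bits of the index can be simplified away (assumption in the sketch: a 32-bit operand):

#include <cassert>
#include <cstdint>

// For a 32-bit operand, the bit-test result only depends on idx % 32,
// which is why the combine may clear the high bits of the index.
static bool bitTest32(uint32_t value, uint32_t idx) {
  return (value >> (idx & 31u)) & 1u;
}

int main() {
  uint32_t v = 0x00010004u;                          // bits 2 and 16 set
  assert(bitTest32(v, 2));
  assert(bitTest32(v, 2 + 32) == bitTest32(v, 2));   // high index bits ignored
  assert(bitTest32(v, 16 + 64) == bitTest32(v, 16));
  assert(!bitTest32(v, 3));
  return 0;
}
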
53836static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53837 TargetLowering::DAGCombinerInfo &DCI) {
53838 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53839 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53840
53841 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53843 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53844 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53845 if (N->getOpcode() != ISD::DELETED_NODE)
53846 DCI.AddToWorklist(N);
53847 return SDValue(N, 0);
53848 }
53849
53850 // Convert a full vector load into vzload when not all bits are needed.
53851 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53852 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53853 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53854 SDLoc dl(N);
53855 if (IsStrict) {
53856 SDValue Convert = DAG.getNode(
53857 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53858 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53859 DCI.CombineTo(N, Convert, Convert.getValue(1));
53860 } else {
53861 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53862 DAG.getBitcast(MVT::v8i16, VZLoad));
53863 DCI.CombineTo(N, Convert);
53864 }
53865
53866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53867 DCI.recursivelyDeleteUnusedNodes(LN);
53868 return SDValue(N, 0);
53869 }
53870 }
53871 }
53872
53873 return SDValue();
53874}
53875
53876// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53877static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53878  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53879
53880 EVT DstVT = N->getValueType(0);
53881
53882 SDValue N0 = N->getOperand(0);
53883 SDValue N1 = N->getOperand(1);
53884 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53885
53886 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53887 return SDValue();
53888
53889 // Look through single use any_extends / truncs.
53890 SDValue IntermediateBitwidthOp;
53891 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53892 N0.hasOneUse()) {
53893 IntermediateBitwidthOp = N0;
53894 N0 = N0.getOperand(0);
53895 }
53896
53897 // See if we have a single use cmov.
53898 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53899 return SDValue();
53900
53901 SDValue CMovOp0 = N0.getOperand(0);
53902 SDValue CMovOp1 = N0.getOperand(1);
53903
53904 // Make sure both operands are constants.
53905 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53906 !isa<ConstantSDNode>(CMovOp1.getNode()))
53907 return SDValue();
53908
53909 SDLoc DL(N);
53910
53911  // If we looked through an any_extend/trunc above, apply the same op to the constants.
53912 if (IntermediateBitwidthOp) {
53913 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53914 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53915 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53916 }
53917
53918 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53919 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53920
53921 EVT CMovVT = DstVT;
53922 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53923 if (DstVT == MVT::i16) {
53924 CMovVT = MVT::i32;
53925 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53926 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53927 }
53928
53929 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53930 N0.getOperand(2), N0.getOperand(3));
53931
53932 if (CMovVT != DstVT)
53933 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53934
53935 return CMov;
53936}
53937
53938static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53939 const X86Subtarget &Subtarget) {
53940  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53941
53942 if (SDValue V = combineSextInRegCmov(N, DAG))
53943 return V;
53944
53945 EVT VT = N->getValueType(0);
53946 SDValue N0 = N->getOperand(0);
53947 SDValue N1 = N->getOperand(1);
53948 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53949 SDLoc dl(N);
53950
53951  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
53952  // SSE and AVX2 since there is no sign-extended shift right
53953  // operation on a vector with 64-bit elements.
53954 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53955 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
53956 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53957 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53958 SDValue N00 = N0.getOperand(0);
53959
53960    // EXTLOAD has a better solution on AVX2:
53961    // it may be replaced with an X86ISD::VSEXT node.
53962 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53963 if (!ISD::isNormalLoad(N00.getNode()))
53964 return SDValue();
53965
53966    // Attempt to promote any comparison mask ops before the
53967    // SIGN_EXTEND_INREG gets in the way.
53968 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
53969 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53970
53971 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53972 SDValue Tmp =
53973 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53974 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53975 }
53976 }
53977 return SDValue();
53978}
53979
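
For reference, sign_extend_inreg on a scalar is just a shift-left/arithmetic-shift-right pair over the container width; the v4i64 special case above exists because SSE/AVX2 have no 64-bit arithmetic vector shift right. A standalone scalar sketch (assumes a 32-bit container and an arithmetic right shift, which is guaranteed from C++20):

#include <cassert>
#include <cstdint>

// sign_extend_inreg: treat the low k bits of x as a signed k-bit value
// and sign-extend it back into the full 32-bit container.
static int32_t signExtendInReg(int32_t x, unsigned k) {
  const unsigned shift = 32 - k;
  return (int32_t)((uint32_t)x << shift) >> shift;
}

int main() {
  assert(signExtendInReg(0x000000FF, 8) == -1);       // i8 0xFF -> -1
  assert(signExtendInReg(0x0000007F, 8) == 127);
  assert(signExtendInReg(0x00008001, 16) == -32767);  // i16 0x8001
  return 0;
}
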
53980/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53981/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53982/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53983/// opportunities to combine math ops, use an LEA, or use a complex addressing
53984/// mode. This can eliminate extend, add, and shift instructions.
53985static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53986 const X86Subtarget &Subtarget) {
53987 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53988 Ext->getOpcode() != ISD::ZERO_EXTEND)
53989 return SDValue();
53990
53991 // TODO: This should be valid for other integer types.
53992 EVT VT = Ext->getValueType(0);
53993 if (VT != MVT::i64)
53994 return SDValue();
53995
53996 SDValue Add = Ext->getOperand(0);
53997 if (Add.getOpcode() != ISD::ADD)
53998 return SDValue();
53999
54000 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
54001 bool NSW = Add->getFlags().hasNoSignedWrap();
54002 bool NUW = Add->getFlags().hasNoUnsignedWrap();
54003
54004 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
54005 // into the 'zext'
54006 if ((Sext && !NSW) || (!Sext && !NUW))
54007 return SDValue();
54008
54009 // Having a constant operand to the 'add' ensures that we are not increasing
54010 // the instruction count because the constant is extended for free below.
54011 // A constant operand can also become the displacement field of an LEA.
54012 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
54013 if (!AddOp1)
54014 return SDValue();
54015
54016 // Don't make the 'add' bigger if there's no hope of combining it with some
54017 // other 'add' or 'shl' instruction.
54018 // TODO: It may be profitable to generate simpler LEA instructions in place
54019 // of single 'add' instructions, but the cost model for selecting an LEA
54020 // currently has a high threshold.
54021 bool HasLEAPotential = false;
54022 for (auto *User : Ext->uses()) {
54023 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
54024 HasLEAPotential = true;
54025 break;
54026 }
54027 }
54028 if (!HasLEAPotential)
54029 return SDValue();
54030
54031 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
54032 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
54033 SDValue AddOp0 = Add.getOperand(0);
54034 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
54035 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
54036
54037 // The wider add is guaranteed to not wrap because both operands are
54038 // sign-extended.
54039 SDNodeFlags Flags;
54040 Flags.setNoSignedWrap(NSW);
54041 Flags.setNoUnsignedWrap(NUW);
54042 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
54043}
54044
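
A standalone arithmetic check of the identity used above: with no wrap on the narrow add, the extension distributes over it, and a wrapping counterexample shows why the nsw/nuw flags are required (values are arbitrary; two's-complement wrap assumed in the cast):

#include <cassert>
#include <cstdint>

int main() {
  // With no signed wrap, the sign extension distributes over the add.
  int32_t x = 100000;
  int32_t c = 42;                                    // constant operand of the add
  int64_t narrowThenExt = (int64_t)(x + c);          // sext(add_nsw(x, C))
  int64_t extThenWide   = (int64_t)x + (int64_t)c;   // add(sext(x), C_sext)
  assert(narrowThenExt == extThenWide);

  // Counterexample when the narrow add wraps (no nsw): the fold is invalid.
  int32_t big = INT32_MAX;
  int64_t wrapped = (int64_t)(int32_t)((uint32_t)big + 1u);  // wraps to INT32_MIN
  int64_t wide    = (int64_t)big + 1;
  assert(wrapped != wide);
  return 0;
}
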
54045// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54046// operands and the result of CMOV is not used anywhere else - promote CMOV
54047// itself instead of promoting its result. This could be beneficial, because:
54048// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54049// (or more) pseudo-CMOVs only when they go one-after-another and
54050// getting rid of result extension code after CMOV will help that.
54051// 2) Promotion of constant CMOV arguments is free, hence the
54052// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54053// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
54054// promotion is also good in terms of code-size.
54055// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
54056// promotion).
54057static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54058 SDValue CMovN = Extend->getOperand(0);
54059 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54060 return SDValue();
54061
54062 EVT TargetVT = Extend->getValueType(0);
54063 unsigned ExtendOpcode = Extend->getOpcode();
54064 SDLoc DL(Extend);
54065
54066 EVT VT = CMovN.getValueType();
54067 SDValue CMovOp0 = CMovN.getOperand(0);
54068 SDValue CMovOp1 = CMovN.getOperand(1);
54069
54070 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54071 !isa<ConstantSDNode>(CMovOp1.getNode()))
54072 return SDValue();
54073
54074 // Only extend to i32 or i64.
54075 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54076 return SDValue();
54077
54078  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54079 // are free.
54080 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54081 return SDValue();
54082
54083  // If this is a zero extend to i64, we should only extend to i32 and use a free
54084 // zero extend to finish.
54085 EVT ExtendVT = TargetVT;
54086 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54087 ExtendVT = MVT::i32;
54088
54089 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54090 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54091
54092 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54093 CMovN.getOperand(2), CMovN.getOperand(3));
54094
54095 // Finish extending if needed.
54096 if (ExtendVT != TargetVT)
54097 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54098
54099 return Res;
54100}
54101
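
The promotion above is legal because an extension distributes over a select of constants, and extending the constants costs nothing at compile time. A trivial standalone check with made-up values:

#include <cassert>
#include <cstdint>

static int64_t extAfterCmov(bool cond, int16_t c0, int16_t c1) {
  return (int64_t)(cond ? c0 : c1);             // sext(cmov(c0, c1))
}

static int64_t cmovOfExtended(bool cond, int16_t c0, int16_t c1) {
  return cond ? (int64_t)c0 : (int64_t)c1;      // cmov(sext(c0), sext(c1))
}

int main() {
  for (bool cond : {false, true})
    assert(extAfterCmov(cond, -7, 12345) == cmovOfExtended(cond, -7, 12345));
  return 0;
}
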
54102// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54103// result type.
54104static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54105 const X86Subtarget &Subtarget) {
54106 SDValue N0 = N->getOperand(0);
54107 EVT VT = N->getValueType(0);
54108 SDLoc dl(N);
54109
54110 // Only do this combine with AVX512 for vector extends.
54111 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54112 return SDValue();
54113
54114 // Only combine legal element types.
54115 EVT SVT = VT.getVectorElementType();
54116 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54117 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54118 return SDValue();
54119
54120  // We don't have a CMPP instruction for vXf16.
54121 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54122 return SDValue();
54123  // We can only do this if the vector size is 256 bits or less.
54124 unsigned Size = VT.getSizeInBits();
54125 if (Size > 256 && Subtarget.useAVX512Regs())
54126 return SDValue();
54127
54128 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
54129  // those are the only integer compares we have.
54130 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54131 if (ISD::isUnsignedIntSetCC(CC))
54132 return SDValue();
54133
54134 // Only do this combine if the extension will be fully consumed by the setcc.
54135 EVT N00VT = N0.getOperand(0).getValueType();
54136 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54137 if (Size != MatchingVecType.getSizeInBits())
54138 return SDValue();
54139
54140 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54141
54142 if (N->getOpcode() == ISD::ZERO_EXTEND)
54143 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54144
54145 return Res;
54146}
54147
54148static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54149 TargetLowering::DAGCombinerInfo &DCI,
54150 const X86Subtarget &Subtarget) {
54151 SDValue N0 = N->getOperand(0);
54152 EVT VT = N->getValueType(0);
54153 SDLoc DL(N);
54154
54155 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54156 if (!DCI.isBeforeLegalizeOps() &&
54157 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54158 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54159 N0->getOperand(1));
54160 bool ReplaceOtherUses = !N0.hasOneUse();
54161 DCI.CombineTo(N, Setcc);
54162 // Replace other uses with a truncate of the widened setcc_carry.
54163 if (ReplaceOtherUses) {
54164 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54165 N0.getValueType(), Setcc);
54166 DCI.CombineTo(N0.getNode(), Trunc);
54167 }
54168
54169 return SDValue(N, 0);
54170 }
54171
54172 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54173 return NewCMov;
54174
54175 if (!DCI.isBeforeLegalizeOps())
54176 return SDValue();
54177
54178 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54179 return V;
54180
54181 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54182 DAG, DCI, Subtarget))
54183 return V;
54184
54185 if (VT.isVector()) {
54186 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54187 return R;
54188
54189 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54190 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54191 }
54192
54193 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54194 return NewAdd;
54195
54196 return SDValue();
54197}
54198
54199static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54200 TargetLowering::DAGCombinerInfo &DCI,
54201 const X86Subtarget &Subtarget) {
54202 SDLoc dl(N);
54203 EVT VT = N->getValueType(0);
54204 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54205
54206 // Let legalize expand this if it isn't a legal type yet.
54207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54208 if (!TLI.isTypeLegal(VT))
54209 return SDValue();
54210
54211 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54212 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54213 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54214
54215 // If the operation allows fast-math and the target does not support FMA,
54216 // split this into mul+add to avoid libcall(s).
54217 SDNodeFlags Flags = N->getFlags();
54218 if (!IsStrict && Flags.hasAllowReassociation() &&
54219 TLI.isOperationExpand(ISD::FMA, VT)) {
54220 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54221 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54222 }
54223
54224 EVT ScalarVT = VT.getScalarType();
54225 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54226 !Subtarget.hasAnyFMA()) &&
54227 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54228 return SDValue();
54229
54230 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54231 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54232 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54233 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54234 CodeSize)) {
54235 V = NegV;
54236 return true;
54237 }
54238 // Look through extract_vector_elts. If it comes from an FNEG, create a
54239 // new extract from the FNEG input.
54240 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54241 isNullConstant(V.getOperand(1))) {
54242 SDValue Vec = V.getOperand(0);
54243 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54244 Vec, DAG, LegalOperations, CodeSize)) {
54245 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54246 NegV, V.getOperand(1));
54247 return true;
54248 }
54249 }
54250
54251 return false;
54252 };
54253
54254 // Do not convert the passthru input of scalar intrinsics.
54255 // FIXME: We could allow negations of the lower element only.
54256 bool NegA = invertIfNegative(A);
54257 bool NegB = invertIfNegative(B);
54258 bool NegC = invertIfNegative(C);
54259
54260 if (!NegA && !NegB && !NegC)
54261 return SDValue();
54262
54263 unsigned NewOpcode =
54264 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54265
54266 // Propagate fast-math-flags to new FMA node.
54267 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54268 if (IsStrict) {
54269    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54270 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54271 {N->getOperand(0), A, B, C});
54272 } else {
54273 if (N->getNumOperands() == 4)
54274 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54275 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54276 }
54277}
54278
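
A standalone check of the negation folding above: because an FMA rounds only once, negating an input is the same as switching to the corresponding negated opcode, e.g. FMA(FNEG(a), b, c) behaves like an FNMADD. The values below are chosen so every intermediate is exactly representable:

#include <cassert>
#include <cmath>

int main() {
  // Exactly representable values: 1.25 * -3.5 = -4.375 with no rounding.
  double a = 1.25, b = -3.5, c = 10.0;

  // FMA(FNEG(a), b, c) corresponds to an FNMADD-style node: -(a*b) + c.
  assert(std::fma(-a, b, c) == -(a * b) + c);

  // FMA(a, b, FNEG(c)) corresponds to an FMSUB-style node: a*b - c.
  assert(std::fma(a, b, -c) == a * b - c);
  return 0;
}
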
54279// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54280// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54281static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54282 TargetLowering::DAGCombinerInfo &DCI) {
54283 SDLoc dl(N);
54284 EVT VT = N->getValueType(0);
54285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54286 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54287 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54288
54289 SDValue N2 = N->getOperand(2);
54290
54291 SDValue NegN2 =
54292 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54293 if (!NegN2)
54294 return SDValue();
54295 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54296
54297 if (N->getNumOperands() == 4)
54298 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54299 NegN2, N->getOperand(3));
54300 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54301 NegN2);
54302}
54303
54304static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54305 TargetLowering::DAGCombinerInfo &DCI,
54306 const X86Subtarget &Subtarget) {
54307 SDLoc dl(N);
54308 SDValue N0 = N->getOperand(0);
54309 EVT VT = N->getValueType(0);
54310
54311 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54312 // FIXME: Is this needed? We don't seem to have any tests for it.
54313 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54314 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54315 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54316 N0->getOperand(1));
54317 bool ReplaceOtherUses = !N0.hasOneUse();
54318 DCI.CombineTo(N, Setcc);
54319 // Replace other uses with a truncate of the widened setcc_carry.
54320 if (ReplaceOtherUses) {
54321 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54322 N0.getValueType(), Setcc);
54323 DCI.CombineTo(N0.getNode(), Trunc);
54324 }
54325
54326 return SDValue(N, 0);
54327 }
54328
54329 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54330 return NewCMov;
54331
54332 if (DCI.isBeforeLegalizeOps())
54333 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54334 return V;
54335
54336 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54337 DAG, DCI, Subtarget))
54338 return V;
54339
54340 if (VT.isVector())
54341 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54342 return R;
54343
54344 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54345 return NewAdd;
54346
54347 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54348 return R;
54349
54350 // TODO: Combine with any target/faux shuffle.
54351 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54352 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54353 SDValue N00 = N0.getOperand(0);
54354 SDValue N01 = N0.getOperand(1);
54355 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54356 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54357 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54358 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54359 return concatSubVectors(N00, N01, DAG, dl);
54360 }
54361 }
54362
54363 return SDValue();
54364}
54365
54366/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
54367/// pre-promote its result type since vXi1 vectors don't get promoted
54368/// during type legalization.
54369static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54370 SDValue RHS, ISD::CondCode CC,
54371 const SDLoc &DL, SelectionDAG &DAG,
54372 const X86Subtarget &Subtarget) {
54373 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54374 VT.getVectorElementType() == MVT::i1 &&
54375 (OpVT.getVectorElementType() == MVT::i8 ||
54376 OpVT.getVectorElementType() == MVT::i16)) {
54377 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54378 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54379 }
54380 return SDValue();
54381}
54382
54383static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54384 TargetLowering::DAGCombinerInfo &DCI,
54385 const X86Subtarget &Subtarget) {
54386 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54387 const SDValue LHS = N->getOperand(0);
54388 const SDValue RHS = N->getOperand(1);
54389 EVT VT = N->getValueType(0);
54390 EVT OpVT = LHS.getValueType();
54391 SDLoc DL(N);
54392
54393 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54394 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54395 Subtarget))
54396 return V;
54397
54398 if (VT == MVT::i1) {
54399 X86::CondCode X86CC;
54400 if (SDValue V =
54401 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54402 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54403 }
54404
54405 if (OpVT.isScalarInteger()) {
54406 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54407 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54408 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54409 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54410 if (N0.getOperand(0) == N1)
54411 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54412 N0.getOperand(1));
54413 if (N0.getOperand(1) == N1)
54414 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54415 N0.getOperand(0));
54416 }
54417 return SDValue();
54418 };
54419 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54420 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54421 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54422 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54423
54424 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54425 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54426 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54427 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54428 if (N0.getOperand(0) == N1)
54429 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54430 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54431 if (N0.getOperand(1) == N1)
54432 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54433 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54434 }
54435 return SDValue();
54436 };
54437 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54438 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54439 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54440 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54441
54442 // cmpeq(trunc(x),0) --> cmpeq(x,0)
54443 // cmpne(trunc(x),0) --> cmpne(x,0)
54444 // iff x upper bits are zero.
54445 // TODO: Add support for RHS to be truncate as well?
54446 if (LHS.getOpcode() == ISD::TRUNCATE &&
54447 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54448 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
54449 EVT SrcVT = LHS.getOperand(0).getValueType();
54450 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54451 OpVT.getScalarSizeInBits());
54452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54453 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54454 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54455 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54456 DAG.getConstant(0, DL, SrcVT), CC);
54457 }
54458
54459      // With C as a power of 2 and C != 0 and C != INT_MIN:
54460      //    icmp eq Abs(X), C ->
54461      //        (icmp eq X, C) | (icmp eq X, -C)
54462      //    icmp ne Abs(X), C ->
54463      //        (icmp ne X, C) & (icmp ne X, -C)
54464 // Both of these patterns can be better optimized in
54465 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54466 // integers which is checked above.
54467 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54468 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54469 const APInt &CInt = C->getAPIntValue();
54470 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54471 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54472 SDValue BaseOp = LHS.getOperand(0);
54473 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54474 SDValue SETCC1 = DAG.getSetCC(
54475 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54476 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54477 SETCC0, SETCC1);
54478 }
54479 }
54480 }
54481 }
54482 }
54483
54484 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54485 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54486 // Using temporaries to avoid messing up operand ordering for later
54487 // transformations if this doesn't work.
54488 SDValue Op0 = LHS;
54489 SDValue Op1 = RHS;
54490 ISD::CondCode TmpCC = CC;
54491 // Put build_vector on the right.
54492 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54493 std::swap(Op0, Op1);
54494 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54495 }
54496
54497 bool IsSEXT0 =
54498 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54499 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54500 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54501
54502 if (IsSEXT0 && IsVZero1) {
54503      assert(VT == Op0.getOperand(0).getValueType() &&
54504             "Unexpected operand type");
54505 if (TmpCC == ISD::SETGT)
54506 return DAG.getConstant(0, DL, VT);
54507 if (TmpCC == ISD::SETLE)
54508 return DAG.getConstant(1, DL, VT);
54509 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54510 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54511
54512      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54513             "Unexpected condition code!");
54514 return Op0.getOperand(0);
54515 }
54516 }
54517
54518  // Try to make an unsigned vector comparison signed. On pre-AVX512 targets the
54519  // only ordered integer comparison available is the signed `PCMPGT`, and on
54520  // AVX512 it is often better to use `PCMPGT` if the result is meant to stay in
54521  // a vector (and if it is going to a mask, there are signed AVX512 comparisons).
54522 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54523 bool CanMakeSigned = false;
54524 if (ISD::isUnsignedIntSetCC(CC)) {
54525 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54526 DAG.computeKnownBits(RHS));
54527 // If we know LHS/RHS share the same sign bit at each element we can
54528 // make this signed.
54529 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54530 // across all lanes. So a pattern where the sign varies from lane to
54531 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54532 // missed. We could get around this by demanding each lane
54533 // independently, but this isn't the most important optimization and
54534 // that may eat into compile time.
54535 CanMakeSigned =
54536 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54537 }
54538 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54539 SDValue LHSOut = LHS;
54540 SDValue RHSOut = RHS;
54541 ISD::CondCode NewCC = CC;
54542 switch (CC) {
54543 case ISD::SETGE:
54544 case ISD::SETUGE:
54545 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54546 /*NSW*/ true))
54547 LHSOut = NewLHS;
54548 else if (SDValue NewRHS = incDecVectorConstant(
54549 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54550 RHSOut = NewRHS;
54551 else
54552 break;
54553
54554 [[fallthrough]];
54555 case ISD::SETUGT:
54556 NewCC = ISD::SETGT;
54557 break;
54558
54559 case ISD::SETLE:
54560 case ISD::SETULE:
54561 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54562 /*NSW*/ true))
54563 LHSOut = NewLHS;
54564 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54565 /*NSW*/ true))
54566 RHSOut = NewRHS;
54567 else
54568 break;
54569
54570 [[fallthrough]];
54571 case ISD::SETULT:
54572 // Will be swapped to SETGT in LowerVSETCC*.
54573 NewCC = ISD::SETLT;
54574 break;
54575 default:
54576 break;
54577 }
54578 if (NewCC != CC) {
54579 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54580 NewCC, DL, DAG, Subtarget))
54581 return R;
54582 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54583 }
54584 }
54585 }
54586
54587 if (SDValue R =
54588 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54589 return R;
54590
54591 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54592 // to avoid scalarization via legalization because v4i32 is not a legal type.
54593 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54594 LHS.getValueType() == MVT::v4f32)
54595 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54596
54597 // X pred 0.0 --> X pred -X
54598 // If the negation of X already exists, use it in the comparison. This removes
54599 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54600 // instructions in patterns with a 'select' node.
54601 if (isNullFPScalarOrVectorConst(RHS)) {
54602 SDVTList FNegVT = DAG.getVTList(OpVT);
54603 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54604 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54605 }
54606
54607 return SDValue();
54608}
54609
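
The scalar-integer rewrites above, cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0) and cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0), are plain bit identities; a brute-force standalone check over small values:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 32; ++x) {
    for (uint32_t y = 0; y < 32; ++y) {
      // cmpeq(or(X,Y),X) <=> cmpeq(and(~X,Y),0)
      assert(((x | y) == x) == ((~x & y) == 0));
      // cmpeq(and(X,Y),Y) <=> cmpeq(and(~X,Y),0)
      assert(((x & y) == y) == ((~x & y) == 0));
    }
  }
  return 0;
}
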
54610static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54611 TargetLowering::DAGCombinerInfo &DCI,
54612 const X86Subtarget &Subtarget) {
54613 SDValue Src = N->getOperand(0);
54614 MVT SrcVT = Src.getSimpleValueType();
54615 MVT VT = N->getSimpleValueType(0);
54616 unsigned NumBits = VT.getScalarSizeInBits();
54617 unsigned NumElts = SrcVT.getVectorNumElements();
54618 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54619  assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54620
54621 // Perform constant folding.
54622 APInt UndefElts;
54623 SmallVector<APInt, 32> EltBits;
54624 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54625 APInt Imm(32, 0);
54626 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54627 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54628 Imm.setBit(Idx);
54629
54630 return DAG.getConstant(Imm, SDLoc(N), VT);
54631 }
54632
54633 // Look through int->fp bitcasts that don't change the element width.
54634 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54635 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54636 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54637 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54638
54639 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54640 // with scalar comparisons.
54641 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54642 SDLoc DL(N);
54643 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54644 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54645 return DAG.getNode(ISD::XOR, DL, VT,
54646 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54647 DAG.getConstant(NotMask, DL, VT));
54648 }
54649
54650 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54651 // results with scalar comparisons.
54652 if (Src.getOpcode() == X86ISD::PCMPGT &&
54653 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54654 SDLoc DL(N);
54655 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54656 return DAG.getNode(ISD::XOR, DL, VT,
54657 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54658 DAG.getConstant(NotMask, DL, VT));
54659 }
54660
54661 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54662 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54663 // iff pow2splat(c1).
54664 // Use KnownBits to determine if only a single bit is non-zero
54665 // in each element (pow2 or zero), and shift that bit to the msb.
54666 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54667 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54668 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54669 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54670 if (KnownLHS.countMaxPopulation() == 1 &&
54671 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54672 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54673 SDLoc DL(N);
54674 MVT ShiftVT = SrcVT;
54675 SDValue ShiftLHS = Src.getOperand(0);
54676 SDValue ShiftRHS = Src.getOperand(1);
54677 if (ShiftVT.getScalarType() == MVT::i8) {
54678 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54679 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54680 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54681 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54682 }
54683 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54684 ShiftLHS, ShiftAmt, DAG);
54685 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54686 ShiftRHS, ShiftAmt, DAG);
54687 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54688 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54689 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54690 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54691 }
54692 }
54693
54694 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54695 if (N->isOnlyUserOf(Src.getNode())) {
54696 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54697 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54698 APInt UndefElts;
54699 SmallVector<APInt, 32> EltBits;
54700 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54701 UndefElts, EltBits)) {
54702 APInt Mask = APInt::getZero(NumBits);
54703 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54704 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54705 Mask.setBit(Idx);
54706 }
54707 SDLoc DL(N);
54708 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54709 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54710 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54711 DAG.getConstant(Mask, DL, VT));
54712 }
54713 }
54714 }
54715
54716 // Simplify the inputs.
54717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54718 APInt DemandedMask(APInt::getAllOnes(NumBits));
54719 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54720 return SDValue(N, 0);
54721
54722 return SDValue();
54723}
54724
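
A scalar model of the movmsk(not(x)) fold above: collecting sign bits of an inverted vector equals inverting the collected mask within the low NumElts bits. movemask8 below is a made-up 8-lane model, not an intrinsic:

#include <array>
#include <cassert>
#include <cstdint>

// Made-up model of an 8-lane byte MOVMSK: one bit per element sign bit.
static uint32_t movemask8(const std::array<int8_t, 8> &v) {
  uint32_t mask = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (v[i] < 0)
      mask |= 1u << i;
  return mask;
}

int main() {
  std::array<int8_t, 8> src = {-1, 3, -7, 0, 127, -128, 5, -2};
  std::array<int8_t, 8> inverted{};
  for (unsigned i = 0; i != 8; ++i)
    inverted[i] = (int8_t)~src[i];           // bitwise NOT flips each sign bit

  // movmsk(not(x)) == xor(movmsk(x), low-bits mask of NumElts bits)
  uint32_t lowMask = (1u << 8) - 1;
  assert(movemask8(inverted) == (movemask8(src) ^ lowMask));
  return 0;
}
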
54725static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54726 TargetLowering::DAGCombinerInfo &DCI,
54727 const X86Subtarget &Subtarget) {
54728 MVT VT = N->getSimpleValueType(0);
54729 unsigned NumBits = VT.getScalarSizeInBits();
54730
54731 // Simplify the inputs.
54732 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54733 APInt DemandedMask(APInt::getAllOnes(NumBits));
54734 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54735 return SDValue(N, 0);
54736
54737 return SDValue();
54738}
54739
54740static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54741 TargetLowering::DAGCombinerInfo &DCI,
54742 const X86Subtarget &Subtarget) {
54743 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54744 SDValue BasePtr = MemOp->getBasePtr();
54745 SDValue Index = MemOp->getIndex();
54746 SDValue Scale = MemOp->getScale();
54747 SDValue Mask = MemOp->getMask();
54748
54749 // Attempt to fold an index scale into the scale value directly.
54750 // For smaller indices, implicit sext is performed BEFORE scale, preventing
54751 // this fold under most circumstances.
54752 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
54753 if ((Index.getOpcode() == X86ISD::VSHLI ||
54754 (Index.getOpcode() == ISD::ADD &&
54755 Index.getOperand(0) == Index.getOperand(1))) &&
54756 isa<ConstantSDNode>(Scale) &&
54757 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
54758 unsigned ShiftAmt =
54759 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
54760 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54761 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
54762 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
54763 SDValue NewIndex = Index.getOperand(0);
54764 SDValue NewScale =
54765 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
54766 if (N->getOpcode() == X86ISD::MGATHER)
54767 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
54768 MemOp->getOperand(1), Mask,
54769 MemOp->getBasePtr(), NewIndex, NewScale,
54770 MemOp->getChain(), Subtarget);
54771 if (N->getOpcode() == X86ISD::MSCATTER)
54772 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
54773 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
54774 NewIndex, NewScale, MemOp->getChain(), Subtarget);
54775 }
54776 }
54777
54778 // With vector masks we only demand the upper bit of the mask.
54779 if (Mask.getScalarValueSizeInBits() != 1) {
54780 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54781 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54782 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54783 if (N->getOpcode() != ISD::DELETED_NODE)
54784 DCI.AddToWorklist(N);
54785 return SDValue(N, 0);
54786 }
54787 }
54788
54789 return SDValue();
54790}
54791
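
The scale fold above uses the address identity base + (index << s) * scale == base + index * (scale << s), where the Index+Index form is just the s == 1 case; it is only kept when the new scale is still a power of two no larger than 8. A standalone arithmetic check with arbitrary values:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t base = 0x1000, index = 37, scale = 2, shift = 1;
  uint64_t oldAddr = base + (index << shift) * scale;
  uint64_t newScale = scale << shift;             // 4, still a legal x86 scale
  uint64_t newAddr = base + index * newScale;
  assert(oldAddr == newAddr);
  assert(newScale <= 8);                          // precondition checked above
  return 0;
}
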
54792static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54793 SDValue Index, SDValue Base, SDValue Scale,
54794 SelectionDAG &DAG) {
54795 SDLoc DL(GorS);
54796
54797 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54798 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54799 Gather->getMask(), Base, Index, Scale } ;
54800 return DAG.getMaskedGather(Gather->getVTList(),
54801 Gather->getMemoryVT(), DL, Ops,
54802 Gather->getMemOperand(),
54803 Gather->getIndexType(),
54804 Gather->getExtensionType());
54805 }
54806 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54807 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54808 Scatter->getMask(), Base, Index, Scale };
54809 return DAG.getMaskedScatter(Scatter->getVTList(),
54810 Scatter->getMemoryVT(), DL,
54811 Ops, Scatter->getMemOperand(),
54812 Scatter->getIndexType(),
54813 Scatter->isTruncatingStore());
54814}
54815
54816static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54817 TargetLowering::DAGCombinerInfo &DCI) {
54818 SDLoc DL(N);
54819 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54820 SDValue Index = GorS->getIndex();
54821 SDValue Base = GorS->getBasePtr();
54822 SDValue Scale = GorS->getScale();
54823
54824 if (DCI.isBeforeLegalize()) {
54825 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54826
54827 // Shrink constant indices if they are larger than 32-bits.
54828 // Only do this before legalize types since v2i64 could become v2i32.
54829 // FIXME: We could check that the type is legal if we're after legalize
54830 // types, but then we would need to construct test cases where that happens.
54831    // FIXME: We could support more than just constant vectors, but we need to be
54832    // careful with costing. A truncate that can be optimized out would be fine.
54833 // Otherwise we might only want to create a truncate if it avoids a split.
54834 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54835 if (BV->isConstant() && IndexWidth > 32 &&
54836 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54837 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54838 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54839 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54840 }
54841 }
54842
54843 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54844 // there are sufficient sign bits. Only do this before legalize types to
54845 // avoid creating illegal types in truncate.
54846 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54847 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54848 IndexWidth > 32 &&
54849 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54850 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54851 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54852 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54853 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54854 }
54855 }
54856
54857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54858 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54859 // Try to move splat constant adders from the index operand to the base
54860 // pointer operand. Taking care to multiply by the scale. We can only do
54861 // this when index element type is the same as the pointer type.
54862 // Otherwise we need to be sure the math doesn't wrap before the scale.
54863 if (Index.getOpcode() == ISD::ADD &&
54864 Index.getValueType().getVectorElementType() == PtrVT &&
54865 isa<ConstantSDNode>(Scale)) {
54866 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54867 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54868 BitVector UndefElts;
54869 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54870 // FIXME: Allow non-constant?
54871 if (UndefElts.none()) {
54872 // Apply the scale.
54873 APInt Adder = C->getAPIntValue() * ScaleAmt;
54874 // Add it to the existing base.
54875 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54876 DAG.getConstant(Adder, DL, PtrVT));
54877 Index = Index.getOperand(0);
54878 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54879 }
54880 }
54881
54882 // It's also possible base is just a constant. In that case, just
54883 // replace it with 0 and move the displacement into the index.
54884 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54885 isOneConstant(Scale)) {
54886 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54887 // Combine the constant build_vector and the constant base.
54888 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54889 Index.getOperand(1), Splat);
54890 // Add to the LHS of the original Index add.
54891 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54892 Index.getOperand(0), Splat);
54893 Base = DAG.getConstant(0, DL, Base.getValueType());
54894 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54895 }
54896 }
54897 }
54898
54899 if (DCI.isBeforeLegalizeOps()) {
54900 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54901
54902 // Make sure the index is either i32 or i64
54903 if (IndexWidth != 32 && IndexWidth != 64) {
54904 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54905 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54906 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54907 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54908 }
54909 }
54910
54911 // With vector masks we only demand the upper bit of the mask.
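  // (Only the sign bit of each mask element is tested by the underlying
  // vector-masked gather/scatter instructions, so the remaining bits of each
  // element can be simplified away.)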
54912 SDValue Mask = GorS->getMask();
54913 if (Mask.getScalarValueSizeInBits() != 1) {
54914 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54915 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54916 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54917 if (N->getOpcode() != ISD::DELETED_NODE)
54918 DCI.AddToWorklist(N);
54919 return SDValue(N, 0);
54920 }
54921 }
54922
54923 return SDValue();
54924}
54925
54926// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54927static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54928 const X86Subtarget &Subtarget) {
54929 SDLoc DL(N);
54930 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54931 SDValue EFLAGS = N->getOperand(1);
54932
54933 // Try to simplify the EFLAGS and condition code operands.
54934 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54935 return getSETCC(CC, Flags, DL, DAG);
54936
54937 return SDValue();
54938}
54939
54940/// Optimize branch condition evaluation.
54941static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54942 const X86Subtarget &Subtarget) {
54943 SDLoc DL(N);
54944 SDValue EFLAGS = N->getOperand(3);
54945 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54946
54947 // Try to simplify the EFLAGS and condition code operands.
54948 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54949 // RAUW them under us.
54950 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54951 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54952 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54953 N->getOperand(1), Cond, Flags);
54954 }
54955
54956 return SDValue();
54957}
54958
54959// TODO: Could we move this to DAGCombine?
54960static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54961 SelectionDAG &DAG) {
54962 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54963  // to optimize away the operation when it's from a constant.
54964 //
54965 // The general transformation is:
54966 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54967 // AND(VECTOR_CMP(x,y), constant2)
54968 // constant2 = UNARYOP(constant)
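  //
  // This is safe because each lane of VECTOR_CMP is either all-zeros or
  // all-ones, so each lane of the AND is either 0 or the constant. Applying
  // the unary op to the constant up front and bitcasting around an integer
  // AND yields the same two possible lane values, e.g. for sint_to_fp:
  //   sint_to_fp(and(vector_cmp(x,y), <42,42,42,42>))
  //     --> bitcast(and(vector_cmp(x,y), bitcast(<42.0,42.0,42.0,42.0>)))
  // where each lane ends up as either 0.0 (all-zero bits) or 42.0.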
54969
54970 // Early exit if this isn't a vector operation, the operand of the
54971 // unary operation isn't a bitwise AND, or if the sizes of the operations
54972 // aren't the same.
54973 EVT VT = N->getValueType(0);
54974 bool IsStrict = N->isStrictFPOpcode();
54975 unsigned NumEltBits = VT.getScalarSizeInBits();
54976 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54977 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54978 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54979 VT.getSizeInBits() != Op0.getValueSizeInBits())
54980 return SDValue();
54981
54982 // Now check that the other operand of the AND is a constant. We could
54983 // make the transformation for non-constant splats as well, but it's unclear
54984 // that would be a benefit as it would not eliminate any operations, just
54985 // perform one more step in scalar code before moving to the vector unit.
54986 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54987 // Bail out if the vector isn't a constant.
54988 if (!BV->isConstant())
54989 return SDValue();
54990
54991 // Everything checks out. Build up the new and improved node.
54992 SDLoc DL(N);
54993 EVT IntVT = BV->getValueType(0);
54994 // Create a new constant of the appropriate type for the transformed
54995 // DAG.
54996 SDValue SourceConst;
54997 if (IsStrict)
54998 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54999 {N->getOperand(0), SDValue(BV, 0)});
55000 else
55001 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
55002 // The AND node needs bitcasts to/from an integer vector type around it.
55003 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
55004 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
55005 MaskConst);
55006 SDValue Res = DAG.getBitcast(VT, NewAnd);
55007 if (IsStrict)
55008 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
55009 return Res;
55010 }
55011
55012 return SDValue();
55013}
55014
55015/// If we are converting a value to floating-point, try to replace scalar
55016/// truncate of an extracted vector element with a bitcast. This tries to keep
55017/// the sequence on XMM registers rather than moving between vector and GPRs.
55018static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
55019 // TODO: This is currently only used by combineSIntToFP, but it is generalized
55020 // to allow being called by any similar cast opcode.
55021 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
55022 SDValue Trunc = N->getOperand(0);
55023 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
55024 return SDValue();
55025
55026 SDValue ExtElt = Trunc.getOperand(0);
55027 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55028 !isNullConstant(ExtElt.getOperand(1)))
55029 return SDValue();
55030
55031 EVT TruncVT = Trunc.getValueType();
55032 EVT SrcVT = ExtElt.getValueType();
55033 unsigned DestWidth = TruncVT.getSizeInBits();
55034 unsigned SrcWidth = SrcVT.getSizeInBits();
55035 if (SrcWidth % DestWidth != 0)
55036 return SDValue();
55037
55038 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
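  // e.g. (f64 sint_to_fp (i32 trunc (i64 extractelt (v2i64 X), 0)))
  //        --> (f64 sint_to_fp (i32 extractelt (v4i32 bitcast X), 0))
  // This is equivalent on little-endian x86 because element 0 of the v4i32
  // bitcast holds exactly the low 32 bits of element 0 of X.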
55039 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
55040 unsigned VecWidth = SrcVecVT.getSizeInBits();
55041 unsigned NumElts = VecWidth / DestWidth;
55042 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
55043 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
55044 SDLoc DL(N);
55045 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55046 BitcastVec, ExtElt.getOperand(1));
55047 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55048}
55049
55050static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55051 const X86Subtarget &Subtarget) {
55052 bool IsStrict = N->isStrictFPOpcode();
55053 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55054 EVT VT = N->getValueType(0);
55055 EVT InVT = Op0.getValueType();
55056
55057 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55058 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55059 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55060 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55061 unsigned ScalarSize = InVT.getScalarSizeInBits();
55062 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55063 return SDValue();
55064 SDLoc dl(N);
55065 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55066 ScalarSize < 16 ? MVT::i16
55067 : ScalarSize < 32 ? MVT::i32
55068 : MVT::i64,
55069 InVT.getVectorNumElements());
55070 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55071 if (IsStrict)
55072 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55073 {N->getOperand(0), P});
55074 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55075 }
55076
55077 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55078 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55079 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55080 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55081 VT.getScalarType() != MVT::f16) {
55082 SDLoc dl(N);
55083 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55084 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55085
55086 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55087 if (IsStrict)
55088 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55089 {N->getOperand(0), P});
55090 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55091 }
55092
55093 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
55094 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
55095 // the optimization here.
55096 if (DAG.SignBitIsZero(Op0)) {
55097 if (IsStrict)
55098 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55099 {N->getOperand(0), Op0});
55100 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55101 }
55102
55103 return SDValue();
55104}
55105
55106static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55107 TargetLowering::DAGCombinerInfo &DCI,
55108 const X86Subtarget &Subtarget) {
55109 // First try to optimize away the conversion entirely when it's
55110 // conditionally from a constant. Vectors only.
55111 bool IsStrict = N->isStrictFPOpcode();
55112 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55113 return Res;
55114
55115 // Now move on to more general possibilities.
55116 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55117 EVT VT = N->getValueType(0);
55118 EVT InVT = Op0.getValueType();
55119
55120 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55121 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55122 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55123 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55124 unsigned ScalarSize = InVT.getScalarSizeInBits();
55125 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55126 return SDValue();
55127 SDLoc dl(N);
55128 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55129 ScalarSize < 16 ? MVT::i16
55130 : ScalarSize < 32 ? MVT::i32
55131 : MVT::i64,
55132 InVT.getVectorNumElements());
55133 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55134 if (IsStrict)
55135 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55136 {N->getOperand(0), P});
55137 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55138 }
55139
55140 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55141 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55142 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55143 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55144 VT.getScalarType() != MVT::f16) {
55145 SDLoc dl(N);
55146 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55147 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55148 if (IsStrict)
55149 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55150 {N->getOperand(0), P});
55151 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55152 }
55153
55154 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55155 // vectors and scalars, see if we know that the upper bits are all the sign
55156 // bit, in which case we can truncate the input to i32 and convert from that.
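  // (For i64, "at least 33 sign bits" means the value lies in [-2^31, 2^31),
  // so converting the truncated low 32 bits gives the same FP result.)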
55157 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55158 unsigned BitWidth = InVT.getScalarSizeInBits();
55159 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55160 if (NumSignBits >= (BitWidth - 31)) {
55161 EVT TruncVT = MVT::i32;
55162 if (InVT.isVector())
55163 TruncVT = InVT.changeVectorElementType(TruncVT);
55164 SDLoc dl(N);
55165 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55166 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55167 if (IsStrict)
55168 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55169 {N->getOperand(0), Trunc});
55170 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55171 }
55172 // If we're after legalize and the type is v2i32 we need to shuffle and
55173 // use CVTSI2P.
55174      assert(InVT == MVT::v2i64 && "Unexpected VT!");
55175 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55176 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55177 { 0, 2, -1, -1 });
55178 if (IsStrict)
55179 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55180 {N->getOperand(0), Shuf});
55181 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55182 }
55183 }
55184
55185 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55186 // a 32-bit target where SSE doesn't support i64->FP operations.
55187 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55188 Op0.getOpcode() == ISD::LOAD) {
55189 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55190
55191 // This transformation is not supported if the result type is f16 or f128.
55192 if (VT == MVT::f16 || VT == MVT::f128)
55193 return SDValue();
55194
55195 // If we have AVX512DQ we can use packed conversion instructions unless
55196 // the VT is f80.
55197 if (Subtarget.hasDQI() && VT != MVT::f80)
55198 return SDValue();
55199
55200 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55201 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55202 std::pair<SDValue, SDValue> Tmp =
55203 Subtarget.getTargetLowering()->BuildFILD(
55204 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55205 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55206 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55207 return Tmp.first;
55208 }
55209 }
55210
55211 if (IsStrict)
55212 return SDValue();
55213
55214 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55215 return V;
55216
55217 return SDValue();
55218}
55219
55220static bool needCarryOrOverflowFlag(SDValue Flags) {
55221  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55222
55223 for (const SDNode *User : Flags->uses()) {
55224 X86::CondCode CC;
55225 switch (User->getOpcode()) {
55226 default:
55227 // Be conservative.
55228 return true;
55229 case X86ISD::SETCC:
55230 case X86ISD::SETCC_CARRY:
55231 CC = (X86::CondCode)User->getConstantOperandVal(0);
55232 break;
55233 case X86ISD::BRCOND:
55234 case X86ISD::CMOV:
55235 CC = (X86::CondCode)User->getConstantOperandVal(2);
55236 break;
55237 }
55238
55239 switch (CC) {
55240 default: break;
55241 case X86::COND_A: case X86::COND_AE:
55242 case X86::COND_B: case X86::COND_BE:
55243 case X86::COND_O: case X86::COND_NO:
55244 case X86::COND_G: case X86::COND_GE:
55245 case X86::COND_L: case X86::COND_LE:
55246 return true;
55247 }
55248 }
55249
55250 return false;
55251}
55252
55253static bool onlyZeroFlagUsed(SDValue Flags) {
55254  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55255
55256 for (const SDNode *User : Flags->uses()) {
55257 unsigned CCOpNo;
55258 switch (User->getOpcode()) {
55259 default:
55260 // Be conservative.
55261 return false;
55262 case X86ISD::SETCC:
55263 case X86ISD::SETCC_CARRY:
55264 CCOpNo = 0;
55265 break;
55266 case X86ISD::BRCOND:
55267 case X86ISD::CMOV:
55268 CCOpNo = 2;
55269 break;
55270 }
55271
55272 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55273 if (CC != X86::COND_E && CC != X86::COND_NE)
55274 return false;
55275 }
55276
55277 return true;
55278}
55279
55280static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55281 // Only handle test patterns.
55282 if (!isNullConstant(N->getOperand(1)))
55283 return SDValue();
55284
55285 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55286 // and use its flags directly.
55287 // TODO: Maybe we should try promoting compares that only use the zero flag
55288 // first if we can prove the upper bits with computeKnownBits?
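  // For example, (X86ISD::CMP (i32 trunc (i64 add X, Y)), 0) can become a
  // 32-bit X86ISD::ADD of the truncated operands whose EFLAGS result is used
  // directly, as long as no user needs the carry or overflow flags.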
55289 SDLoc dl(N);
55290 SDValue Op = N->getOperand(0);
55291 EVT VT = Op.getValueType();
55292
55293 // If we have a constant logical shift that's only used in a comparison
55294  // against zero, turn it into an equivalent AND. This allows turning it into
55295 // a TEST instruction later.
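  // e.g. for i32: (srl X, 8) == 0 iff (X & 0xFFFFFF00) == 0, and
  //               (shl X, 8) == 0 iff (X & 0x00FFFFFF) == 0.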
55296 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55297 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55298 onlyZeroFlagUsed(SDValue(N, 0))) {
55299 unsigned BitWidth = VT.getSizeInBits();
55300 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55301 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55302 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55303 APInt Mask = Op.getOpcode() == ISD::SRL
55304 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55305 : APInt::getLowBitsSet(BitWidth, MaskBits);
55306 if (Mask.isSignedIntN(32)) {
55307 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55308 DAG.getConstant(Mask, dl, VT));
55309 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55310 DAG.getConstant(0, dl, VT));
55311 }
55312 }
55313 }
55314
55315 // Peek through any zero-extend if we're only testing for a zero result.
55316 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55317 SDValue Src = Op.getOperand(0);
55318 EVT SrcVT = Src.getValueType();
55319 if (SrcVT.getScalarSizeInBits() >= 8 &&
55320 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55321 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55322 DAG.getConstant(0, dl, SrcVT));
55323 }
55324
55325 // Look for a truncate.
55326 if (Op.getOpcode() != ISD::TRUNCATE)
55327 return SDValue();
55328
55329 SDValue Trunc = Op;
55330 Op = Op.getOperand(0);
55331
55332 // See if we can compare with zero against the truncation source,
55333 // which should help using the Z flag from many ops. Only do this for
55334 // i32 truncated op to prevent partial-reg compares of promoted ops.
55335 EVT OpVT = Op.getValueType();
55336 APInt UpperBits =
55337 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55338 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55339 onlyZeroFlagUsed(SDValue(N, 0))) {
55340 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55341 DAG.getConstant(0, dl, OpVT));
55342 }
55343
55344 // After this the truncate and arithmetic op must have a single use.
55345 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55346 return SDValue();
55347
55348 unsigned NewOpc;
55349 switch (Op.getOpcode()) {
55350 default: return SDValue();
55351 case ISD::AND:
55352    // Skip AND with a constant. We have special handling for AND with an
55353    // immediate during isel to generate TEST instructions.
55354 if (isa<ConstantSDNode>(Op.getOperand(1)))
55355 return SDValue();
55356 NewOpc = X86ISD::AND;
55357 break;
55358 case ISD::OR: NewOpc = X86ISD::OR; break;
55359 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55360 case ISD::ADD:
55361 // If the carry or overflow flag is used, we can't truncate.
55362 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55363 return SDValue();
55364 NewOpc = X86ISD::ADD;
55365 break;
55366 case ISD::SUB:
55367 // If the carry or overflow flag is used, we can't truncate.
55368 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55369 return SDValue();
55370 NewOpc = X86ISD::SUB;
55371 break;
55372 }
55373
55374 // We found an op we can narrow. Truncate its inputs.
55375 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55376 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55377
55378  // Use an X86-specific opcode to avoid DAG combine messing with it.
55379 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55380 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55381
55382 // For AND, keep a CMP so that we can match the test pattern.
55383 if (NewOpc == X86ISD::AND)
55384 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55385 DAG.getConstant(0, dl, VT));
55386
55387 // Return the flags.
55388 return Op.getValue(1);
55389}
55390
55391static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55392 TargetLowering::DAGCombinerInfo &DCI) {
55393  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55394         "Expected X86ISD::ADD or X86ISD::SUB");
55395
55396 SDLoc DL(N);
55397 SDValue LHS = N->getOperand(0);
55398 SDValue RHS = N->getOperand(1);
55399 MVT VT = LHS.getSimpleValueType();
55400 bool IsSub = X86ISD::SUB == N->getOpcode();
55401 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55402
55403 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55404 if (!N->hasAnyUseOfValue(1)) {
55405 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55406 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55407 }
55408
55409 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55410 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55411 SDValue Ops[] = {N0, N1};
55412 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55413 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55414 SDValue Op(N, 0);
55415 if (Negate)
55416 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55417 DCI.CombineTo(GenericAddSub, Op);
55418 }
55419 };
55420 MatchGeneric(LHS, RHS, false);
55421 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55422
55423 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55424 // EFLAGS result doesn't change.
55425 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55426 /*ZeroSecondOpOnly*/ true);
55427}
55428
55429static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55430 SDValue LHS = N->getOperand(0);
55431 SDValue RHS = N->getOperand(1);
55432 SDValue BorrowIn = N->getOperand(2);
55433
55434 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55435 MVT VT = N->getSimpleValueType(0);
55436 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55437 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55438 }
55439
55440 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55441 // iff the flag result is dead.
55442 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55443 !N->hasAnyUseOfValue(1))
55444 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55445 LHS.getOperand(1), BorrowIn);
55446
55447 return SDValue();
55448}
55449
55450// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55451static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55452 TargetLowering::DAGCombinerInfo &DCI) {
55453 SDValue LHS = N->getOperand(0);
55454 SDValue RHS = N->getOperand(1);
55455 SDValue CarryIn = N->getOperand(2);
55456 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55457 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55458
55459 // Canonicalize constant to RHS.
55460 if (LHSC && !RHSC)
55461 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55462 CarryIn);
55463
55464 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55465 // the result is either zero or one (depending on the input carry bit).
55466 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55467 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55468 // We don't have a good way to replace an EFLAGS use, so only do this when
55469 // dead right now.
55470 SDValue(N, 1).use_empty()) {
55471 SDLoc DL(N);
55472 EVT VT = N->getValueType(0);
55473 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55474 SDValue Res1 = DAG.getNode(
55475 ISD::AND, DL, VT,
55476 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55477 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55478 DAG.getConstant(1, DL, VT));
55479 return DCI.CombineTo(N, Res1, CarryOut);
55480 }
55481
55482 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55483 // iff the flag result is dead.
55484 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55485 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55486 SDLoc DL(N);
55487 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55488 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55489 DAG.getConstant(0, DL, LHS.getValueType()),
55490 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55491 }
55492
55493 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55494 MVT VT = N->getSimpleValueType(0);
55495 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55496 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55497 }
55498
55499 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55500 // iff the flag result is dead.
55501 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55502 !N->hasAnyUseOfValue(1))
55503 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55504 LHS.getOperand(1), CarryIn);
55505
55506 return SDValue();
55507}
55508
55509static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55510 const SDLoc &DL, EVT VT,
55511 const X86Subtarget &Subtarget) {
55512 // Example of pattern we try to detect:
55513 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55514 //(add (build_vector (extract_elt t, 0),
55515 // (extract_elt t, 2),
55516 // (extract_elt t, 4),
55517 // (extract_elt t, 6)),
55518 // (build_vector (extract_elt t, 1),
55519 // (extract_elt t, 3),
55520 // (extract_elt t, 5),
55521 // (extract_elt t, 7)))
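  // For the example above this folds to (v4i32 X86ISD::VPMADDWD x0, x1)
  // (wider results are split as needed), relying on PMADDWD's implicit
  // pairwise add of adjacent 16-bit products.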
55522
55523 if (!Subtarget.hasSSE2())
55524 return SDValue();
55525
55526 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55527 Op1.getOpcode() != ISD::BUILD_VECTOR)
55528 return SDValue();
55529
55530 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55531 VT.getVectorNumElements() < 4 ||
55532 !isPowerOf2_32(VT.getVectorNumElements()))
55533 return SDValue();
55534
55535 // Check if one of Op0,Op1 is of the form:
55536 // (build_vector (extract_elt Mul, 0),
55537 // (extract_elt Mul, 2),
55538 // (extract_elt Mul, 4),
55539 // ...
55540 // the other is of the form:
55541 // (build_vector (extract_elt Mul, 1),
55542 // (extract_elt Mul, 3),
55543 // (extract_elt Mul, 5),
55544 // ...
55545 // and identify Mul.
55546 SDValue Mul;
55547 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55548 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55549 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55550 // TODO: Be more tolerant to undefs.
55551 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55552 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55553 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55554 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55555 return SDValue();
55556 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55557 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55558 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55559 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55560 if (!Const0L || !Const1L || !Const0H || !Const1H)
55561 return SDValue();
55562 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55563 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55564 // Commutativity of mul allows factors of a product to reorder.
55565 if (Idx0L > Idx1L)
55566 std::swap(Idx0L, Idx1L);
55567 if (Idx0H > Idx1H)
55568 std::swap(Idx0H, Idx1H);
55569 // Commutativity of add allows pairs of factors to reorder.
55570 if (Idx0L > Idx0H) {
55571 std::swap(Idx0L, Idx0H);
55572 std::swap(Idx1L, Idx1H);
55573 }
55574 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55575 Idx1H != 2 * i + 3)
55576 return SDValue();
55577 if (!Mul) {
55578 // First time an extract_elt's source vector is visited. Must be a MUL
55579 // with 2X number of vector elements than the BUILD_VECTOR.
55580 // Both extracts must be from same MUL.
55581 Mul = Op0L->getOperand(0);
55582 if (Mul->getOpcode() != ISD::MUL ||
55583 Mul.getValueType().getVectorNumElements() != 2 * e)
55584 return SDValue();
55585 }
55586 // Check that the extract is from the same MUL previously seen.
55587 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55588 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55589 return SDValue();
55590 }
55591
55592 // Check if the Mul source can be safely shrunk.
55593 ShrinkMode Mode;
55594 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55595 Mode == ShrinkMode::MULU16)
55596 return SDValue();
55597
55598 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55599 VT.getVectorNumElements() * 2);
55600 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55601 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55602
55603 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55604 ArrayRef<SDValue> Ops) {
55605 EVT InVT = Ops[0].getValueType();
55606    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55607 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55608 InVT.getVectorNumElements() / 2);
55609 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55610 };
55611 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55612}
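
// Illustrative sketch (not used by the backend; the helper name is
// hypothetical): a scalar model of the per-lane computation performed by the
// X86ISD::VPMADDWD node built above, assuming the <cstdint> fixed-width types
// are available in this translation unit.
static inline int32_t pmaddwdLaneModel(int16_t A0, int16_t A1, int16_t B0,
                                       int16_t B1) {
  // Each 32-bit result lane is A[2*i]*B[2*i] + A[2*i+1]*B[2*i+1]; compute in
  // 64 bits to avoid signed overflow in the single case where the sum exceeds
  // INT32_MAX (all four inputs equal to -32768).
  return static_cast<int32_t>(int64_t(A0) * B0 + int64_t(A1) * B1);
}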
55613
55614// Attempt to turn this pattern into PMADDWD.
55615// (add (mul (sext (build_vector)), (sext (build_vector))),
55616// (mul (sext (build_vector)), (sext (build_vector)))
55617static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55618 const SDLoc &DL, EVT VT,
55619 const X86Subtarget &Subtarget) {
55620 if (!Subtarget.hasSSE2())
55621 return SDValue();
55622
55623 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55624 return SDValue();
55625
55626 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55627 VT.getVectorNumElements() < 4 ||
55628 !isPowerOf2_32(VT.getVectorNumElements()))
55629 return SDValue();
55630
55631 SDValue N00 = N0.getOperand(0);
55632 SDValue N01 = N0.getOperand(1);
55633 SDValue N10 = N1.getOperand(0);
55634 SDValue N11 = N1.getOperand(1);
55635
55636 // All inputs need to be sign extends.
55637 // TODO: Support ZERO_EXTEND from known positive?
55638 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55639 N01.getOpcode() != ISD::SIGN_EXTEND ||
55640 N10.getOpcode() != ISD::SIGN_EXTEND ||
55641 N11.getOpcode() != ISD::SIGN_EXTEND)
55642 return SDValue();
55643
55644 // Peek through the extends.
55645 N00 = N00.getOperand(0);
55646 N01 = N01.getOperand(0);
55647 N10 = N10.getOperand(0);
55648 N11 = N11.getOperand(0);
55649
55650 // Must be extending from vXi16.
55651 EVT InVT = N00.getValueType();
55652 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55653 N10.getValueType() != InVT || N11.getValueType() != InVT)
55654 return SDValue();
55655
55656 // All inputs should be build_vectors.
55657 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55658 N01.getOpcode() != ISD::BUILD_VECTOR ||
55659 N10.getOpcode() != ISD::BUILD_VECTOR ||
55660 N11.getOpcode() != ISD::BUILD_VECTOR)
55661 return SDValue();
55662
55663 // For each element, we need to ensure we have an odd element from one vector
55664 // multiplied by the odd element of another vector and the even element from
55665 // one of the same vectors being multiplied by the even element from the
55666 // other vector. So we need to make sure for each element i, this operator
55667 // is being performed:
55668 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55669 SDValue In0, In1;
55670 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55671 SDValue N00Elt = N00.getOperand(i);
55672 SDValue N01Elt = N01.getOperand(i);
55673 SDValue N10Elt = N10.getOperand(i);
55674 SDValue N11Elt = N11.getOperand(i);
55675 // TODO: Be more tolerant to undefs.
55676 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55677 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55678 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55679 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55680 return SDValue();
55681 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55682 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55683 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55684 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55685 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55686 return SDValue();
55687 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55688 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55689 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55690 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55691 // Add is commutative so indices can be reordered.
55692 if (IdxN00 > IdxN10) {
55693 std::swap(IdxN00, IdxN10);
55694 std::swap(IdxN01, IdxN11);
55695 }
55696    // N0 indices must be the even element. N1 indices must be the next odd element.
55697 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55698 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55699 return SDValue();
55700 SDValue N00In = N00Elt.getOperand(0);
55701 SDValue N01In = N01Elt.getOperand(0);
55702 SDValue N10In = N10Elt.getOperand(0);
55703 SDValue N11In = N11Elt.getOperand(0);
55704
55705 // First time we find an input capture it.
55706 if (!In0) {
55707 In0 = N00In;
55708 In1 = N01In;
55709
55710 // The input vectors must be at least as wide as the output.
55711 // If they are larger than the output, we extract subvector below.
55712 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55713 In1.getValueSizeInBits() < VT.getSizeInBits())
55714 return SDValue();
55715 }
55716 // Mul is commutative so the input vectors can be in any order.
55717 // Canonicalize to make the compares easier.
55718 if (In0 != N00In)
55719 std::swap(N00In, N01In);
55720 if (In0 != N10In)
55721 std::swap(N10In, N11In);
55722 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55723 return SDValue();
55724 }
55725
55726 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55727 ArrayRef<SDValue> Ops) {
55728 EVT OpVT = Ops[0].getValueType();
55729    assert(OpVT.getScalarType() == MVT::i16 &&
55730           "Unexpected scalar element type");
55731    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55732 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55733 OpVT.getVectorNumElements() / 2);
55734 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55735 };
55736
55737 // If the output is narrower than an input, extract the low part of the input
55738 // vector.
55739 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55740 VT.getVectorNumElements() * 2);
55741 if (OutVT16.bitsLT(In0.getValueType())) {
55742 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55743 DAG.getIntPtrConstant(0, DL));
55744 }
55745 if (OutVT16.bitsLT(In1.getValueType())) {
55746 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55747 DAG.getIntPtrConstant(0, DL));
55748 }
55749 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55750 PMADDBuilder);
55751}
55752
55753// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55754// If the upper element in each pair of both VPMADDWD operands is zero then we
55755// can merge the operand elements and use the implicit add of VPMADDWD.
55756// TODO: Add support for VPMADDUBSW (which isn't commutable).
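// For example (v4i32 result, v8i16 operands), if every odd i16 element of the
// inputs is known to be zero:
//   add(vpmaddwd(<a0,0,a1,0,...>, <b0,0,b1,0,...>),
//       vpmaddwd(<c0,0,c1,0,...>, <d0,0,d1,0,...>))
//     --> vpmaddwd(<a0,c0,a1,c1,...>, <b0,d0,b1,d1,...>)
// since each result lane becomes a*b + c*d, the same as the sum of the two
// original lanes (a*b + 0) + (c*d + 0).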
55757static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55758 const SDLoc &DL, EVT VT) {
55759 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55760 return SDValue();
55761
55762 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55763 if (VT.getSizeInBits() > 128)
55764 return SDValue();
55765
55766 unsigned NumElts = VT.getVectorNumElements();
55767 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55768 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55769 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55770
55771 bool Op0HiZero =
55772 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55773 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55774 bool Op1HiZero =
55775 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55776 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55777
55778 // TODO: Check for zero lower elements once we have actual codegen that
55779 // creates them.
55780 if (!Op0HiZero || !Op1HiZero)
55781 return SDValue();
55782
55783 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55784 SmallVector<int> Mask;
55785 for (int i = 0; i != (int)NumElts; ++i) {
55786 Mask.push_back(2 * i);
55787 Mask.push_back(2 * (i + NumElts));
55788 }
55789
55790 SDValue LHS =
55791 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55792 SDValue RHS =
55793 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55794 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55795}
55796
55797/// CMOV of constants requires materializing constant operands in registers.
55798/// Try to fold those constants into an 'add' instruction to reduce instruction
55799/// count. We do this with CMOV rather than the generic 'select' because there are
55800/// earlier folds that may be used to turn select-of-constants into logic hacks.
55801static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
55802 const X86Subtarget &Subtarget) {
55803 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55804 // better because we eliminate 1-2 instructions. This transform is still
55805 // an improvement without zero operands because we trade 2 move constants and
55806 // 1 add for 2 adds (LEA) as long as the constants can be represented as
55807 // immediate asm operands (fit in 32-bits).
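  // e.g. add (cmov 0, 42), X --> cmov X, (X + 42): the zero arm's add folds
  // away entirely and the other arm can be a single LEA.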
55808 auto isSuitableCmov = [](SDValue V) {
55809 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55810 return false;
55811 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55812 !isa<ConstantSDNode>(V.getOperand(1)))
55813 return false;
55814 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55815 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55816 V.getConstantOperandAPInt(1).isSignedIntN(32));
55817 };
55818
55819 // Match an appropriate CMOV as the first operand of the add.
55820 SDValue Cmov = N->getOperand(0);
55821 SDValue OtherOp = N->getOperand(1);
55822 if (!isSuitableCmov(Cmov))
55823 std::swap(Cmov, OtherOp);
55824 if (!isSuitableCmov(Cmov))
55825 return SDValue();
55826
55827 // Don't remove a load folding opportunity for the add. That would neutralize
55828 // any improvements from removing constant materializations.
55829 if (X86::mayFoldLoad(OtherOp, Subtarget))
55830 return SDValue();
55831
55832 EVT VT = N->getValueType(0);
55833 SDLoc DL(N);
55834 SDValue FalseOp = Cmov.getOperand(0);
55835 SDValue TrueOp = Cmov.getOperand(1);
55836
55837 // We will push the add through the select, but we can potentially do better
55838 // if we know there is another add in the sequence and this is pointer math.
55839 // In that case, we can absorb an add into the trailing memory op and avoid
55840 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55841 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55842 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55843 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55844 all_of(N->uses(), [&](SDNode *Use) {
55845 auto *MemNode = dyn_cast<MemSDNode>(Use);
55846 return MemNode && MemNode->getBasePtr().getNode() == N;
55847 })) {
55848 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55849 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55850 // it is possible that choosing op1 might be better.
55851 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55852 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55853 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55854 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55855 Cmov.getOperand(2), Cmov.getOperand(3));
55856 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55857 }
55858
55859 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55860 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55861 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55862 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55863 Cmov.getOperand(3));
55864}
55865
55866static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55867 TargetLowering::DAGCombinerInfo &DCI,
55868 const X86Subtarget &Subtarget) {
55869 EVT VT = N->getValueType(0);
55870 SDValue Op0 = N->getOperand(0);
55871 SDValue Op1 = N->getOperand(1);
55872 SDLoc DL(N);
55873
55874 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55875 return Select;
55876
55877 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55878 return MAdd;
55879 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55880 return MAdd;
55881 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55882 return MAdd;
55883
55884 // Try to synthesize horizontal adds from adds of shuffles.
55885 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55886 return V;
55887
55888 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55889 // (sub Y, (sext (vXi1 X))).
55890 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55891 // generic DAG combine without a legal type check, but adding this there
55892 // caused regressions.
55893 if (VT.isVector()) {
55894 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55895 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55896 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55897 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55898 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55899 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55900 }
55901
55902 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55903 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55904 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55905 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55906 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55907 }
55908 }
55909
55910 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55911 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55912 X86::isZeroNode(Op0.getOperand(1))) {
55913     assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55914 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55915 Op0.getOperand(0), Op0.getOperand(2));
55916 }
55917
55918 return combineAddOrSubToADCOrSBB(N, DAG);
55919}
55920
55921// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55922// condition comes from the subtract node that produced -X. This matches the
55923// cmov expansion for absolute value. By swapping the operands we convert abs
55924// to nabs.
55925static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55926 SDValue N0 = N->getOperand(0);
55927 SDValue N1 = N->getOperand(1);
55928
55929 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55930 return SDValue();
55931
55932 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55933 if (CC != X86::COND_S && CC != X86::COND_NS)
55934 return SDValue();
55935
55936 // Condition should come from a negate operation.
55937 SDValue Cond = N1.getOperand(3);
55938 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55939 return SDValue();
55940  assert(Cond.getResNo() == 1 && "Unexpected result number");
55941
55942 // Get the X and -X from the negate.
55943 SDValue NegX = Cond.getValue(0);
55944 SDValue X = Cond.getOperand(1);
55945
55946 SDValue FalseOp = N1.getOperand(0);
55947 SDValue TrueOp = N1.getOperand(1);
55948
55949 // Cmov operands should be X and NegX. Order doesn't matter.
55950 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55951 return SDValue();
55952
55953 // Build a new CMOV with the operands swapped.
55954 SDLoc DL(N);
55955 MVT VT = N->getSimpleValueType(0);
55956 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55957 N1.getOperand(2), Cond);
55958 // Convert sub to add.
55959 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55960}
55961
55962static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55963 SDValue Op0 = N->getOperand(0);
55964 SDValue Op1 = N->getOperand(1);
55965
55966 // (sub C (zero_extend (setcc)))
55967 // =>
55968  // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
55969 // Don't disturb (sub 0 setcc), which is easily done with neg.
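  // e.g. 5 - (x == 0 ? 1 : 0) == (x != 0 ? 1 : 0) + 4.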
55970 EVT VT = N->getValueType(0);
55971 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55972 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55973 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55974 Op1.getOperand(0).hasOneUse()) {
55975 SDValue SetCC = Op1.getOperand(0);
55976 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55977 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55978 uint64_t NewImm = Op0C->getZExtValue() - 1;
55979 SDLoc DL(Op1);
55980 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55981 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55982 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55983 DAG.getConstant(NewImm, DL, VT));
55984 }
55985
55986 return SDValue();
55987}
55988
55989static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55990 TargetLowering::DAGCombinerInfo &DCI,
55991 const X86Subtarget &Subtarget) {
55992 SDValue Op0 = N->getOperand(0);
55993 SDValue Op1 = N->getOperand(1);
55994
55995 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55996 auto IsNonOpaqueConstant = [&](SDValue Op) {
55997 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55998 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55999 return !Cst->isOpaque();
56000 return true;
56001 }
56002 return false;
56003 };
56004
56005 // X86 can't encode an immediate LHS of a sub. See if we can push the
56006 // negation into a preceding instruction. If the RHS of the sub is a XOR with
56007 // one use and a constant, invert the immediate, saving one register.
56008 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
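  // This holds because -V == ~V + 1 and ~(X ^ C2) == X ^ ~C2, so
  //   C1 - (X ^ C2) == C1 + ~(X ^ C2) + 1 == (X ^ ~C2) + (C1 + 1).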
56009 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
56010 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
56011 SDLoc DL(N);
56012 EVT VT = Op0.getValueType();
56013 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
56014 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
56015 SDValue NewAdd =
56016 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
56017 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
56018 }
56019
56020 if (SDValue V = combineSubABS(N, DAG))
56021 return V;
56022
56023 // Try to synthesize horizontal subs from subs of shuffles.
56024 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56025 return V;
56026
56027 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
56028 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
56029 X86::isZeroNode(Op1.getOperand(1))) {
56030     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56031 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
56032 Op1.getOperand(0), Op1.getOperand(2));
56033 }
56034
56035 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
56036 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
56037 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
56038 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
56039     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56040 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56041 Op1.getOperand(1), Op1.getOperand(2));
56042 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
56043 Op1.getOperand(0));
56044 }
56045
56046 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56047 return V;
56048
56049 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56050 return V;
56051
56052 return combineSubSetcc(N, DAG);
56053}
56054
56055static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56056 const X86Subtarget &Subtarget) {
56057 MVT VT = N->getSimpleValueType(0);
56058 SDLoc DL(N);
56059
56060 if (N->getOperand(0) == N->getOperand(1)) {
56061 if (N->getOpcode() == X86ISD::PCMPEQ)
56062 return DAG.getConstant(-1, DL, VT);
56063 if (N->getOpcode() == X86ISD::PCMPGT)
56064 return DAG.getConstant(0, DL, VT);
56065 }
56066
56067 return SDValue();
56068}
56069
56070/// Helper that combines an array of subvector ops as if they were the operands
56071/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56072/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56073static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56074 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56075 TargetLowering::DAGCombinerInfo &DCI,
56076 const X86Subtarget &Subtarget) {
56077   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56078 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56079
56080 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56081 return DAG.getUNDEF(VT);
56082
56083 if (llvm::all_of(Ops, [](SDValue Op) {
56084 return ISD::isBuildVectorAllZeros(Op.getNode());
56085 }))
56086 return getZeroVector(VT, Subtarget, DAG, DL);
56087
56088 SDValue Op0 = Ops[0];
56089 bool IsSplat = llvm::all_equal(Ops);
56090
56091 // Repeated subvectors.
56092 if (IsSplat &&
56093 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56094 // If this broadcast is inserted into both halves, use a larger broadcast.
56095 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56096 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56097
56098 // If this simple subvector or scalar/subvector broadcast_load is inserted
56099 // into both halves, use a larger broadcast_load. Update other uses to use
56100 // an extracted subvector.
56101 if (ISD::isNormalLoad(Op0.getNode()) ||
56102 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56103 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56104 auto *Mem = cast<MemSDNode>(Op0);
56105 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56106 ? X86ISD::VBROADCAST_LOAD
56107 : X86ISD::SUBV_BROADCAST_LOAD;
56108 if (SDValue BcastLd =
56109 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56110 SDValue BcastSrc =
56111 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56112 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56113 return BcastLd;
56114 }
56115 }
56116
56117 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56118 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56119 (Subtarget.hasAVX2() ||
56120 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56121 VT.getScalarType(), Subtarget)))
56122 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56123 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56124 Op0.getOperand(0),
56125 DAG.getIntPtrConstant(0, DL)));
56126
56127 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56128 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56129 (Subtarget.hasAVX2() ||
56130 (EltSizeInBits >= 32 &&
56131 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56132 Op0.getOperand(0).getValueType() == VT.getScalarType())
56133 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56134
56135 // concat_vectors(extract_subvector(broadcast(x)),
56136 // extract_subvector(broadcast(x))) -> broadcast(x)
56137 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56138 Op0.getOperand(0).getValueType() == VT) {
56139 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56140 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56141 return Op0.getOperand(0);
56142 }
56143 }
56144
56145 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56146  // Only concat subvector high halves, which vperm2x128 is best at.
56147 // TODO: This should go in combineX86ShufflesRecursively eventually.
56148 if (VT.is256BitVector() && Ops.size() == 2) {
56149 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56150 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56151 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56152 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56153 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56154 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56155 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56156 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56157 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56158 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56159 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56160 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56161 DAG.getBitcast(VT, Src0.getOperand(0)),
56162 DAG.getBitcast(VT, Src1.getOperand(0)),
56163 DAG.getTargetConstant(0x31, DL, MVT::i8));
56164 }
56165 }
56166 }
56167
56168 // Repeated opcode.
56169 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56170 // but it currently struggles with different vector widths.
56171 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56172 return Op.getOpcode() == Op0.getOpcode();
56173 })) {
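// ConcatSubOperand builds a single wide vector from operand I of every node
// in SubOps; IsConcatFree returns true if operand Op of each node is already
// an in-order extract_subvector of a vector of the concatenated type, i.e.
// re-concatenating those operands would be free.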
56174 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56175 SmallVector<SDValue> Subs;
56176 for (SDValue SubOp : SubOps)
56177 Subs.push_back(SubOp.getOperand(I));
56178 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56179 };
56180 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56181 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56182 SDValue Sub = SubOps[I].getOperand(Op);
56183 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56184 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56185 Sub.getOperand(0).getValueType() != VT ||
56186 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56187 return false;
56188 }
56189 return true;
56190 };
56191
56192 unsigned NumOps = Ops.size();
56193 switch (Op0.getOpcode()) {
56194 case X86ISD::VBROADCAST: {
56195 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56196 return Op.getOperand(0).getValueType().is128BitVector();
56197 })) {
56198 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56199 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56200 ConcatSubOperand(VT, Ops, 0),
56201 ConcatSubOperand(VT, Ops, 0));
56202 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56203 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56204 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56205 : X86ISD::PSHUFD,
56206 DL, VT, ConcatSubOperand(VT, Ops, 0),
56207 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56208 }
56209 break;
56210 }
56211 case X86ISD::MOVDDUP:
56212 case X86ISD::MOVSHDUP:
56213 case X86ISD::MOVSLDUP: {
56214 if (!IsSplat)
56215 return DAG.getNode(Op0.getOpcode(), DL, VT,
56216 ConcatSubOperand(VT, Ops, 0));
56217 break;
56218 }
56219 case X86ISD::SHUFP: {
56220 // Add SHUFPD support if/when necessary.
56221 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56222 llvm::all_of(Ops, [Op0](SDValue Op) {
56223 return Op.getOperand(2) == Op0.getOperand(2);
56224 })) {
56225 return DAG.getNode(Op0.getOpcode(), DL, VT,
56226 ConcatSubOperand(VT, Ops, 0),
56227 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56228 }
56229 break;
56230 }
56231 case X86ISD::PSHUFHW:
56232 case X86ISD::PSHUFLW:
56233 case X86ISD::PSHUFD:
56234 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56235 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56236 return DAG.getNode(Op0.getOpcode(), DL, VT,
56237 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56238 }
56239 [[fallthrough]];
56240 case X86ISD::VPERMILPI:
56241 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56242 (VT.is256BitVector() ||
56243 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56244 all_of(Ops, [&Op0](SDValue Op) {
56245 return Op0.getOperand(1) == Op.getOperand(1);
56246 })) {
56247 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56248 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56249 Res =
56250 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56251 return DAG.getBitcast(VT, Res);
56252 }
56253 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
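// Merge the two 2-bit v2f64 VPERMILPD immediates into one 4-bit v4f64
// immediate: bits 0-1 select within the low 128-bit lane, bits 2-3 within
// the high lane.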
56254 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56255 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56256 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56257 return DAG.getNode(Op0.getOpcode(), DL, VT,
56258 ConcatSubOperand(VT, Ops, 0),
56259 DAG.getTargetConstant(Idx, DL, MVT::i8));
56260 }
56261 break;
56262 case X86ISD::PSHUFB:
56263 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56264 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56265 return DAG.getNode(Op0.getOpcode(), DL, VT,
56266 ConcatSubOperand(VT, Ops, 0),
56267 ConcatSubOperand(VT, Ops, 1));
56268 }
56269 break;
56270 case X86ISD::VPERMV:
56271 if (!IsSplat && NumOps == 2 &&
56272 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
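// Concatenate the two VPERMV sources and merge their shuffle masks,
// offsetting each subvector's indices by the lane it came from, so one wide
// VPERMV can replace the concat.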
56273 MVT OpVT = Op0.getSimpleValueType();
56274 int NumSrcElts = OpVT.getVectorNumElements();
56275 SmallVector<int, 64> ConcatMask;
56276 for (unsigned i = 0; i != NumOps; ++i) {
56277 SmallVector<int, 64> SubMask;
56278 SmallVector<SDValue, 2> SubOps;
56279 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56280 SubMask))
56281 break;
56282 for (int M : SubMask) {
56283 if (0 <= M)
56284 M += i * NumSrcElts;
56285 ConcatMask.push_back(M);
56286 }
56287 }
56288 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56289 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56290 Ops[1].getOperand(1), DAG, DL);
56291 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56292 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56293 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56294 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56295 }
56296 }
56297 break;
56298 case X86ISD::VPERMV3:
56299 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
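// Same idea as VPERMV above, but VPERMV3 reads from two sources: indices
// that refer to the second source get an extra NumSrcElts added so that,
// after the per-subvector offset, they index past the concatenated first
// source.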
56300 MVT OpVT = Op0.getSimpleValueType();
56301 int NumSrcElts = OpVT.getVectorNumElements();
56302 SmallVector<int, 64> ConcatMask;
56303 for (unsigned i = 0; i != NumOps; ++i) {
56304 SmallVector<int, 64> SubMask;
56305 SmallVector<SDValue, 2> SubOps;
56306 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56307 SubMask))
56308 break;
56309 for (int M : SubMask) {
56310 if (0 <= M) {
56311 M += M < NumSrcElts ? 0 : NumSrcElts;
56312 M += i * NumSrcElts;
56313 }
56314 ConcatMask.push_back(M);
56315 }
56316 }
56317 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56318 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56319 Ops[1].getOperand(0), DAG, DL);
56320 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56321 Ops[1].getOperand(2), DAG, DL);
56322 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56323 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56324 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56325 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56326 }
56327 }
56328 break;
56329 case ISD::TRUNCATE:
56330 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56331 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56332 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56333 SrcVT == Ops[1].getOperand(0).getValueType() &&
56334 Subtarget.useAVX512Regs() &&
56335 Subtarget.getPreferVectorWidth() >= 512 &&
56336 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
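// Truncating the concatenated 512-bit source in one go needs only a single
// AVX-512 truncate instead of two 256-bit truncates plus a concat.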
56337 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56338 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56339 ConcatSubOperand(NewSrcVT, Ops, 0));
56340 }
56341 }
56342 break;
56343 case X86ISD::VSHLI:
56344 case X86ISD::VSRLI:
56345 // Special case: an AVX1 v4i64 SHL/SRL by 32 bits can lower as a shuffle.
56346 // TODO: Move this to LowerShiftByScalarImmediate?
56347 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56348 llvm::all_of(Ops, [](SDValue Op) {
56349 return Op.getConstantOperandAPInt(1) == 32;
56350 })) {
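// Shifting a 64-bit element left/right by exactly 32 bits just moves one
// 32-bit half and zeroes the other, so it can be expressed as a v8i32
// shuffle interleaved with a zero vector.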
56351 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56352 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56353 if (Op0.getOpcode() == X86ISD::VSHLI) {
56354 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56355 {8, 0, 8, 2, 8, 4, 8, 6});
56356 } else {
56357 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56358 {1, 8, 3, 8, 5, 8, 7, 8});
56359 }
56360 return DAG.getBitcast(VT, Res);
56361 }
56362 [[fallthrough]];
56363 case X86ISD::VSRAI:
56364 case X86ISD::VSHL:
56365 case X86ISD::VSRL:
56366 case X86ISD::VSRA:
56367 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56368 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56369 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56370 llvm::all_of(Ops, [Op0](SDValue Op) {
56371 return Op0.getOperand(1) == Op.getOperand(1);
56372 })) {
56373 return DAG.getNode(Op0.getOpcode(), DL, VT,
56374 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56375 }
56376 break;
56377 case X86ISD::VPERMI:
56378 case X86ISD::VROTLI:
56379 case X86ISD::VROTRI:
56380 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56381 llvm::all_of(Ops, [Op0](SDValue Op) {
56382 return Op0.getOperand(1) == Op.getOperand(1);
56383 })) {
56384 return DAG.getNode(Op0.getOpcode(), DL, VT,
56385 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56386 }
56387 break;
56388 case ISD::AND:
56389 case ISD::OR:
56390 case ISD::XOR:
56391 case X86ISD::ANDNP:
56392 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56393 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
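// Bitwise ops are elementwise, so the concatenation can simply be pushed
// through both operands when a wide enough integer op is available.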
56394 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56395 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56396 NumOps * SrcVT.getVectorNumElements());
56397 return DAG.getNode(Op0.getOpcode(), DL, VT,
56398 ConcatSubOperand(SrcVT, Ops, 0),
56399 ConcatSubOperand(SrcVT, Ops, 1));
56400 }
56401 break;
56402 case X86ISD::GF2P8AFFINEQB:
56403 if (!IsSplat &&
56404 (VT.is256BitVector() ||
56405 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56406 llvm::all_of(Ops, [Op0](SDValue Op) {
56407 return Op0.getOperand(2) == Op.getOperand(2);
56408 })) {
56409 return DAG.getNode(Op0.getOpcode(), DL, VT,
56410 ConcatSubOperand(VT, Ops, 0),
56411 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56412 }
56413 break;
56414 case ISD::ADD:
56415 case ISD::SUB:
56416 case ISD::MUL:
56417 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56418 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56419 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56420 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56421 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56422 NumOps * SrcVT.getVectorNumElements());
56423 return DAG.getNode(Op0.getOpcode(), DL, VT,
56424 ConcatSubOperand(SrcVT, Ops, 0),
56425 ConcatSubOperand(SrcVT, Ops, 1));
56426 }
56427 break;
56428 case ISD::FADD:
56429 case ISD::FSUB:
56430 case ISD::FMUL:
56431 case ISD::FDIV:
56432 if (!IsSplat && (VT.is256BitVector() ||
56433 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56434 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56435 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56436 NumOps * SrcVT.getVectorNumElements());
56437 return DAG.getNode(Op0.getOpcode(), DL, VT,
56438 ConcatSubOperand(SrcVT, Ops, 0),
56439 ConcatSubOperand(SrcVT, Ops, 1));
56440 }
56441 break;
56442 case X86ISD::HADD:
56443 case X86ISD::HSUB:
56444 case X86ISD::FHADD:
56445 case X86ISD::FHSUB:
56446 case X86ISD::PACKSS:
56447 case X86ISD::PACKUS:
56448 if (!IsSplat && VT.is256BitVector() &&
56449 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56450 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56451 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56452 NumOps * SrcVT.getVectorNumElements());
56453 return DAG.getNode(Op0.getOpcode(), DL, VT,
56454 ConcatSubOperand(SrcVT, Ops, 0),
56455 ConcatSubOperand(SrcVT, Ops, 1));
56456 }
56457 break;
56458 case X86ISD::PALIGNR:
56459 if (!IsSplat &&
56460 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56461 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56462 llvm::all_of(Ops, [Op0](SDValue Op) {
56463 return Op0.getOperand(2) == Op.getOperand(2);
56464 })) {
56465 return DAG.getNode(Op0.getOpcode(), DL, VT,
56466 ConcatSubOperand(VT, Ops, 0),
56467 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56468 }
56469 break;
56470 case ISD::VSELECT:
56471 if (!IsSplat && Subtarget.hasAVX512() &&
56472 (VT.is256BitVector() ||
56473 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56474 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
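// With AVX512 the select condition is a vXi1 mask; if concatenating the
// condition masks gives a legal wider mask type, the whole select can be
// widened as well.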
56475 EVT SelVT = Ops[0].getOperand(0).getValueType();
56476 if (SelVT.getVectorElementType() == MVT::i1) {
56477 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56478 Ops.size() * SelVT.getVectorNumElements());
56479 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56480 return DAG.getNode(Op0.getOpcode(), DL, VT,
56481 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56482 ConcatSubOperand(VT, Ops, 1),
56483 ConcatSubOperand(VT, Ops, 2));
56484 }
56485 }
56486 [[fallthrough]];
56487 case X86ISD::BLENDV:
56488 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56489 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56490 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56491 EVT SelVT = Ops[0].getOperand(0).getValueType();
56492 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56493 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56494 return DAG.getNode(Op0.getOpcode(), DL, VT,
56495 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56496 ConcatSubOperand(VT, Ops, 1),
56497 ConcatSubOperand(VT, Ops, 2));
56498 }
56499 break;
56500 }
56501 }
56502
56503 // Fold subvector loads into one.
56504 // If needed, look through bitcasts to get to the load.
56505 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56506 unsigned Fast;
56507 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56508 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56509 *FirstLd->getMemOperand(), &Fast) &&
56510 Fast) {
56511 if (SDValue Ld =
56512 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56513 return Ld;
56514 }
56515 }
56516
56517 // Attempt to fold target constant loads.
56518 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56519 SmallVector<APInt> EltBits;
56520 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56521 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56522 APInt OpUndefElts;
56523 SmallVector<APInt> OpEltBits;
56524 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56525 OpEltBits, true, false))
56526 break;
56527 EltBits.append(OpEltBits);
56528 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56529 }
56530 if (EltBits.size() == VT.getVectorNumElements())
56531 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56532 }
56533
56534 return SDValue();
56535}
56536
56537static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56538 TargetLowering::DAGCombinerInfo &DCI,
56539 const X86Subtarget &Subtarget) {
56540 EVT VT = N->getValueType(0);
56541 EVT SrcVT = N->getOperand(0).getValueType();
56542 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56543 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56544
56545 if (VT.getVectorElementType() == MVT::i1) {
56546 // Attempt to constant fold.
56547 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56548 APInt Constant = APInt::getZero(VT.getSizeInBits());
56549 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56550 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56551 if (!C) break;
56552 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56553 if (I == (E - 1)) {
56554 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56555 if (TLI.isTypeLegal(IntVT))
56556 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56557 }
56558 }
56559
56560 // Don't do anything else for i1 vectors.
56561 return SDValue();
56562 }
56563
56564 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56565 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56566 DCI, Subtarget))
56567 return R;
56568 }
56569
56570 return SDValue();
56571}
56572
56573static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56574 TargetLowering::DAGCombinerInfo &DCI,
56575 const X86Subtarget &Subtarget) {
56576 if (DCI.isBeforeLegalizeOps())
56577 return SDValue();
56578
56579 MVT OpVT = N->getSimpleValueType(0);
56580
56581 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56582
56583 SDLoc dl(N);
56584 SDValue Vec = N->getOperand(0);
56585 SDValue SubVec = N->getOperand(1);
56586
56587 uint64_t IdxVal = N->getConstantOperandVal(2);
56588 MVT SubVecVT = SubVec.getSimpleValueType();
56589
56590 if (Vec.isUndef() && SubVec.isUndef())
56591 return DAG.getUNDEF(OpVT);
56592
56593 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56594 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56595 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56596 return getZeroVector(OpVT, Subtarget, DAG, dl);
56597
56598 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56599 // If we're inserting into a zero vector and then into a larger zero vector,
56600 // just insert into the larger zero vector directly.
56601 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56602 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56603 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56604 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56605 getZeroVector(OpVT, Subtarget, DAG, dl),
56606 SubVec.getOperand(1),
56607 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56608 }
56609
56610 // If we're inserting into a zero vector, our input was extracted from an
56611 // insert into a zero vector of the same type, and the extraction was at
56612 // least as large as the original insertion, just insert the original
56613 // subvector into a zero vector.
56614 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56615 isNullConstant(SubVec.getOperand(1)) &&
56616 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56617 SDValue Ins = SubVec.getOperand(0);
56618 if (isNullConstant(Ins.getOperand(2)) &&
56619 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56620 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56621 SubVecVT.getFixedSizeInBits())
56622 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56623 getZeroVector(OpVT, Subtarget, DAG, dl),
56624 Ins.getOperand(1), N->getOperand(2));
56625 }
56626 }
56627
56628 // Stop here if this is an i1 vector.
56629 if (IsI1Vector)
56630 return SDValue();
56631
56632 // Eliminate an intermediate vector widening:
56633 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56634 // insert_subvector X, Y, Idx
56635 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56636 // there?
56637 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56638 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56639 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56640 SubVec.getOperand(1), N->getOperand(2));
56641
56642 // If this is an insert of an extract, combine to a shuffle. Don't do this
56643 // if the insert or extract can be represented with a subregister operation.
56644 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56645 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56646 (IdxVal != 0 ||
56647 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56648 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56649 if (ExtIdxVal != 0) {
56650 int VecNumElts = OpVT.getVectorNumElements();
56651 int SubVecNumElts = SubVecVT.getVectorNumElements();
56652 SmallVector<int, 64> Mask(VecNumElts);
56653 // First create an identity shuffle mask.
56654 for (int i = 0; i != VecNumElts; ++i)
56655 Mask[i] = i;
56656 // Now insert the extracted portion.
56657 for (int i = 0; i != SubVecNumElts; ++i)
56658 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56659
56660 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56661 }
56662 }
56663
56664 // Match concat_vector style patterns.
56665 SmallVector<SDValue, 2> SubVectorOps;
56666 if (collectConcatOps(N, SubVectorOps, DAG)) {
56667 if (SDValue Fold =
56668 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56669 return Fold;
56670
56671 // If we're inserting all zeros into the upper half, change this to
56672 // a concat with zero. We will match this to a move
56673 // with implicit upper bit zeroing during isel.
56674 // We do this here because we don't want combineConcatVectorOps to
56675 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56676 if (SubVectorOps.size() == 2 &&
56677 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56678 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56679 getZeroVector(OpVT, Subtarget, DAG, dl),
56680 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56681 }
56682
56683 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56684 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56685 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56686
56687 // If this is a broadcast load inserted into an upper undef, use a larger
56688 // broadcast load.
56689 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56690 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56691 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56692 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56693 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56694 SDValue BcastLd =
56695 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56696 MemIntr->getMemoryVT(),
56697 MemIntr->getMemOperand());
56698 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56699 return BcastLd;
56700 }
56701
56702 // If we're splatting the lower half subvector of a full vector load into the
56703 // upper half, attempt to create a subvector broadcast.
56704 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56705 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56706 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56707 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56708 if (VecLd && SubLd &&
56709 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56710 SubVec.getValueSizeInBits() / 8, 0))
56711 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56712 SubLd, 0, DAG);
56713 }
56714
56715 return SDValue();
56716}
56717
56718/// If we are extracting a subvector of a vector select and the select condition
56719/// is composed of concatenated vectors, try to narrow the select width. This
56720/// is a common pattern for AVX1 integer code because 256-bit selects may be
56721/// legal, but there is almost no integer math/logic available for 256-bit.
56722/// This function should only be called with legal types (otherwise, the calls
56723/// to get simple value types will assert).
56724static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56725 SDValue Sel = Ext->getOperand(0);
56726 SmallVector<SDValue, 4> CatOps;
56727 if (Sel.getOpcode() != ISD::VSELECT ||
56728 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56729 return SDValue();
56730
56731 // Note: We assume simple value types because this should only be called with
56732 // legal operations/types.
56733 // TODO: This can be extended to handle extraction to 256-bits.
56734 MVT VT = Ext->getSimpleValueType(0);
56735 if (!VT.is128BitVector())
56736 return SDValue();
56737
56738 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56739 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56740 return SDValue();
56741
56742 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56743 MVT SelVT = Sel.getSimpleValueType();
56744 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56745 "Unexpected vector type with legal operations");
56746
56747 unsigned SelElts = SelVT.getVectorNumElements();
56748 unsigned CastedElts = WideVT.getVectorNumElements();
56749 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56750 if (SelElts % CastedElts == 0) {
56751 // The select has the same or more (narrower) elements than the extract
56752 // operand. The extraction index gets scaled by that factor.
56753 ExtIdx *= (SelElts / CastedElts);
56754 } else if (CastedElts % SelElts == 0) {
56755 // The select has fewer (wider) elements than the extract operand. Make sure
56756 // that the extraction index can be divided evenly.
56757 unsigned IndexDivisor = CastedElts / SelElts;
56758 if (ExtIdx % IndexDivisor != 0)
56759 return SDValue();
56760 ExtIdx /= IndexDivisor;
56761 } else {
56762 llvm_unreachable("Element count of simple vector types are not divisible?");
56763 }
56764
56765 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56766 unsigned NarrowElts = SelElts / NarrowingFactor;
56767 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56768 SDLoc DL(Ext);
56769 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56770 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56771 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56772 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56773 return DAG.getBitcast(VT, NarrowSel);
56774}
56775
56776static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56777 TargetLowering::DAGCombinerInfo &DCI,
56778 const X86Subtarget &Subtarget) {
56779 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56780 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56781 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56782 // We let generic combining take over from there to simplify the
56783 // insert/extract and 'not'.
56784 // This pattern emerges during AVX1 legalization. We handle it before lowering
56785 // to avoid complications like splitting constant vector loads.
56786
56787 // Capture the original wide type in the likely case that we need to bitcast
56788 // back to this type.
56789 if (!N->getValueType(0).isSimple())
56790 return SDValue();
56791
56792 MVT VT = N->getSimpleValueType(0);
56793 SDValue InVec = N->getOperand(0);
56794 unsigned IdxVal = N->getConstantOperandVal(1);
56795 SDValue InVecBC = peekThroughBitcasts(InVec);
56796 EVT InVecVT = InVec.getValueType();
56797 unsigned SizeInBits = VT.getSizeInBits();
56798 unsigned InSizeInBits = InVecVT.getSizeInBits();
56799 unsigned NumSubElts = VT.getVectorNumElements();
56800 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56801
56802 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56803 TLI.isTypeLegal(InVecVT) &&
56804 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56805 auto isConcatenatedNot = [](SDValue V) {
56806 V = peekThroughBitcasts(V);
56807 if (!isBitwiseNot(V))
56808 return false;
56809 SDValue NotOp = V->getOperand(0);
56810 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56811 };
56812 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56813 isConcatenatedNot(InVecBC.getOperand(1))) {
56814 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56815 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
56816 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
56817 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56818 }
56819 }
56820
56821 if (DCI.isBeforeLegalizeOps())
56822 return SDValue();
56823
56824 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
56825 return V;
56826
56827 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56828 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56829
56830 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56831 if (VT.getScalarType() == MVT::i1)
56832 return DAG.getConstant(1, SDLoc(N), VT);
56833 return getOnesVector(VT, DAG, SDLoc(N));
56834 }
56835
56836 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56837 return DAG.getBuildVector(VT, SDLoc(N),
56838 InVec->ops().slice(IdxVal, NumSubElts));
56839
56840 // If we are extracting from an insert into a larger vector, replace with a
56841 // smaller insert, provided the extraction is no smaller than the original
56842 // inserted subvector. Don't do this for i1 vectors.
56843 // TODO: Relax the matching indices requirement?
56844 if (VT.getVectorElementType() != MVT::i1 &&
56845 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56846 IdxVal == InVec.getConstantOperandVal(2) &&
56847 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56848 SDLoc DL(N);
56849 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56850 InVec.getOperand(0), N->getOperand(1));
56851 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56852 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56853 InVec.getOperand(1),
56854 DAG.getVectorIdxConstant(NewIdxVal, DL));
56855 }
56856
56857 // If we're extracting an upper subvector from a broadcast, we should just
56858 // extract the lowest subvector instead, which should allow
56859 // SimplifyDemandedVectorElts to do more simplifications.
56860 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56861 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56862 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56863 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56864
56865 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56866 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56867 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56868 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56869
56870 // Attempt to extract from the source of a shuffle vector.
56871 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56872 SmallVector<int, 32> ShuffleMask;
56873 SmallVector<int, 32> ScaledMask;
56874 SmallVector<SDValue, 2> ShuffleInputs;
56875 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56876 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56877 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56878 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56879 unsigned SubVecIdx = IdxVal / NumSubElts;
56880 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56881 return DAG.getUNDEF(VT);
56882 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56883 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56884 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56885 if (Src.getValueSizeInBits() == InSizeInBits) {
56886 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56887 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56888 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56889 SDLoc(N), SizeInBits);
56890 }
56891 }
56892 }
56893
56894 // If we're extracting the lowest subvector and we're the only user,
56895 // we may be able to perform this with a smaller vector width.
56896 unsigned InOpcode = InVec.getOpcode();
56897 if (InVec.hasOneUse()) {
56898 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56899 // v2f64 CVTDQ2PD(v4i32).
56900 if (InOpcode == ISD::SINT_TO_FP &&
56901 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56902 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
56903 }
56904 // v2f64 CVTUDQ2PD(v4i32).
56905 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56906 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56907 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
56908 }
56909 // v2f64 CVTPS2PD(v4f32).
56910 if (InOpcode == ISD::FP_EXTEND &&
56911 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56912 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
56913 }
56914 }
56915 if (IdxVal == 0 &&
56916 (InOpcode == ISD::ANY_EXTEND ||
56917 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56918 InOpcode == ISD::ZERO_EXTEND ||
56919 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
56920 InOpcode == ISD::SIGN_EXTEND ||
56921 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56922 (SizeInBits == 128 || SizeInBits == 256) &&
56923 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56924 SDLoc DL(N);
56925 SDValue Ext = InVec.getOperand(0);
56926 if (Ext.getValueSizeInBits() > SizeInBits)
56927 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56928 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56929 return DAG.getNode(ExtOp, DL, VT, Ext);
56930 }
56931 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56932 InVec.getOperand(0).getValueType().is256BitVector() &&
56933 InVec.getOperand(1).getValueType().is256BitVector() &&
56934 InVec.getOperand(2).getValueType().is256BitVector()) {
56935 SDLoc DL(N);
56936 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56937 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56938 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56939 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56940 }
56941 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56942 (VT.is128BitVector() || VT.is256BitVector())) {
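// Truncate just the low part of the (wider) source directly to the narrower
// result type instead of truncating everything and then extracting.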
56943 SDLoc DL(N);
56944 SDValue InVecSrc = InVec.getOperand(0);
56945 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56946 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56947 return DAG.getNode(InOpcode, DL, VT, Ext);
56948 }
56949 if (InOpcode == X86ISD::MOVDDUP &&
56950 (VT.is128BitVector() || VT.is256BitVector())) {
56951 SDLoc DL(N);
56952 SDValue Ext0 =
56953 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56954 return DAG.getNode(InOpcode, DL, VT, Ext0);
56955 }
56956 }
56957
56958 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
56959 // as this is very likely to fold into a shuffle/truncation.
56960 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56961 InVecVT.getScalarSizeInBits() == 64 &&
56962 InVec.getConstantOperandAPInt(1) == 32) {
56963 SDLoc DL(N);
56964 SDValue Ext =
56965 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56966 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56967 }
56968
56969 return SDValue();
56970}
56971
56972static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56973 EVT VT = N->getValueType(0);
56974 SDValue Src = N->getOperand(0);
56975 SDLoc DL(N);
56976
56977 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56978 // This occurs frequently in our masked scalar intrinsic code and our
56979 // floating point select lowering with AVX512.
56980 // TODO: SimplifyDemandedBits instead?
56981 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56982 isOneConstant(Src.getOperand(1)))
56983 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56984
56985 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56986 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56987 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56988 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
56989 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
56990 if (C->isZero())
56991 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56992 Src.getOperand(1));
56993
56994 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56995 // TODO: Move to DAGCombine/SimplifyDemandedBits?
56996 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
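// IsExt64 recognizes an i64 value whose upper 32 bits are unused (any-extend
// case) or known zero (zero-extend case), looking through extend nodes,
// extending loads, and known-bits analysis.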
56997 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56998 if (Op.getValueType() != MVT::i64)
56999 return SDValue();
57000 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57001 if (Op.getOpcode() == Opc &&
57002 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57003 return Op.getOperand(0);
57004 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57005 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57006 if (Ld->getExtensionType() == Ext &&
57007 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57008 return Op;
57009 if (IsZeroExt) {
57010 KnownBits Known = DAG.computeKnownBits(Op);
57011 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57012 return Op;
57013 }
57014 return SDValue();
57015 };
57016
57017 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57018 return DAG.getBitcast(
57019 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57020 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57021
57022 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57023 return DAG.getBitcast(
57024 VT,
57025 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57026 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57027 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57028 }
57029
57030 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57031 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57032 Src.getOperand(0).getValueType() == MVT::x86mmx)
57033 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57034
57035 // See if we're broadcasting the scalar value, in which case just reuse that.
57036 // Ensure the same SDValue from the SDNode use is being used.
57037 if (VT.getScalarType() == Src.getValueType())
57038 for (SDNode *User : Src->uses())
57039 if (User->getOpcode() == X86ISD::VBROADCAST &&
57040 Src == User->getOperand(0)) {
57041 unsigned SizeInBits = VT.getFixedSizeInBits();
57042 unsigned BroadcastSizeInBits =
57043 User->getValueSizeInBits(0).getFixedValue();
57044 if (BroadcastSizeInBits == SizeInBits)
57045 return SDValue(User, 0);
57046 if (BroadcastSizeInBits > SizeInBits)
57047 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57048 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57049 // coverage.
57050 }
57051
57052 return SDValue();
57053}
57054
57055// Simplify PMULDQ and PMULUDQ operations.
57056static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57057 TargetLowering::DAGCombinerInfo &DCI,
57058 const X86Subtarget &Subtarget) {
57059 SDValue LHS = N->getOperand(0);
57060 SDValue RHS = N->getOperand(1);
57061
57062 // Canonicalize constant to RHS.
57063 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57064 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57065 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57066
57067 // Multiply by zero.
57068 // Don't return RHS as it may contain UNDEFs.
57069 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57070 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57071
57072 // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
57073 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57074 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57075 return SDValue(N, 0);
57076
57077 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57078 // convert it to any_extend_invec, due to the LegalOperations check, do the
57079 // conversion directly to a vector shuffle manually. This exposes combine
57080 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57081 // combineX86ShufflesRecursively on SSE4.1 targets.
57082 // FIXME: This is basically a hack around several other issues related to
57083 // ANY_EXTEND_VECTOR_INREG.
57084 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57085 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57086 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57087 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57088 SDLoc dl(N);
57089 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57090 LHS.getOperand(0), { 0, -1, 1, -1 });
57091 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57092 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57093 }
57094 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57095 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57096 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57097 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57098 SDLoc dl(N);
57099 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57100 RHS.getOperand(0), { 0, -1, 1, -1 });
57101 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57102 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57103 }
57104
57105 return SDValue();
57106}
57107
57108// Simplify VPMADDUBSW/VPMADDWD operations.
57109static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57110 TargetLowering::DAGCombinerInfo &DCI) {
57111 EVT VT = N->getValueType(0);
57112 SDValue LHS = N->getOperand(0);
57113 SDValue RHS = N->getOperand(1);
57114
57115 // Multiply by zero.
57116 // Don't return LHS/RHS as it may contain UNDEFs.
57117 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57118 ISD::isBuildVectorAllZeros(RHS.getNode()))
57119 return DAG.getConstant(0, SDLoc(N), VT);
57120
57121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57122 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57123 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57124 return SDValue(N, 0);
57125
57126 return SDValue();
57127}
57128
57129static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57130 TargetLowering::DAGCombinerInfo &DCI,
57131 const X86Subtarget &Subtarget) {
57132 EVT VT = N->getValueType(0);
57133 SDValue In = N->getOperand(0);
57134 unsigned Opcode = N->getOpcode();
57135 unsigned InOpcode = In.getOpcode();
57136 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57137 SDLoc DL(N);
57138
57139 // Try to merge vector loads and extend_inreg to an extload.
57140 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57141 In.hasOneUse()) {
57142 auto *Ld = cast<LoadSDNode>(In);
57143 if (Ld->isSimple()) {
57144 MVT SVT = In.getSimpleValueType().getVectorElementType();
57145 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57146 ? ISD::SEXTLOAD
57147 : ISD::ZEXTLOAD;
57148 EVT MemVT = VT.changeVectorElementType(SVT);
57149 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57150 SDValue Load = DAG.getExtLoad(
57151 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57152 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57153 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57154 return Load;
57155 }
57156 }
57157 }
57158
57159 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57160 if (Opcode == InOpcode)
57161 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57162
57163 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57164 // -> EXTEND_VECTOR_INREG(X).
57165 // TODO: Handle non-zero subvector indices.
57166 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57167 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57168 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57169 In.getValueSizeInBits())
57170 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57171
57172 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57173 // TODO: Move to DAGCombine?
57174 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57175 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57176 In.getValueSizeInBits() == VT.getSizeInBits()) {
57177 unsigned NumElts = VT.getVectorNumElements();
57178 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57179 EVT EltVT = In.getOperand(0).getValueType();
57180 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57181 for (unsigned I = 0; I != NumElts; ++I)
57182 Elts[I * Scale] = In.getOperand(I);
57183 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57184 }
57185
57186 // Attempt to combine as a shuffle on SSE41+ targets.
57187 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57188 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57189 Subtarget.hasSSE41()) {
57190 SDValue Op(N, 0);
57191 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57192 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57193 return Res;
57194 }
57195
57196 return SDValue();
57197}
57198
57199static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57200 TargetLowering::DAGCombinerInfo &DCI) {
57201 EVT VT = N->getValueType(0);
57202
57203 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57204 return DAG.getConstant(0, SDLoc(N), VT);
57205
57206 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57207 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57208 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57209 return SDValue(N, 0);
57210
57211 return SDValue();
57212}
57213
57214// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57215 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
57216 // produce extra instructions between the conversions due to going to scalar and back.
57217static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57218 const X86Subtarget &Subtarget) {
57219 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57220 return SDValue();
57221
57222 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57223 return SDValue();
57224
57225 if (N->getValueType(0) != MVT::f32 ||
57226 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57227 return SDValue();
57228
57229 SDLoc dl(N);
57230 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57231 N->getOperand(0).getOperand(0));
57232 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57233 DAG.getTargetConstant(4, dl, MVT::i32));
57234 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57235 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57236 DAG.getIntPtrConstant(0, dl));
57237}
57238
57239static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57240 const X86Subtarget &Subtarget) {
57241 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57242 return SDValue();
57243
57244 if (Subtarget.hasFP16())
57245 return SDValue();
57246
57247 bool IsStrict = N->isStrictFPOpcode();
57248 EVT VT = N->getValueType(0);
57249 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57250 EVT SrcVT = Src.getValueType();
57251
57252 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57253 return SDValue();
57254
57255 if (VT.getVectorElementType() != MVT::f32 &&
57256 VT.getVectorElementType() != MVT::f64)
57257 return SDValue();
57258
57259 unsigned NumElts = VT.getVectorNumElements();
57260 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57261 return SDValue();
57262
57263 SDLoc dl(N);
57264
57265 // Convert the input to vXi16.
57266 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57267 Src = DAG.getBitcast(IntVT, Src);
57268
57269 // Widen to at least 8 input elements.
57270 if (NumElts < 8) {
57271 unsigned NumConcats = 8 / NumElts;
57272 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57273 : DAG.getConstant(0, dl, IntVT);
57274 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57275 Ops[0] = Src;
57276 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57277 }
57278
57279 // Destination is vXf32 with at least 4 elements.
57280 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57281 std::max(4U, NumElts));
57282 SDValue Cvt, Chain;
57283 if (IsStrict) {
57284 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57285 {N->getOperand(0), Src});
57286 Chain = Cvt.getValue(1);
57287 } else {
57288 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57289 }
57290
57291 if (NumElts < 4) {
57292 assert(NumElts == 2 && "Unexpected size");
57293 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57294 DAG.getIntPtrConstant(0, dl));
57295 }
57296
57297 if (IsStrict) {
57298 // Extend to the original VT if necessary.
57299 if (Cvt.getValueType() != VT) {
57300 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57301 {Chain, Cvt});
57302 Chain = Cvt.getValue(1);
57303 }
57304 return DAG.getMergeValues({Cvt, Chain}, dl);
57305 }
57306
57307 // Extend to the original VT if necessary.
57308 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57309}
57310
57311// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57312// from. Limit this to cases where the loads have the same input chain and the
57313// output chains are unused. This avoids any memory ordering issues.
57314static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57315 TargetLowering::DAGCombinerInfo &DCI) {
57316 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57317 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57318 "Unknown broadcast load type");
57319
57320 // Only do this if the chain result is unused.
57321 if (N->hasAnyUseOfValue(1))
57322 return SDValue();
57323
57324 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57325
57326 SDValue Ptr = MemIntrin->getBasePtr();
57327 SDValue Chain = MemIntrin->getChain();
57328 EVT VT = N->getSimpleValueType(0);
57329 EVT MemVT = MemIntrin->getMemoryVT();
57330
57331 // Look at other users of our base pointer and try to find a wider broadcast.
57332 // The input chain and the size of the memory VT must match.
57333 for (SDNode *User : Ptr->uses())
57334 if (User != N && User->getOpcode() == N->getOpcode() &&
57335 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57336 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57337 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57338 MemVT.getSizeInBits() &&
57339 !User->hasAnyUseOfValue(1) &&
57340 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57341 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57342 VT.getSizeInBits());
57343 Extract = DAG.getBitcast(VT, Extract);
57344 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57345 }
57346
57347 return SDValue();
57348}
57349
57350static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57351 const X86Subtarget &Subtarget) {
57352 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57353 return SDValue();
57354
57355 bool IsStrict = N->isStrictFPOpcode();
57356 EVT VT = N->getValueType(0);
57357 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57358 EVT SrcVT = Src.getValueType();
57359
57360 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57361 SrcVT.getVectorElementType() != MVT::f32)
57362 return SDValue();
57363
57364 SDLoc dl(N);
57365
57366 SDValue Cvt, Chain;
57367 unsigned NumElts = VT.getVectorNumElements();
57368 if (Subtarget.hasFP16()) {
57369 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57370 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57371 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57372 SDValue Cvt0, Cvt1;
57373 SDValue Op0 = Src.getOperand(0);
57374 SDValue Op1 = Src.getOperand(1);
57375 bool IsOp0Strict = Op0->isStrictFPOpcode();
57376 if (Op0.getOpcode() != Op1.getOpcode() ||
57377 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57378 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57379 return SDValue();
57380 }
57381 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57382 if (IsStrict) {
57383 assert(IsOp0Strict && "Op0 must be strict node");
57384 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57385 ? X86ISD::STRICT_CVTSI2P
57386 : X86ISD::STRICT_CVTUI2P;
57387 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57388 {Op0.getOperand(0), Op0.getOperand(1)});
57389 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57390 {Op1.getOperand(0), Op1.getOperand(1)});
57391 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57392 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57393 }
57394 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57395 : X86ISD::CVTUI2P;
57396 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57397 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57398 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57399 }
57400 return SDValue();
57401 }
57402
57403 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57404 return SDValue();
57405
57406 // Widen to at least 4 input elements.
57407 if (NumElts < 4)
57408 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57409 DAG.getConstantFP(0.0, dl, SrcVT));
57410
57411 // Destination is v8i16 with at least 8 elements.
57412 EVT CvtVT =
57413 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57414 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57415 if (IsStrict) {
57416 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57417 {N->getOperand(0), Src, Rnd});
57418 Chain = Cvt.getValue(1);
57419 } else {
57420 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57421 }
57422
57423 // Extract down to real number of elements.
57424 if (NumElts < 8) {
57425 EVT IntVT = VT.changeVectorElementTypeToInteger();
57426 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57427 DAG.getIntPtrConstant(0, dl));
57428 }
57429
57430 Cvt = DAG.getBitcast(VT, Cvt);
57431
57432 if (IsStrict)
57433 return DAG.getMergeValues({Cvt, Chain}, dl);
57434
57435 return Cvt;
57436}
57437
57438static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57439 SDValue Src = N->getOperand(0);
57440
57441 // Turn MOVDQ2Q+simple_load into an mmx load.
57442 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57443 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57444
57445 if (LN->isSimple()) {
57446 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57447 LN->getBasePtr(),
57448 LN->getPointerInfo(),
57449 LN->getOriginalAlign(),
57450 LN->getMemOperand()->getFlags());
57451 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57452 return NewLd;
57453 }
57454 }
57455
57456 return SDValue();
57457}
57458
57459static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57460 TargetLowering::DAGCombinerInfo &DCI) {
57461 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57463 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57464 return SDValue(N, 0);
57465
57466 return SDValue();
57467}
57468
57469SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57470 DAGCombinerInfo &DCI) const {
57471 SelectionDAG &DAG = DCI.DAG;
57472 switch (N->getOpcode()) {
57473 default: break;
57474 case ISD::SCALAR_TO_VECTOR:
57475 return combineScalarToVector(N, DAG);
57476 case ISD::EXTRACT_VECTOR_ELT:
57477 case X86ISD::PEXTRW:
57478 case X86ISD::PEXTRB:
57479 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57480 case ISD::CONCAT_VECTORS:
57481 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57482 case ISD::INSERT_SUBVECTOR:
57483 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57484 case ISD::EXTRACT_SUBVECTOR:
57485 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57486 case ISD::VSELECT:
57487 case ISD::SELECT:
57488 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57489 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57490 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57491 case X86ISD::CMP: return combineCMP(N, DAG);
57492 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57493 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57494 case X86ISD::ADD:
57495 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57496 case X86ISD::SBB: return combineSBB(N, DAG);
57497 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57498 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57499 case ISD::SHL: return combineShiftLeft(N, DAG);
57500 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57501 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57502 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57503 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57504 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57505 case X86ISD::BEXTR:
57506 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57507 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57508 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57509 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57510 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57511 case X86ISD::VEXTRACT_STORE:
57512 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57513 case ISD::SINT_TO_FP:
57514 case ISD::STRICT_SINT_TO_FP:
57515 return combineSIntToFP(N, DAG, DCI, Subtarget);
57516 case ISD::UINT_TO_FP:
57517 case ISD::STRICT_UINT_TO_FP:
57518 return combineUIntToFP(N, DAG, Subtarget);
57519 case ISD::FADD:
57520 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57521 case X86ISD::VFCMULC:
57522 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57523 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57524 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57525 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57526 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57527 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57528 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57529 case X86ISD::FXOR:
57530 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57531 case X86ISD::FMIN:
57532 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57533 case ISD::FMINNUM:
57534 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57535 case X86ISD::CVTSI2P:
57536 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57537 case X86ISD::CVTP2SI:
57538 case X86ISD::CVTP2UI:
57539 case X86ISD::STRICT_CVTTP2SI:
57540 case X86ISD::CVTTP2SI:
57541 case X86ISD::STRICT_CVTTP2UI:
57542 case X86ISD::CVTTP2UI:
57543 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57544 case X86ISD::STRICT_CVTPH2PS:
57545 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57546 case X86ISD::BT: return combineBT(N, DAG, DCI);
57547 case ISD::ANY_EXTEND:
57548 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57549 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57550 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57551 case ISD::ANY_EXTEND_VECTOR_INREG:
57552 case ISD::SIGN_EXTEND_VECTOR_INREG:
57553 case ISD::ZERO_EXTEND_VECTOR_INREG:
57554 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57555 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57556 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57557 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57558 case X86ISD::PACKSS:
57559 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57560 case X86ISD::HADD:
57561 case X86ISD::HSUB:
57562 case X86ISD::FHADD:
57563 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57564 case X86ISD::VSHL:
57565 case X86ISD::VSRA:
57566 case X86ISD::VSRL:
57567 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57568 case X86ISD::VSHLI:
57569 case X86ISD::VSRAI:
57570 case X86ISD::VSRLI:
57571 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57572 case ISD::INSERT_VECTOR_ELT:
57573 case X86ISD::PINSRB:
57574 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57575 case X86ISD::SHUFP: // Handle all target specific shuffles
57576 case X86ISD::INSERTPS:
57577 case X86ISD::EXTRQI:
57578 case X86ISD::INSERTQI:
57579 case X86ISD::VALIGN:
57580 case X86ISD::PALIGNR:
57581 case X86ISD::VSHLDQ:
57582 case X86ISD::VSRLDQ:
57583 case X86ISD::BLENDI:
57584 case X86ISD::UNPCKH:
57585 case X86ISD::UNPCKL:
57586 case X86ISD::MOVHLPS:
57587 case X86ISD::MOVLHPS:
57588 case X86ISD::PSHUFB:
57589 case X86ISD::PSHUFD:
57590 case X86ISD::PSHUFHW:
57591 case X86ISD::PSHUFLW:
57592 case X86ISD::MOVSHDUP:
57593 case X86ISD::MOVSLDUP:
57594 case X86ISD::MOVDDUP:
57595 case X86ISD::MOVSS:
57596 case X86ISD::MOVSD:
57597 case X86ISD::MOVSH:
57598 case X86ISD::VBROADCAST:
57599 case X86ISD::VPPERM:
57600 case X86ISD::VPERMI:
57601 case X86ISD::VPERMV:
57602 case X86ISD::VPERMV3:
57603 case X86ISD::VPERMIL2:
57604 case X86ISD::VPERMILPI:
57605 case X86ISD::VPERMILPV:
57606 case X86ISD::VPERM2X128:
57607 case X86ISD::SHUF128:
57608 case X86ISD::VZEXT_MOVL:
57609    case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57610 case X86ISD::FMADD_RND:
57611 case X86ISD::FMSUB:
57612 case X86ISD::STRICT_FMSUB:
57613 case X86ISD::FMSUB_RND:
57614 case X86ISD::FNMADD:
57615 case X86ISD::STRICT_FNMADD:
57616 case X86ISD::FNMADD_RND:
57617 case X86ISD::FNMSUB:
57618 case X86ISD::STRICT_FNMSUB:
57619 case X86ISD::FNMSUB_RND:
57620 case ISD::FMA:
57621 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57622 case X86ISD::FMADDSUB_RND:
57623 case X86ISD::FMSUBADD_RND:
57624 case X86ISD::FMADDSUB:
57625 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57626 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57627 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57628 case X86ISD::MGATHER:
57629 case X86ISD::MSCATTER:
57630 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57631 case ISD::MGATHER:
57632 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57633 case X86ISD::PCMPEQ:
57634 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57635 case X86ISD::PMULDQ:
57636 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57637 case X86ISD::VPMADDUBSW:
57638 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57639 case X86ISD::KSHIFTL:
57640 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57641 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57642 case ISD::STRICT_FP_EXTEND:
57643 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57644 case ISD::STRICT_FP_ROUND:
57645 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57646 case X86ISD::VBROADCAST_LOAD:
57647 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57648 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57649 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57650 }
57651
57652 return SDValue();
57653}
57654
57655bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57656 return false;
57657}
57658
57659bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57660 if (!isTypeLegal(VT))
57661 return false;
57662
57663 // There are no vXi8 shifts.
57664 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57665 return false;
57666
57667 // TODO: Almost no 8-bit ops are desirable because they have no actual
57668 // size/speed advantages vs. 32-bit ops, but they do have a major
57669 // potential disadvantage by causing partial register stalls.
57670 //
57671 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57672 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57673 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57674 // check for a constant operand to the multiply.
57675 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57676 return false;
57677
57678 // i16 instruction encodings are longer and some i16 instructions are slow,
57679 // so those are not desirable.
57680 if (VT == MVT::i16) {
57681 switch (Opc) {
57682 default:
57683 break;
57684 case ISD::LOAD:
57685 case ISD::SIGN_EXTEND:
57686 case ISD::ZERO_EXTEND:
57687 case ISD::ANY_EXTEND:
57688 case ISD::SHL:
57689 case ISD::SRA:
57690 case ISD::SRL:
57691 case ISD::SUB:
57692 case ISD::ADD:
57693 case ISD::MUL:
57694 case ISD::AND:
57695 case ISD::OR:
57696 case ISD::XOR:
57697 return false;
57698 }
57699 }
57700
57701 // Any legal type not explicitly accounted for above here is desirable.
57702 return true;
57703}
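// Illustrative effect of the hook above, assuming the queried type is legal:
//   isTypeDesirableForOp(ISD::ADD, MVT::i16) -> false  (prefer 32-bit ALU ops)
//   isTypeDesirableForOp(ISD::MUL, MVT::i8)  -> false
//   isTypeDesirableForOp(ISD::ADD, MVT::i32) -> true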
57704
57705SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57706 SDValue Value, SDValue Addr,
57707 SelectionDAG &DAG) const {
57708 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57709 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57710 if (IsCFProtectionSupported) {
57711      // When control-flow branch protection is enabled, we need to add a
57712      // notrack prefix to the indirect branch. To do that we create an
57713      // NT_BRIND SDNode; during ISel, the pattern will convert it to a jmp
57714      // with the NoTrack prefix.
57715 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57716 }
57717
57718 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57719}
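// Sketch of the path above: when the module carries the "cf-protection-branch"
// flag, the indirect jump-table branch is emitted as X86ISD::NT_BRIND instead
// of the generic BRIND lowering, and ISel selects a jmp carrying the notrack
// prefix for it.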
57720
57721TargetLowering::AndOrSETCCFoldKind
57722X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57723 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57724 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57725 EVT VT = LogicOp->getValueType(0);
57726 EVT OpVT = SETCC0->getOperand(0).getValueType();
57727 if (!VT.isInteger())
57728 return AndOrSETCCFoldKind::None;
57729
57730 if (VT.isVector())
57731 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57732 (isOperationLegal(ISD::ABS, OpVT)
57733 ? AndOrSETCCFoldKind::ABS
57734 : AndOrSETCCFoldKind::None));
57735
57736    // Don't use `NotAnd`: even though `not` is generally shorter code size than
57737    // `add`, `add` can lower to LEA, which can save moves / spills. In any case
57738    // where `NotAnd` applies, `AddAnd` does as well.
57739    // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
57740    // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57741 return AndOrSETCCFoldKind::AddAnd;
57742}
57743
57744bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57745 EVT VT = Op.getValueType();
57746 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57747 isa<ConstantSDNode>(Op.getOperand(1));
57748
57749 // i16 is legal, but undesirable since i16 instruction encodings are longer
57750 // and some i16 instructions are slow.
57751 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57752 // using LEA and/or other ALU ops.
57753 if (VT != MVT::i16 && !Is8BitMulByConstant)
57754 return false;
57755
57756 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57757 if (!Op.hasOneUse())
57758 return false;
57759 SDNode *User = *Op->use_begin();
57760 if (!ISD::isNormalStore(User))
57761 return false;
57762 auto *Ld = cast<LoadSDNode>(Load);
57763 auto *St = cast<StoreSDNode>(User);
57764 return Ld->getBasePtr() == St->getBasePtr();
57765 };
57766
57767 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57768 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57769 return false;
57770 if (!Op.hasOneUse())
57771 return false;
57772 SDNode *User = *Op->use_begin();
57773 if (User->getOpcode() != ISD::ATOMIC_STORE)
57774 return false;
57775 auto *Ld = cast<AtomicSDNode>(Load);
57776 auto *St = cast<AtomicSDNode>(User);
57777 return Ld->getBasePtr() == St->getBasePtr();
57778 };
57779
57780 bool Commute = false;
57781 switch (Op.getOpcode()) {
57782 default: return false;
57783 case ISD::SIGN_EXTEND:
57784 case ISD::ZERO_EXTEND:
57785 case ISD::ANY_EXTEND:
57786 break;
57787 case ISD::SHL:
57788 case ISD::SRA:
57789 case ISD::SRL: {
57790 SDValue N0 = Op.getOperand(0);
57791 // Look out for (store (shl (load), x)).
57792 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57793 return false;
57794 break;
57795 }
57796 case ISD::ADD:
57797 case ISD::MUL:
57798 case ISD::AND:
57799 case ISD::OR:
57800 case ISD::XOR:
57801 Commute = true;
57802 [[fallthrough]];
57803 case ISD::SUB: {
57804 SDValue N0 = Op.getOperand(0);
57805 SDValue N1 = Op.getOperand(1);
57806 // Avoid disabling potential load folding opportunities.
57807 if (X86::mayFoldLoad(N1, Subtarget) &&
57808 (!Commute || !isa<ConstantSDNode>(N0) ||
57809 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57810 return false;
57811 if (X86::mayFoldLoad(N0, Subtarget) &&
57812 ((Commute && !isa<ConstantSDNode>(N1)) ||
57813 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57814 return false;
57815 if (IsFoldableAtomicRMW(N0, Op) ||
57816 (Commute && IsFoldableAtomicRMW(N1, Op)))
57817 return false;
57818 }
57819 }
57820
57821 PVT = MVT::i32;
57822 return true;
57823}
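// Example of the decision above: a plain 16-bit add of two register values
// sets PVT = MVT::i32 and returns true, so it gets promoted and avoids the
// longer 16-bit encoding; something like (add (load p), x) where the load
// could be folded into the instruction (or an RMW pattern storing back to p)
// returns false and stays i16.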
57824
57825//===----------------------------------------------------------------------===//
57826// X86 Inline Assembly Support
57827//===----------------------------------------------------------------------===//
57828
57829// Helper to match a string separated by whitespace.
57830static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57831 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57832
57833 for (StringRef Piece : Pieces) {
57834 if (!S.startswith(Piece)) // Check if the piece matches.
57835 return false;
57836
57837 S = S.substr(Piece.size());
57838 StringRef::size_type Pos = S.find_first_not_of(" \t");
57839 if (Pos == 0) // We matched a prefix.
57840 return false;
57841
57842 S = S.substr(Pos);
57843 }
57844
57845 return S.empty();
57846}
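// matchAsm above skips leading whitespace and requires each piece to be
// followed by whitespace or end of string, e.g.
//   matchAsm("  bswap   $0", {"bswap", "$0"}) -> true
//   matchAsm("bswapl $0",    {"bswap", "$0"}) -> false (only a prefix matched)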
57847
57848static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57849
57850 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57851 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57852 llvm::is_contained(AsmPieces, "~{flags}") &&
57853 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57854
57855 if (AsmPieces.size() == 3)
57856 return true;
57857 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57858 return true;
57859 }
57860 }
57861 return false;
57862}
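// clobbersFlagRegisters above accepts exactly the flag-only clobber sets, e.g.
//   {"~{cc}", "~{flags}", "~{fpsr}"}                 -> true
//   {"~{cc}", "~{dirflag}", "~{flags}", "~{fpsr}"}   -> true
// and any other combination -> false.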
57863
57864bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57865 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57866
57867 const std::string &AsmStr = IA->getAsmString();
57868
57869 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57870 if (!Ty || Ty->getBitWidth() % 16 != 0)
57871 return false;
57872
57873 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57874 SmallVector<StringRef, 4> AsmPieces;
57875 SplitString(AsmStr, AsmPieces, ";\n");
57876
57877 switch (AsmPieces.size()) {
57878 default: return false;
57879 case 1:
57880 // FIXME: this should verify that we are targeting a 486 or better. If not,
57881 // we will turn this bswap into something that will be lowered to logical
57882 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57883 // lower so don't worry about this.
57884 // bswap $0
57885 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57886 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57887 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57888 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57889 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57890 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57891 // No need to check constraints, nothing other than the equivalent of
57892 // "=r,0" would be valid here.
57893 return IntrinsicLowering::LowerToByteSwap(CI);
57894 }
57895
57896 // rorw $$8, ${0:w} --> llvm.bswap.i16
57897 if (CI->getType()->isIntegerTy(16) &&
57898 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57899 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57900 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57901 AsmPieces.clear();
57902 StringRef ConstraintsStr = IA->getConstraintString();
57903 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57904 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57905 if (clobbersFlagRegisters(AsmPieces))
57906 return IntrinsicLowering::LowerToByteSwap(CI);
57907 }
57908 break;
57909 case 3:
57910 if (CI->getType()->isIntegerTy(32) &&
57911 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57912 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57913 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57914 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57915 AsmPieces.clear();
57916 StringRef ConstraintsStr = IA->getConstraintString();
57917 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57918 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57919 if (clobbersFlagRegisters(AsmPieces))
57920 return IntrinsicLowering::LowerToByteSwap(CI);
57921 }
57922
57923 if (CI->getType()->isIntegerTy(64)) {
57924 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57925 if (Constraints.size() >= 2 &&
57926 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57927 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57928 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57929 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57930 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57931 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57932 return IntrinsicLowering::LowerToByteSwap(CI);
57933 }
57934 }
57935 break;
57936 }
57937 return false;
57938}
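// Illustrative IR inputs accepted above (both become a bswap intrinsic):
//   call i32 asm "bswap $0", "=r,0"(i32 %x)
//   call i16 asm "rorw $$8, ${0:w}",
//        "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}"(i16 %x)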
57939
57940static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57941 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57942 .Case("{@cca}", X86::COND_A)
57943 .Case("{@ccae}", X86::COND_AE)
57944 .Case("{@ccb}", X86::COND_B)
57945 .Case("{@ccbe}", X86::COND_BE)
57946 .Case("{@ccc}", X86::COND_B)
57947 .Case("{@cce}", X86::COND_E)
57948 .Case("{@ccz}", X86::COND_E)
57949 .Case("{@ccg}", X86::COND_G)
57950 .Case("{@ccge}", X86::COND_GE)
57951 .Case("{@ccl}", X86::COND_L)
57952 .Case("{@ccle}", X86::COND_LE)
57953 .Case("{@ccna}", X86::COND_BE)
57954 .Case("{@ccnae}", X86::COND_B)
57955 .Case("{@ccnb}", X86::COND_AE)
57956 .Case("{@ccnbe}", X86::COND_A)
57957 .Case("{@ccnc}", X86::COND_AE)
57958 .Case("{@ccne}", X86::COND_NE)
57959 .Case("{@ccnz}", X86::COND_NE)
57960 .Case("{@ccng}", X86::COND_LE)
57961 .Case("{@ccnge}", X86::COND_L)
57962 .Case("{@ccnl}", X86::COND_GE)
57963 .Case("{@ccnle}", X86::COND_G)
57964 .Case("{@ccno}", X86::COND_NO)
57965 .Case("{@ccnp}", X86::COND_NP)
57966 .Case("{@ccns}", X86::COND_NS)
57967 .Case("{@cco}", X86::COND_O)
57968 .Case("{@ccp}", X86::COND_P)
57969 .Case("{@ccs}", X86::COND_S)
57970 .Default(X86::COND_INVALID);
57971 return Cond;
57972}
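// parseConstraintCode above maps GCC-style flag-output constraints, e.g.
//   parseConstraintCode("{@ccz}")  -> X86::COND_E
//   parseConstraintCode("{@ccnc}") -> X86::COND_AE
//   parseConstraintCode("r")       -> X86::COND_INVALID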
57973
57974/// Given a constraint letter, return the type of constraint for this target.
57975X86TargetLowering::ConstraintType
57976X86TargetLowering::getConstraintType(StringRef Constraint) const {
57977 if (Constraint.size() == 1) {
57978 switch (Constraint[0]) {
57979 case 'R':
57980 case 'q':
57981 case 'Q':
57982 case 'f':
57983 case 't':
57984 case 'u':
57985 case 'y':
57986 case 'x':
57987 case 'v':
57988 case 'l':
57989 case 'k': // AVX512 masking registers.
57990 return C_RegisterClass;
57991 case 'a':
57992 case 'b':
57993 case 'c':
57994 case 'd':
57995 case 'S':
57996 case 'D':
57997 case 'A':
57998 return C_Register;
57999 case 'I':
58000 case 'J':
58001 case 'K':
58002 case 'N':
58003 case 'G':
58004 case 'L':
58005 case 'M':
58006 return C_Immediate;
58007 case 'C':
58008 case 'e':
58009 case 'Z':
58010 return C_Other;
58011 default:
58012 break;
58013 }
58014 }
58015 else if (Constraint.size() == 2) {
58016 switch (Constraint[0]) {
58017 default:
58018 break;
58019 case 'Y':
58020 switch (Constraint[1]) {
58021 default:
58022 break;
58023 case 'z':
58024 return C_Register;
58025 case 'i':
58026 case 'm':
58027 case 'k':
58028 case 't':
58029 case '2':
58030 return C_RegisterClass;
58031 }
58032 }
58033 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58034 return C_Other;
58035 return TargetLowering::getConstraintType(Constraint);
58036}
58037
58038/// Examine constraint type and operand type and determine a weight value.
58039/// This object must already have been set up with the operand type
58040/// and the current alternative constraint selected.
58041TargetLowering::ConstraintWeight
58042 X86TargetLowering::getSingleConstraintMatchWeight(
58043 AsmOperandInfo &info, const char *constraint) const {
58044 ConstraintWeight weight = CW_Invalid;
58045 Value *CallOperandVal = info.CallOperandVal;
58046 // If we don't have a value, we can't do a match,
58047 // but allow it at the lowest weight.
58048 if (!CallOperandVal)
58049 return CW_Default;
58050 Type *type = CallOperandVal->getType();
58051 // Look at the constraint type.
58052 switch (*constraint) {
58053 default:
58054 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58055 [[fallthrough]];
58056 case 'R':
58057 case 'q':
58058 case 'Q':
58059 case 'a':
58060 case 'b':
58061 case 'c':
58062 case 'd':
58063 case 'S':
58064 case 'D':
58065 case 'A':
58066 if (CallOperandVal->getType()->isIntegerTy())
58067 weight = CW_SpecificReg;
58068 break;
58069 case 'f':
58070 case 't':
58071 case 'u':
58072 if (type->isFloatingPointTy())
58073 weight = CW_SpecificReg;
58074 break;
58075 case 'y':
58076 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58077 weight = CW_SpecificReg;
58078 break;
58079 case 'Y':
58080 if (StringRef(constraint).size() != 2)
58081 break;
58082 switch (constraint[1]) {
58083 default:
58084 return CW_Invalid;
58085 // XMM0
58086 case 'z':
58087 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58088 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58089 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58090 return CW_SpecificReg;
58091 return CW_Invalid;
58092 // Conditional OpMask regs (AVX512)
58093 case 'k':
58094 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58095 return CW_Register;
58096 return CW_Invalid;
58097 // Any MMX reg
58098 case 'm':
58099 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58100 return weight;
58101 return CW_Invalid;
58102 // Any SSE reg when ISA >= SSE2, same as 'x'
58103 case 'i':
58104 case 't':
58105 case '2':
58106 if (!Subtarget.hasSSE2())
58107 return CW_Invalid;
58108 break;
58109 }
58110 break;
58111 case 'v':
58112 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58113 weight = CW_Register;
58114 [[fallthrough]];
58115 case 'x':
58116 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58117 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58118 weight = CW_Register;
58119 break;
58120 case 'k':
58121 // Enable conditional vector operations using %k<#> registers.
58122 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58123 weight = CW_Register;
58124 break;
58125 case 'I':
58126 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58127 if (C->getZExtValue() <= 31)
58128 weight = CW_Constant;
58129 }
58130 break;
58131 case 'J':
58132 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58133 if (C->getZExtValue() <= 63)
58134 weight = CW_Constant;
58135 }
58136 break;
58137 case 'K':
58138 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58139 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58140 weight = CW_Constant;
58141 }
58142 break;
58143 case 'L':
58144 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58145 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58146 weight = CW_Constant;
58147 }
58148 break;
58149 case 'M':
58150 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58151 if (C->getZExtValue() <= 3)
58152 weight = CW_Constant;
58153 }
58154 break;
58155 case 'N':
58156 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58157 if (C->getZExtValue() <= 0xff)
58158 weight = CW_Constant;
58159 }
58160 break;
58161 case 'G':
58162 case 'C':
58163 if (isa<ConstantFP>(CallOperandVal)) {
58164 weight = CW_Constant;
58165 }
58166 break;
58167 case 'e':
58168 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58169 if ((C->getSExtValue() >= -0x80000000LL) &&
58170 (C->getSExtValue() <= 0x7fffffffLL))
58171 weight = CW_Constant;
58172 }
58173 break;
58174 case 'Z':
58175 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58176 if (C->getZExtValue() <= 0xffffffff)
58177 weight = CW_Constant;
58178 }
58179 break;
58180 }
58181 return weight;
58182}
58183
58184/// Try to replace an X constraint, which matches anything, with another that
58185/// has more specific requirements based on the type of the corresponding
58186/// operand.
58187const char *X86TargetLowering::
58188LowerXConstraint(EVT ConstraintVT) const {
58189 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58190 // 'f' like normal targets.
58191 if (ConstraintVT.isFloatingPoint()) {
58192 if (Subtarget.hasSSE1())
58193 return "x";
58194 }
58195
58196 return TargetLowering::LowerXConstraint(ConstraintVT);
58197}
58198
58199// Lower @cc targets via setcc.
58200SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58201 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58202 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58203 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58204 if (Cond == X86::COND_INVALID)
58205 return SDValue();
58206 // Check that return type is valid.
58207 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58208 OpInfo.ConstraintVT.getSizeInBits() < 8)
58209 report_fatal_error("Glue output operand is of invalid type");
58210
58211 // Get EFLAGS register. Only update chain when copyfrom is glued.
58212 if (Glue.getNode()) {
58213 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58214 Chain = Glue.getValue(1);
58215 } else
58216 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58217 // Extract CC code.
58218 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58219 // Extend to 32-bits
58220 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58221
58222 return Result;
58223}
58224
58225/// Lower the specified operand into the Ops vector.
58226/// If it is invalid, don't add anything to Ops.
58227void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58228 std::string &Constraint,
58229 std::vector<SDValue>&Ops,
58230 SelectionDAG &DAG) const {
58231 SDValue Result;
58232
58233 // Only support length 1 constraints for now.
58234 if (Constraint.length() > 1) return;
58235
58236 char ConstraintLetter = Constraint[0];
58237 switch (ConstraintLetter) {
58238 default: break;
58239 case 'I':
58240 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58241 if (C->getZExtValue() <= 31) {
58242 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58243 Op.getValueType());
58244 break;
58245 }
58246 }
58247 return;
58248 case 'J':
58249 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58250 if (C->getZExtValue() <= 63) {
58251 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58252 Op.getValueType());
58253 break;
58254 }
58255 }
58256 return;
58257 case 'K':
58258 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58259 if (isInt<8>(C->getSExtValue())) {
58260 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58261 Op.getValueType());
58262 break;
58263 }
58264 }
58265 return;
58266 case 'L':
58267 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58268 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58269 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58270 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58271 Op.getValueType());
58272 break;
58273 }
58274 }
58275 return;
58276 case 'M':
58277 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58278 if (C->getZExtValue() <= 3) {
58279 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58280 Op.getValueType());
58281 break;
58282 }
58283 }
58284 return;
58285 case 'N':
58286 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58287 if (C->getZExtValue() <= 255) {
58288 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58289 Op.getValueType());
58290 break;
58291 }
58292 }
58293 return;
58294 case 'O':
58295 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58296 if (C->getZExtValue() <= 127) {
58297 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58298 Op.getValueType());
58299 break;
58300 }
58301 }
58302 return;
58303 case 'e': {
58304 // 32-bit signed value
58305 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58306 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58307 C->getSExtValue())) {
58308 // Widen to 64 bits here to get it sign extended.
58309 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58310 break;
58311 }
58312 // FIXME gcc accepts some relocatable values here too, but only in certain
58313 // memory models; it's complicated.
58314 }
58315 return;
58316 }
58317 case 'Z': {
58318 // 32-bit unsigned value
58319 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58320 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58321 C->getZExtValue())) {
58322 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58323 Op.getValueType());
58324 break;
58325 }
58326 }
58327 // FIXME gcc accepts some relocatable values here too, but only in certain
58328 // memory models; it's complicated.
58329 return;
58330 }
58331 case 'i': {
58332 // Literal immediates are always ok.
58333 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58334 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58335 BooleanContent BCont = getBooleanContents(MVT::i64);
58336 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58337 : ISD::SIGN_EXTEND;
58338 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58339 : CST->getSExtValue();
58340 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58341 break;
58342 }
58343
58344 // In any sort of PIC mode addresses need to be computed at runtime by
58345 // adding in a register or some sort of table lookup. These can't
58346 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58347 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58348 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58349 return;
58350
58351 // If we are in non-pic codegen mode, we allow the address of a global (with
58352 // an optional displacement) to be used with 'i'.
58353 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58354 // If we require an extra load to get this address, as in PIC mode, we
58355 // can't accept it.
58356 if (isGlobalStubReference(
58357 Subtarget.classifyGlobalReference(GA->getGlobal())))
58358 return;
58359 break;
58360 }
58361 }
58362
58363 if (Result.getNode()) {
58364 Ops.push_back(Result);
58365 return;
58366 }
58367 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58368}
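// Rough summary of the immediate constraints handled above: 'I' 0..31,
// 'J' 0..63, 'K' any signed 8-bit value, 'L' 0xff/0xffff (plus 0xffffffff in
// 64-bit mode), 'M' 0..3, 'N' 0..255, 'O' 0..127, 'e'/'Z' 32-bit
// signed/unsigned values, and 'i' literal immediates or (non-PIC) global
// addresses; everything else defers to the TargetLowering default.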
58369
58370/// Check if \p RC is a general purpose register class.
58371/// I.e., GR* or one of their variant.
58372static bool isGRClass(const TargetRegisterClass &RC) {
58373 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58374 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58375 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58376 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58377 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58378}
58379
58380/// Check if \p RC is a vector register class.
58381/// I.e., FR* / VR* or one of their variant.
58382static bool isFRClass(const TargetRegisterClass &RC) {
58383 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58384 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58385 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58386 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58387 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58388 RC.hasSuperClassEq(&X86::VR512RegClass);
58389}
58390
58391/// Check if \p RC is a mask register class.
58392/// I.e., VK* or one of their variant.
58393static bool isVKClass(const TargetRegisterClass &RC) {
58394 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58395 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58396 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58397 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58398 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58399 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58400 RC.hasSuperClassEq(&X86::VK64RegClass);
58401}
58402
58403std::pair<unsigned, const TargetRegisterClass *>
58404X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58405 StringRef Constraint,
58406 MVT VT) const {
58407 // First, see if this is a constraint that directly corresponds to an LLVM
58408 // register class.
58409 if (Constraint.size() == 1) {
58410 // GCC Constraint Letters
58411 switch (Constraint[0]) {
58412 default: break;
58413 // 'A' means [ER]AX + [ER]DX.
58414 case 'A':
58415 if (Subtarget.is64Bit())
58416 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58417      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58418             "Expecting 64, 32 or 16 bit subtarget");
58419 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58420
58421 // TODO: Slight differences here in allocation order and leaving
58422 // RIP in the class. Do they matter any more here than they do
58423 // in the normal allocation?
58424 case 'k':
58425 if (Subtarget.hasAVX512()) {
58426 if (VT == MVT::i1)
58427 return std::make_pair(0U, &X86::VK1RegClass);
58428 if (VT == MVT::i8)
58429 return std::make_pair(0U, &X86::VK8RegClass);
58430 if (VT == MVT::i16)
58431 return std::make_pair(0U, &X86::VK16RegClass);
58432 }
58433 if (Subtarget.hasBWI()) {
58434 if (VT == MVT::i32)
58435 return std::make_pair(0U, &X86::VK32RegClass);
58436 if (VT == MVT::i64)
58437 return std::make_pair(0U, &X86::VK64RegClass);
58438 }
58439 break;
58440 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58441 if (Subtarget.is64Bit()) {
58442 if (VT == MVT::i8 || VT == MVT::i1)
58443 return std::make_pair(0U, &X86::GR8RegClass);
58444 if (VT == MVT::i16)
58445 return std::make_pair(0U, &X86::GR16RegClass);
58446 if (VT == MVT::i32 || VT == MVT::f32)
58447 return std::make_pair(0U, &X86::GR32RegClass);
58448 if (VT != MVT::f80 && !VT.isVector())
58449 return std::make_pair(0U, &X86::GR64RegClass);
58450 break;
58451 }
58452 [[fallthrough]];
58453 // 32-bit fallthrough
58454 case 'Q': // Q_REGS
58455 if (VT == MVT::i8 || VT == MVT::i1)
58456 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58457 if (VT == MVT::i16)
58458 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58459 if (VT == MVT::i32 || VT == MVT::f32 ||
58460 (!VT.isVector() && !Subtarget.is64Bit()))
58461 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58462 if (VT != MVT::f80 && !VT.isVector())
58463 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58464 break;
58465 case 'r': // GENERAL_REGS
58466 case 'l': // INDEX_REGS
58467 if (VT == MVT::i8 || VT == MVT::i1)
58468 return std::make_pair(0U, &X86::GR8RegClass);
58469 if (VT == MVT::i16)
58470 return std::make_pair(0U, &X86::GR16RegClass);
58471 if (VT == MVT::i32 || VT == MVT::f32 ||
58472 (!VT.isVector() && !Subtarget.is64Bit()))
58473 return std::make_pair(0U, &X86::GR32RegClass);
58474 if (VT != MVT::f80 && !VT.isVector())
58475 return std::make_pair(0U, &X86::GR64RegClass);
58476 break;
58477 case 'R': // LEGACY_REGS
58478 if (VT == MVT::i8 || VT == MVT::i1)
58479 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58480 if (VT == MVT::i16)
58481 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58482 if (VT == MVT::i32 || VT == MVT::f32 ||
58483 (!VT.isVector() && !Subtarget.is64Bit()))
58484 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58485 if (VT != MVT::f80 && !VT.isVector())
58486 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58487 break;
58488 case 'f': // FP Stack registers.
58489 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58490 // value to the correct fpstack register class.
58491 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58492 return std::make_pair(0U, &X86::RFP32RegClass);
58493 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58494 return std::make_pair(0U, &X86::RFP64RegClass);
58495 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58496 return std::make_pair(0U, &X86::RFP80RegClass);
58497 break;
58498 case 'y': // MMX_REGS if MMX allowed.
58499 if (!Subtarget.hasMMX()) break;
58500 return std::make_pair(0U, &X86::VR64RegClass);
58501 case 'v':
58502 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58503 if (!Subtarget.hasSSE1()) break;
58504 bool VConstraint = (Constraint[0] == 'v');
58505
58506 switch (VT.SimpleTy) {
58507 default: break;
58508 // Scalar SSE types.
58509 case MVT::f16:
58510 if (VConstraint && Subtarget.hasFP16())
58511 return std::make_pair(0U, &X86::FR16XRegClass);
58512 break;
58513 case MVT::f32:
58514 case MVT::i32:
58515 if (VConstraint && Subtarget.hasVLX())
58516 return std::make_pair(0U, &X86::FR32XRegClass);
58517 return std::make_pair(0U, &X86::FR32RegClass);
58518 case MVT::f64:
58519 case MVT::i64:
58520 if (VConstraint && Subtarget.hasVLX())
58521 return std::make_pair(0U, &X86::FR64XRegClass);
58522 return std::make_pair(0U, &X86::FR64RegClass);
58523 case MVT::i128:
58524 if (Subtarget.is64Bit()) {
58525 if (VConstraint && Subtarget.hasVLX())
58526 return std::make_pair(0U, &X86::VR128XRegClass);
58527 return std::make_pair(0U, &X86::VR128RegClass);
58528 }
58529 break;
58530 // Vector types and fp128.
58531 case MVT::v8f16:
58532 if (!Subtarget.hasFP16())
58533 break;
58534 [[fallthrough]];
58535 case MVT::f128:
58536 case MVT::v16i8:
58537 case MVT::v8i16:
58538 case MVT::v4i32:
58539 case MVT::v2i64:
58540 case MVT::v4f32:
58541 case MVT::v2f64:
58542 if (VConstraint && Subtarget.hasVLX())
58543 return std::make_pair(0U, &X86::VR128XRegClass);
58544 return std::make_pair(0U, &X86::VR128RegClass);
58545 // AVX types.
58546 case MVT::v16f16:
58547 if (!Subtarget.hasFP16())
58548 break;
58549 [[fallthrough]];
58550 case MVT::v32i8:
58551 case MVT::v16i16:
58552 case MVT::v8i32:
58553 case MVT::v4i64:
58554 case MVT::v8f32:
58555 case MVT::v4f64:
58556 if (VConstraint && Subtarget.hasVLX())
58557 return std::make_pair(0U, &X86::VR256XRegClass);
58558 if (Subtarget.hasAVX())
58559 return std::make_pair(0U, &X86::VR256RegClass);
58560 break;
58561 case MVT::v32f16:
58562 if (!Subtarget.hasFP16())
58563 break;
58564 [[fallthrough]];
58565 case MVT::v64i8:
58566 case MVT::v32i16:
58567 case MVT::v8f64:
58568 case MVT::v16f32:
58569 case MVT::v16i32:
58570 case MVT::v8i64:
58571 if (!Subtarget.hasAVX512()) break;
58572 if (VConstraint)
58573 return std::make_pair(0U, &X86::VR512RegClass);
58574 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58575 }
58576 break;
58577 }
58578 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58579 switch (Constraint[1]) {
58580 default:
58581 break;
58582 case 'i':
58583 case 't':
58584 case '2':
58585 return getRegForInlineAsmConstraint(TRI, "x", VT);
58586 case 'm':
58587 if (!Subtarget.hasMMX()) break;
58588 return std::make_pair(0U, &X86::VR64RegClass);
58589 case 'z':
58590 if (!Subtarget.hasSSE1()) break;
58591 switch (VT.SimpleTy) {
58592 default: break;
58593 // Scalar SSE types.
58594 case MVT::f16:
58595 if (!Subtarget.hasFP16())
58596 break;
58597 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58598 case MVT::f32:
58599 case MVT::i32:
58600 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58601 case MVT::f64:
58602 case MVT::i64:
58603 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58604 case MVT::v8f16:
58605 if (!Subtarget.hasFP16())
58606 break;
58607 [[fallthrough]];
58608 case MVT::f128:
58609 case MVT::v16i8:
58610 case MVT::v8i16:
58611 case MVT::v4i32:
58612 case MVT::v2i64:
58613 case MVT::v4f32:
58614 case MVT::v2f64:
58615 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58616 // AVX types.
58617 case MVT::v16f16:
58618 if (!Subtarget.hasFP16())
58619 break;
58620 [[fallthrough]];
58621 case MVT::v32i8:
58622 case MVT::v16i16:
58623 case MVT::v8i32:
58624 case MVT::v4i64:
58625 case MVT::v8f32:
58626 case MVT::v4f64:
58627 if (Subtarget.hasAVX())
58628 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58629 break;
58630 case MVT::v32f16:
58631 if (!Subtarget.hasFP16())
58632 break;
58633 [[fallthrough]];
58634 case MVT::v64i8:
58635 case MVT::v32i16:
58636 case MVT::v8f64:
58637 case MVT::v16f32:
58638 case MVT::v16i32:
58639 case MVT::v8i64:
58640 if (Subtarget.hasAVX512())
58641 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58642 break;
58643 }
58644 break;
58645 case 'k':
58646      // This register class doesn't allocate k0 for masked vector operations.
58647 if (Subtarget.hasAVX512()) {
58648 if (VT == MVT::i1)
58649 return std::make_pair(0U, &X86::VK1WMRegClass);
58650 if (VT == MVT::i8)
58651 return std::make_pair(0U, &X86::VK8WMRegClass);
58652 if (VT == MVT::i16)
58653 return std::make_pair(0U, &X86::VK16WMRegClass);
58654 }
58655 if (Subtarget.hasBWI()) {
58656 if (VT == MVT::i32)
58657 return std::make_pair(0U, &X86::VK32WMRegClass);
58658 if (VT == MVT::i64)
58659 return std::make_pair(0U, &X86::VK64WMRegClass);
58660 }
58661 break;
58662 }
58663 }
58664
58665 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58666 return std::make_pair(0U, &X86::GR32RegClass);
58667
58668 // Use the default implementation in TargetLowering to convert the register
58669 // constraint into a member of a register class.
58670 std::pair<Register, const TargetRegisterClass*> Res;
58671 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58672
58673 // Not found as a standard register?
58674 if (!Res.second) {
58675 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58676 // to/from f80.
58677 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58678 // Map st(0) -> st(7) -> ST0
58679 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58680 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58681 Constraint[3] == '(' &&
58682 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58683 Constraint[5] == ')' && Constraint[6] == '}') {
58684 // st(7) is not allocatable and thus not a member of RFP80. Return
58685 // singleton class in cases where we have a reference to it.
58686 if (Constraint[4] == '7')
58687 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58688 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58689 &X86::RFP80RegClass);
58690 }
58691
58692 // GCC allows "st(0)" to be called just plain "st".
58693 if (StringRef("{st}").equals_insensitive(Constraint))
58694 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58695 }
58696
58697 // flags -> EFLAGS
58698 if (StringRef("{flags}").equals_insensitive(Constraint))
58699 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58700
58701 // dirflag -> DF
58702 // Only allow for clobber.
58703 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58704 VT == MVT::Other)
58705 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58706
58707 // fpsr -> FPSW
58708 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58709 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58710
58711 return Res;
58712 }
58713
58714 // Make sure it isn't a register that requires 64-bit mode.
58715 if (!Subtarget.is64Bit() &&
58716 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58717 TRI->getEncodingValue(Res.first) >= 8) {
58718 // Register requires REX prefix, but we're in 32-bit mode.
58719 return std::make_pair(0, nullptr);
58720 }
58721
58722 // Make sure it isn't a register that requires AVX512.
58723 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58724 TRI->getEncodingValue(Res.first) & 0x10) {
58725 // Register requires EVEX prefix.
58726 return std::make_pair(0, nullptr);
58727 }
58728
58729 // Otherwise, check to see if this is a register class of the wrong value
58730 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58731 // turn into {ax},{dx}.
58732 // MVT::Other is used to specify clobber names.
58733 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58734 return Res; // Correct type already, nothing to do.
58735
58736  // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
58737  // return "eax". This should even work for things like getting 64-bit integer
58738  // registers when given an f64 type.
58739 const TargetRegisterClass *Class = Res.second;
58740 // The generic code will match the first register class that contains the
58741 // given register. Thus, based on the ordering of the tablegened file,
58742 // the "plain" GR classes might not come first.
58743 // Therefore, use a helper method.
58744 if (isGRClass(*Class)) {
58745 unsigned Size = VT.getSizeInBits();
58746 if (Size == 1) Size = 8;
58747 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58748 return std::make_pair(0, nullptr);
58749 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58750 if (DestReg.isValid()) {
58751 bool is64Bit = Subtarget.is64Bit();
58752 const TargetRegisterClass *RC =
58753 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58754 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58755 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58756 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58757 if (Size == 64 && !is64Bit) {
58758 // Model GCC's behavior here and select a fixed pair of 32-bit
58759 // registers.
58760 switch (DestReg) {
58761 case X86::RAX:
58762 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58763 case X86::RDX:
58764 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58765 case X86::RCX:
58766 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58767 case X86::RBX:
58768 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58769 case X86::RSI:
58770 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58771 case X86::RDI:
58772 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58773 case X86::RBP:
58774 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58775 default:
58776 return std::make_pair(0, nullptr);
58777 }
58778 }
58779 if (RC && RC->contains(DestReg))
58780 return std::make_pair(DestReg, RC);
58781 return Res;
58782 }
58783 // No register found/type mismatch.
58784 return std::make_pair(0, nullptr);
58785 } else if (isFRClass(*Class)) {
58786 // Handle references to XMM physical registers that got mapped into the
58787 // wrong class. This can happen with constraints like {xmm0} where the
58788 // target independent register mapper will just pick the first match it can
58789 // find, ignoring the required type.
58790
58791 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58792 if (VT == MVT::f16)
58793 Res.second = &X86::FR16XRegClass;
58794 else if (VT == MVT::f32 || VT == MVT::i32)
58795 Res.second = &X86::FR32XRegClass;
58796 else if (VT == MVT::f64 || VT == MVT::i64)
58797 Res.second = &X86::FR64XRegClass;
58798 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58799 Res.second = &X86::VR128XRegClass;
58800 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58801 Res.second = &X86::VR256XRegClass;
58802 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58803 Res.second = &X86::VR512RegClass;
58804 else {
58805      // Type mismatch and not a clobber: return an error.
58806 Res.first = 0;
58807 Res.second = nullptr;
58808 }
58809 } else if (isVKClass(*Class)) {
58810 if (VT == MVT::i1)
58811 Res.second = &X86::VK1RegClass;
58812 else if (VT == MVT::i8)
58813 Res.second = &X86::VK8RegClass;
58814 else if (VT == MVT::i16)
58815 Res.second = &X86::VK16RegClass;
58816 else if (VT == MVT::i32)
58817 Res.second = &X86::VK32RegClass;
58818 else if (VT == MVT::i64)
58819 Res.second = &X86::VK64RegClass;
58820 else {
58821      // Type mismatch and not a clobber: return an error.
58822 Res.first = 0;
58823 Res.second = nullptr;
58824 }
58825 }
58826
58827 return Res;
58828}
58829
58830bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58831 // Integer division on x86 is expensive. However, when aggressively optimizing
58832 // for code size, we prefer to use a div instruction, as it is usually smaller
58833 // than the alternative sequence.
58834 // The exception to this is vector division. Since x86 doesn't have vector
58835 // integer division, leaving the division as-is is a loss even in terms of
58836 // size, because it will have to be scalarized, while the alternative code
58837 // sequence can be performed in vector form.
58838 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58839 return OptSize && !VT.isVector();
58840}
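// Example: with the minsize attribute a scalar i32 udiv/sdiv by a constant is
// kept as a real div instruction (smaller encoding); without minsize, or for
// any vector type, the usual multiply/shift expansion is preferred.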
58841
58842void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58843 if (!Subtarget.is64Bit())
58844 return;
58845
58846 // Update IsSplitCSR in X86MachineFunctionInfo.
58847 X86MachineFunctionInfo *AFI =
58848 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58849 AFI->setIsSplitCSR(true);
58850}
58851
58852void X86TargetLowering::insertCopiesSplitCSR(
58853 MachineBasicBlock *Entry,
58854 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58855 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58856 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58857 if (!IStart)
58858 return;
58859
58860 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58861 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58862 MachineBasicBlock::iterator MBBI = Entry->begin();
58863 for (const MCPhysReg *I = IStart; *I; ++I) {
58864 const TargetRegisterClass *RC = nullptr;
58865 if (X86::GR64RegClass.contains(*I))
58866 RC = &X86::GR64RegClass;
58867 else
58868      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58869
58870 Register NewVR = MRI->createVirtualRegister(RC);
58871 // Create copy from CSR to a virtual register.
58872 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58873 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58874 // nounwind. If we want to generalize this later, we may need to emit
58875 // CFI pseudo-instructions.
58876    assert(
58877        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58878        "Function should be nounwind in insertCopiesSplitCSR!");
58879 Entry->addLiveIn(*I);
58880 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
58881 .addReg(*I);
58882
58883 // Insert the copy-back instructions right before the terminator.
58884 for (auto *Exit : Exits)
58885 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
58886 TII->get(TargetOpcode::COPY), *I)
58887 .addReg(NewVR);
58888 }
58889}
58890
58891bool X86TargetLowering::supportSwiftError() const {
58892 return Subtarget.is64Bit();
58893}
58894
58895/// Returns true if stack probing through a function call is requested.
58896bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58897 return !getStackProbeSymbolName(MF).empty();
58898}
58899
58900/// Returns true if stack probing through inline assembly is requested.
58901bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58902
58903 // No inline stack probe for Windows, they have their own mechanism.
58904 if (Subtarget.isOSWindows() ||
58905 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58906 return false;
58907
58908 // If the function specifically requests inline stack probes, emit them.
58909 if (MF.getFunction().hasFnAttribute("probe-stack"))
58910 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58911 "inline-asm";
58912
58913 return false;
58914}
58915
58916/// Returns the name of the symbol used to emit stack probes or the empty
58917/// string if not applicable.
58918StringRef
58919X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58920  // Inline stack probes disable the stack probe call.
58921 if (hasInlineStackProbe(MF))
58922 return "";
58923
58924 // If the function specifically requests stack probes, emit them.
58925 if (MF.getFunction().hasFnAttribute("probe-stack"))
58926 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58927
58928 // Generally, if we aren't on Windows, the platform ABI does not include
58929 // support for stack probes, so don't emit them.
58930 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58931 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58932 return "";
58933
58934 // We need a stack probe to conform to the Windows ABI. Choose the right
58935 // symbol.
58936 if (Subtarget.is64Bit())
58937 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58938 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58939}
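// Symbols chosen above when no "probe-stack" attribute overrides them:
//   64-bit MinGW -> "___chkstk_ms"   64-bit MSVC-style -> "__chkstk"
//   32-bit MinGW -> "_alloca"        32-bit MSVC-style -> "_chkstk"
// Non-Windows (and MachO) targets get "" and therefore no probe call.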
58940
58941unsigned
58942X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58943  // The default stack probe size is 4096 if the function has no
58944  // "stack-probe-size" attribute.
58945 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58946 4096);
58947}
58948
58949Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58950 if (ML->isInnermost() &&
58951 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58952 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58953 return TargetLowering::getPrefLoopAlignment();
58954}
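// E.g. if the experimental innermost-loop-alignment option is given the value
// 5 on the command line, innermost loops are aligned to 1 << 5 = 32 bytes;
// otherwise the generic TargetLowering preference applies.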