Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 13330, column 55
The result of the '%' expression is undefined

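The message above is produced by the analyzer's core checks for undefined binary-operator results: a '%' expression is flagged when one of its operands may carry an uninitialized (garbage) value along some feasible path, while a divisor that is plainly zero is reported by a separate division-by-zero check. Line 13330 itself lies past the excerpt reproduced below, so the snippet that follows is only a minimal, hypothetical sketch of the pattern behind this kind of report; the function pickLane and its parameters are invented for illustration and do not appear in X86ISelLowering.cpp.

// Hypothetical example of the diagnosed pattern, not the code at line 13330.
static int pickLane(bool HasHint, int Hint, int NumLanes) {
  int Idx;                 // deliberately left uninitialized
  if (HasHint)
    Idx = Hint;
  // When HasHint is false, Idx still holds a garbage value here, so the
  // analyzer reports that the result of 'Idx % NumLanes' is undefined.
  return Idx % NumLanes;
}

Running the same core checkers over a function like this reproduces a report of the same form, which can help when judging whether the path the analyzer describes for line 13330 is actually feasible in the real code.
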
Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-04-13-070520-16501-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Signed saturation subtraction.
233 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
234 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
235 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
238
239 // Funnel shifts.
240 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
241 // For slow shld targets we only lower for code size.
242 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
243
244 setOperationAction(ShiftOp , MVT::i8 , Custom);
245 setOperationAction(ShiftOp , MVT::i16 , Custom);
246 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
247 if (Subtarget.is64Bit())
248 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
249 }
250
251 if (!Subtarget.useSoftFloat()) {
252 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
253 // operation.
254 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
255 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
256 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
257 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
258 // We have an algorithm for SSE2, and we turn this into a 64-bit
259 // FILD or VCVTUSI2SS/SD for other targets.
260 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
261 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
262 // We have an algorithm for SSE2->double, and we turn this into a
263 // 64-bit FILD followed by conditional FADD for other targets.
264 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
265 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
266
267 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
268 // this operation.
269 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
271 // SSE has no i16 to fp conversion, only i32. We promote in the handler
272 // to allow f80 to use i16 and f64 to use i16 with sse1 only
273 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
275 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
276 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
277 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
278 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
279 // are Legal, f80 is custom lowered.
280 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
281 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
282
283 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
284 // this operation.
285 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
286 // FIXME: This doesn't generate invalid exception when it should. PR44019.
287 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
288 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
289 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
290 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
291 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
292 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
293 // are Legal, f80 is custom lowered.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
295 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
296
297 // Handle FP_TO_UINT by promoting the destination to a larger signed
298 // conversion.
299 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
300 // FIXME: This doesn't generate invalid exception when it should. PR44019.
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
302 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
303 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
305 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
306 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
307 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
308 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
309
310 setOperationAction(ISD::LRINT, MVT::f32, Custom);
311 setOperationAction(ISD::LRINT, MVT::f64, Custom);
312 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
313 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
314
315 if (!Subtarget.is64Bit()) {
316 setOperationAction(ISD::LRINT, MVT::i64, Custom);
317 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
318 }
319 }
320
321 if (Subtarget.hasSSE2()) {
322 // Custom lowering for saturating float to int conversions.
323 // We handle promotion to larger result types manually.
324 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
325 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
326 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
327 }
328 if (Subtarget.is64Bit()) {
329 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
330 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
331 }
332 }
333
334 // Handle address space casts between mixed sized pointers.
335 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
337
338 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
339 if (!Subtarget.hasSSE2()) {
340 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
341 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
342 if (Subtarget.is64Bit()) {
343 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
344 // Without SSE, i64->f64 goes through memory.
345 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
346 }
347 } else if (!Subtarget.is64Bit())
348 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
349
350 // Scalar integer divide and remainder are lowered to use operations that
351 // produce two results, to match the available instructions. This exposes
352 // the two-result form to trivial CSE, which is able to combine x/y and x%y
353 // into a single instruction.
354 //
355 // Scalar integer multiply-high is also lowered to use two-result
356 // operations, to match the available instructions. However, plain multiply
357 // (low) operations are left as Legal, as there are single-result
358 // instructions for this in x86. Using the two-result multiply instructions
359 // when both high and low results are needed must be arranged by dagcombine.
360 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
361 setOperationAction(ISD::MULHS, VT, Expand);
362 setOperationAction(ISD::MULHU, VT, Expand);
363 setOperationAction(ISD::SDIV, VT, Expand);
364 setOperationAction(ISD::UDIV, VT, Expand);
365 setOperationAction(ISD::SREM, VT, Expand);
366 setOperationAction(ISD::UREM, VT, Expand);
367 }
368
369 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
370 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
371 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
372 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 setOperationAction(ISD::BR_CC, VT, Expand);
374 setOperationAction(ISD::SELECT_CC, VT, Expand);
375 }
376 if (Subtarget.is64Bit())
377 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
378 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
379 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
380 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
381
382 setOperationAction(ISD::FREM , MVT::f32 , Expand);
383 setOperationAction(ISD::FREM , MVT::f64 , Expand);
384 setOperationAction(ISD::FREM , MVT::f80 , Expand);
385 setOperationAction(ISD::FREM , MVT::f128 , Expand);
386
387 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
388 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
389 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
390 }
391
392 // Promote the i8 variants and force them on up to i32 which has a shorter
393 // encoding.
394 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
395 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
396 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
397 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
398 // promote that too.
399 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
400 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
401
402 if (!Subtarget.hasBMI()) {
403 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
405 if (Subtarget.is64Bit()) {
406 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
407 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
408 }
409 }
410
411 if (Subtarget.hasLZCNT()) {
412 // When promoting the i8 variants, force them to i32 for a shorter
413 // encoding.
414 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
415 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
416 } else {
417 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
418 if (VT == MVT::i64 && !Subtarget.is64Bit())
419 continue;
420 setOperationAction(ISD::CTLZ , VT, Custom);
421 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
422 }
423 }
424
425 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
426 ISD::STRICT_FP_TO_FP16}) {
427 // Special handling for half-precision floating point conversions.
428 // If we don't have F16C support, then lower half float conversions
429 // into library calls.
430 setOperationAction(
431 Op, MVT::f32,
432 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
433 // There's never any support for operations beyond MVT::f32.
434 setOperationAction(Op, MVT::f64, Expand);
435 setOperationAction(Op, MVT::f80, Expand);
436 setOperationAction(Op, MVT::f128, Expand);
437 }
438
439 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
440 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
441 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
442 setTruncStoreAction(VT, MVT::f16, Expand);
443 setTruncStoreAction(VT, MVT::bf16, Expand);
444
445 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
446 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
447 }
448
449 setOperationAction(ISD::PARITY, MVT::i8, Custom);
450 setOperationAction(ISD::PARITY, MVT::i16, Custom);
451 setOperationAction(ISD::PARITY, MVT::i32, Custom);
452 if (Subtarget.is64Bit())
453 setOperationAction(ISD::PARITY, MVT::i64, Custom);
454 if (Subtarget.hasPOPCNT()) {
455 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
456 // popcntw is longer to encode than popcntl and also has a false dependency
457 // on the dest that popcntl hasn't had since Cannon Lake.
458 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
459 } else {
460 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
461 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
462 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
463 if (Subtarget.is64Bit())
464 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
465 else
466 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
467 }
468
469 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
470
471 if (!Subtarget.hasMOVBE())
472 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
473
474 // X86 wants to expand cmov itself.
475 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
476 setOperationAction(ISD::SELECT, VT, Custom);
477 setOperationAction(ISD::SETCC, VT, Custom);
478 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
479 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
480 }
481 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
482 if (VT == MVT::i64 && !Subtarget.is64Bit())
483 continue;
484 setOperationAction(ISD::SELECT, VT, Custom);
485 setOperationAction(ISD::SETCC, VT, Custom);
486 }
487
488 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
489 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
490 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
491
492 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
493 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
494 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
495 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
496 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
497 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
498 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
499 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
500
501 // Darwin ABI issue.
502 for (auto VT : { MVT::i32, MVT::i64 }) {
503 if (VT == MVT::i64 && !Subtarget.is64Bit())
504 continue;
505 setOperationAction(ISD::ConstantPool , VT, Custom);
506 setOperationAction(ISD::JumpTable , VT, Custom);
507 setOperationAction(ISD::GlobalAddress , VT, Custom);
508 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
509 setOperationAction(ISD::ExternalSymbol , VT, Custom);
510 setOperationAction(ISD::BlockAddress , VT, Custom);
511 }
512
513 // 64-bit shl, sra, srl (iff 32-bit x86)
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
517 setOperationAction(ISD::SHL_PARTS, VT, Custom);
518 setOperationAction(ISD::SRA_PARTS, VT, Custom);
519 setOperationAction(ISD::SRL_PARTS, VT, Custom);
520 }
521
522 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
523 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
524
525 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
526
527 // Expand certain atomics
528 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
529 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
530 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
531 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
532 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
533 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
534 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
535 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
536 }
537
538 if (!Subtarget.is64Bit())
539 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
540
541 if (Subtarget.canUseCMPXCHG16B())
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
543
544 // FIXME - use subtarget debug flags
545 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
546 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
547 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
548 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
549 }
550
551 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
552 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
553
554 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
555 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
556
557 setOperationAction(ISD::TRAP, MVT::Other, Legal);
558 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
559 if (Subtarget.isTargetPS())
560 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
561 else
562 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
563
564 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
565 setOperationAction(ISD::VASTART , MVT::Other, Custom);
566 setOperationAction(ISD::VAEND , MVT::Other, Expand);
567 bool Is64Bit = Subtarget.is64Bit();
568 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
569 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
570
571 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
572 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
575
576 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
577 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
578 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
579
580 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
581
582 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
583 setOperationAction(ISD::FABS, VT, Action);
584 setOperationAction(ISD::FNEG, VT, Action);
585 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
586 setOperationAction(ISD::FREM, VT, Action);
587 setOperationAction(ISD::FMA, VT, Action);
588 setOperationAction(ISD::FMINNUM, VT, Action);
589 setOperationAction(ISD::FMAXNUM, VT, Action);
590 setOperationAction(ISD::FMINIMUM, VT, Action);
591 setOperationAction(ISD::FMAXIMUM, VT, Action);
592 setOperationAction(ISD::FSIN, VT, Action);
593 setOperationAction(ISD::FCOS, VT, Action);
594 setOperationAction(ISD::FSINCOS, VT, Action);
595 setOperationAction(ISD::FSQRT, VT, Action);
596 setOperationAction(ISD::FPOW, VT, Action);
597 setOperationAction(ISD::FLOG, VT, Action);
598 setOperationAction(ISD::FLOG2, VT, Action);
599 setOperationAction(ISD::FLOG10, VT, Action);
600 setOperationAction(ISD::FEXP, VT, Action);
601 setOperationAction(ISD::FEXP2, VT, Action);
602 setOperationAction(ISD::FCEIL, VT, Action);
603 setOperationAction(ISD::FFLOOR, VT, Action);
604 setOperationAction(ISD::FNEARBYINT, VT, Action);
605 setOperationAction(ISD::FRINT, VT, Action);
606 setOperationAction(ISD::BR_CC, VT, Action);
607 setOperationAction(ISD::SETCC, VT, Action);
608 setOperationAction(ISD::SELECT, VT, Custom);
609 setOperationAction(ISD::SELECT_CC, VT, Action);
610 setOperationAction(ISD::FROUND, VT, Action);
611 setOperationAction(ISD::FROUNDEVEN, VT, Action);
612 setOperationAction(ISD::FTRUNC, VT, Action);
613 };
614
615 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
616 // f16, f32 and f64 use SSE.
617 // Set up the FP register classes.
618 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
619 : &X86::FR16RegClass);
620 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
621 : &X86::FR32RegClass);
622 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
623 : &X86::FR64RegClass);
624
625 // Disable f32->f64 extload as we can only generate this in one instruction
626 // under optsize. So it's easier to pattern match (fpext (load)) for that
627 // case instead of needing to emit 2 instructions for extload in the
628 // non-optsize case.
629 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 // Use ANDPD to simulate FABS.
633 setOperationAction(ISD::FABS, VT, Custom);
634
635 // Use XORP to simulate FNEG.
636 setOperationAction(ISD::FNEG, VT, Custom);
637
638 // Use ANDPD and ORPD to simulate FCOPYSIGN.
639 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
640
641 // These might be better off as horizontal vector ops.
642 setOperationAction(ISD::FADD, VT, Custom);
643 setOperationAction(ISD::FSUB, VT, Custom);
644
645 // We don't support sin/cos/fmod
646 setOperationAction(ISD::FSIN , VT, Expand);
647 setOperationAction(ISD::FCOS , VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 }
650
651 // Half type will be promoted by default.
652 setF16Action(MVT::f16, Promote);
653 setOperationAction(ISD::FADD, MVT::f16, Promote);
654 setOperationAction(ISD::FSUB, MVT::f16, Promote);
655 setOperationAction(ISD::FMUL, MVT::f16, Promote);
656 setOperationAction(ISD::FDIV, MVT::f16, Promote);
657 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
658 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
659 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
660
661 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
687 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
689
690 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
691 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
692
693 // Lower this to MOVMSK plus an AND.
694 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
695 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
696
697 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
698 (UseX87 || Is64Bit)) {
699 // Use SSE for f32, x87 for f64.
700 // Set up the FP register classes.
701 addRegisterClass(MVT::f32, &X86::FR32RegClass);
702 if (UseX87)
703 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
704
705 // Use ANDPS to simulate FABS.
706 setOperationAction(ISD::FABS , MVT::f32, Custom);
707
708 // Use XORP to simulate FNEG.
709 setOperationAction(ISD::FNEG , MVT::f32, Custom);
710
711 if (UseX87)
712 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
713
714 // Use ANDPS and ORPS to simulate FCOPYSIGN.
715 if (UseX87)
716 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
717 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
718
719 // We don't support sin/cos/fmod
720 setOperationAction(ISD::FSIN , MVT::f32, Expand);
721 setOperationAction(ISD::FCOS , MVT::f32, Expand);
722 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
723
724 if (UseX87) {
725 // Always expand sin/cos functions even though x87 has an instruction.
726 setOperationAction(ISD::FSIN, MVT::f64, Expand);
727 setOperationAction(ISD::FCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
729 }
730 } else if (UseX87) {
731 // f32 and f64 in x87.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
735
736 for (auto VT : { MVT::f32, MVT::f64 }) {
737 setOperationAction(ISD::UNDEF, VT, Expand);
738 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
739
740 // Always expand sin/cos functions even though x87 has an instruction.
741 setOperationAction(ISD::FSIN , VT, Expand);
742 setOperationAction(ISD::FCOS , VT, Expand);
743 setOperationAction(ISD::FSINCOS, VT, Expand);
744 }
745 }
746
747 // Expand FP32 immediates into loads from the stack, save special cases.
748 if (isTypeLegal(MVT::f32)) {
749 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
750 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
751 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
752 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
753 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
754 } else // SSE immediates.
755 addLegalFPImmediate(APFloat(+0.0f)); // xorps
756 }
757 // Expand FP64 immediates into loads from the stack, save special cases.
758 if (isTypeLegal(MVT::f64)) {
759 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
760 addLegalFPImmediate(APFloat(+0.0)); // FLD0
761 addLegalFPImmediate(APFloat(+1.0)); // FLD1
762 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
763 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
764 } else // SSE immediates.
765 addLegalFPImmediate(APFloat(+0.0)); // xorpd
766 }
767 // Support fp16 0 immediate.
768 if (isTypeLegal(MVT::f16))
769 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
770
771 // Handle constrained floating-point operations of scalar.
772 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
773 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
774 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
775 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
776 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
777 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
778 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
779 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
780 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
781 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
782 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
783 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
784
785 // We don't support FMA.
786 setOperationAction(ISD::FMA, MVT::f64, Expand);
787 setOperationAction(ISD::FMA, MVT::f32, Expand);
788
789 // f80 always uses X87.
790 if (UseX87) {
791 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
792 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
793 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
794 {
795 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
796 addLegalFPImmediate(TmpFlt); // FLD0
797 TmpFlt.changeSign();
798 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
799
800 bool ignored;
801 APFloat TmpFlt2(+1.0);
802 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
803 &ignored);
804 addLegalFPImmediate(TmpFlt2); // FLD1
805 TmpFlt2.changeSign();
806 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
807 }
808
809 // Always expand sin/cos functions even though x87 has an instruction.
810 setOperationAction(ISD::FSIN , MVT::f80, Expand);
811 setOperationAction(ISD::FCOS , MVT::f80, Expand);
812 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
813
814 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
815 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
816 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
817 setOperationAction(ISD::FRINT, MVT::f80, Expand);
818 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
819 setOperationAction(ISD::FMA, MVT::f80, Expand);
820 setOperationAction(ISD::LROUND, MVT::f80, Expand);
821 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
822 setOperationAction(ISD::LRINT, MVT::f80, Custom);
823 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
824
825 // Handle constrained floating-point operations of scalar.
826 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
827 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
828 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
829 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
830 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
831 if (isTypeLegal(MVT::f16)) {
832 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
833 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
834 } else {
835 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
836 }
837 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
838 // as Custom.
839 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
840 }
841
842 // f128 uses xmm registers, but most operations require libcalls.
843 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
844 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
845 : &X86::VR128RegClass);
846
847 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
848
849 setOperationAction(ISD::FADD, MVT::f128, LibCall);
850 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
851 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
852 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
853 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
854 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
855 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
856 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
857 setOperationAction(ISD::FMA, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
859
860 setOperationAction(ISD::FABS, MVT::f128, Custom);
861 setOperationAction(ISD::FNEG, MVT::f128, Custom);
862 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
863
864 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
866 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
868 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
869 // No STRICT_FSINCOS
870 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
871 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
872
873 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
874 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
875 // We need to custom handle any FP_ROUND with an f128 input, but
876 // LegalizeDAG uses the result type to know when to run a custom handler.
877 // So we have to list all legal floating point result types here.
878 if (isTypeLegal(MVT::f32)) {
879 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
880 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
881 }
882 if (isTypeLegal(MVT::f64)) {
883 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
884 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
885 }
886 if (isTypeLegal(MVT::f80)) {
887 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
888 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
889 }
890
891 setOperationAction(ISD::SETCC, MVT::f128, Custom);
892
893 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
894 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
895 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
896 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
897 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
898 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
899 }
900
901 // Always use a library call for pow.
902 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
903 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
904 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
905 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
906
907 setOperationAction(ISD::FLOG, MVT::f80, Expand);
908 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
909 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
910 setOperationAction(ISD::FEXP, MVT::f80, Expand);
911 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
912 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
913 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
914
915 // Some FP actions are always expanded for vector types.
916 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
917 MVT::v4f32, MVT::v8f32, MVT::v16f32,
918 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
919 setOperationAction(ISD::FSIN, VT, Expand);
920 setOperationAction(ISD::FSINCOS, VT, Expand);
921 setOperationAction(ISD::FCOS, VT, Expand);
922 setOperationAction(ISD::FREM, VT, Expand);
923 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
924 setOperationAction(ISD::FPOW, VT, Expand);
925 setOperationAction(ISD::FLOG, VT, Expand);
926 setOperationAction(ISD::FLOG2, VT, Expand);
927 setOperationAction(ISD::FLOG10, VT, Expand);
928 setOperationAction(ISD::FEXP, VT, Expand);
929 setOperationAction(ISD::FEXP2, VT, Expand);
930 }
931
932 // First set operation action for all vector types to either promote
933 // (for widening) or expand (for scalarization). Then we will selectively
934 // turn on ones that can be effectively codegen'd.
935 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
936 setOperationAction(ISD::SDIV, VT, Expand);
937 setOperationAction(ISD::UDIV, VT, Expand);
938 setOperationAction(ISD::SREM, VT, Expand);
939 setOperationAction(ISD::UREM, VT, Expand);
940 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
941 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
942 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
943 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
944 setOperationAction(ISD::FMA, VT, Expand);
945 setOperationAction(ISD::FFLOOR, VT, Expand);
946 setOperationAction(ISD::FCEIL, VT, Expand);
947 setOperationAction(ISD::FTRUNC, VT, Expand);
948 setOperationAction(ISD::FRINT, VT, Expand);
949 setOperationAction(ISD::FNEARBYINT, VT, Expand);
950 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
951 setOperationAction(ISD::MULHS, VT, Expand);
952 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
953 setOperationAction(ISD::MULHU, VT, Expand);
954 setOperationAction(ISD::SDIVREM, VT, Expand);
955 setOperationAction(ISD::UDIVREM, VT, Expand);
956 setOperationAction(ISD::CTPOP, VT, Expand);
957 setOperationAction(ISD::CTTZ, VT, Expand);
958 setOperationAction(ISD::CTLZ, VT, Expand);
959 setOperationAction(ISD::ROTL, VT, Expand);
960 setOperationAction(ISD::ROTR, VT, Expand);
961 setOperationAction(ISD::BSWAP, VT, Expand);
962 setOperationAction(ISD::SETCC, VT, Expand);
963 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
964 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
965 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
966 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
967 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
968 setOperationAction(ISD::TRUNCATE, VT, Expand);
969 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
970 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
971 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
972 setOperationAction(ISD::SELECT_CC, VT, Expand);
973 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
974 setTruncStoreAction(InnerVT, VT, Expand);
975
976 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
977 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
978
979 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
980 // types, we have to deal with them whether we ask for Expansion or not.
981 // Setting Expand causes its own optimisation problems though, so leave
982 // them legal.
983 if (VT.getVectorElementType() == MVT::i1)
984 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
985
986 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
987 // split/scalarized right now.
988 if (VT.getVectorElementType() == MVT::f16 ||
989 VT.getVectorElementType() == MVT::bf16)
990 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
991 }
992 }
993
994 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
995 // with -msoft-float, disable use of MMX as well.
996 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
997 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
998 // No operations on x86mmx supported, everything uses intrinsics.
999 }
1000
1001 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1002 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1003 : &X86::VR128RegClass);
1004
1005 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1006 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1007 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1008 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1009 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1010 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1013
1014 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1015 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1016
1017 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1018 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1019 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1020 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1021 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1022 }
1023
1024 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1025 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027
1028 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1029 // registers cannot be used even for integer operations.
1030 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1031 : &X86::VR128RegClass);
1032 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1033 : &X86::VR128RegClass);
1034 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1035 : &X86::VR128RegClass);
1036 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1037 : &X86::VR128RegClass);
1038 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1039 : &X86::VR128RegClass);
1040
1041 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1042 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1043 setOperationAction(ISD::SDIV, VT, Custom);
1044 setOperationAction(ISD::SREM, VT, Custom);
1045 setOperationAction(ISD::UDIV, VT, Custom);
1046 setOperationAction(ISD::UREM, VT, Custom);
1047 }
1048
1049 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1050 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1052
1053 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1055 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1056 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1057 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1058 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1060 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1061 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1062 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1063 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1064 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1065
1066 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1067 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1068 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1069
1070 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1071 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1072 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1073
1074 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1075 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1076 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1077 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1078 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1079 }
1080
1081 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1082 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1083
1084 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1085 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1086 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1087 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1088 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1089 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1090 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1091 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1092 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1093 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1094
1095 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1096 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1097 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1098 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1099
1100 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1101 setOperationAction(ISD::SETCC, VT, Custom);
1102 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1103 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1104 setOperationAction(ISD::CTPOP, VT, Custom);
1105 setOperationAction(ISD::ABS, VT, Custom);
1106
1107 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1108 // setcc all the way to isel and prefer SETGT in some isel patterns.
1109 setCondCodeAction(ISD::SETLT, VT, Custom);
1110 setCondCodeAction(ISD::SETLE, VT, Custom);
1111 }
1112
1113 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1114 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1115 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1116 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1117 setOperationAction(ISD::VSELECT, VT, Custom);
1118 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1119 }
1120
1121 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1122 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1123 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1124 setOperationAction(ISD::VSELECT, VT, Custom);
1125
1126 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1127 continue;
1128
1129 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1130 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1131 }
1132 setF16Action(MVT::v8f16, Expand);
1133 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1134 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1135 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1136 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1137
1138 // Custom lower v2i64 and v2f64 selects.
1139 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1140 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1141 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1142 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1143 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1144 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1145
1146 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1147 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1148 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1149 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1150 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1151 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1152
1153 // Custom legalize these to avoid over promotion or custom promotion.
1154 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1155 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1156 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1157 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1158 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1159 }
1160
1161 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1162 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1165
1166 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1167 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1168
1169 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1170 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1171
1172 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1173 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1174 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1176 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1177
1178 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1179 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1180 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1181 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1182
1183 // We want to legalize this to an f64 load rather than an i64 load on
1184 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1185 // store.
1186 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1187 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1188 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1189 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1190 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1191 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1192
1193 // Add 32-bit vector stores to help vectorization opportunities.
1194 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1195 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1196
1197 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1199 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1200 if (!Subtarget.hasAVX512())
1201 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1202
1203 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1204 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1205 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1206
1207 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1208
1209 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1210 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1212 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1213 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1214 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1215
1216 // In the customized shift lowering, the legal v4i32/v2i64 cases
1217 // in AVX2 will be recognized.
1218 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SRL, VT, Custom);
1220 setOperationAction(ISD::SHL, VT, Custom);
1221 setOperationAction(ISD::SRA, VT, Custom);
1222 if (VT == MVT::v2i64) continue;
1223 setOperationAction(ISD::ROTL, VT, Custom);
1224 setOperationAction(ISD::ROTR, VT, Custom);
1225 setOperationAction(ISD::FSHL, VT, Custom);
1226 setOperationAction(ISD::FSHR, VT, Custom);
1227 }
1228
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1230 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1231 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1232 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1233 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1234 }
1235
1236 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1237 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1238 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1239 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1240 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1241 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1242 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1243 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1245
1246 // These might be better off as horizontal vector ops.
1247 setOperationAction(ISD::ADD, MVT::i16, Custom);
1248 setOperationAction(ISD::ADD, MVT::i32, Custom);
1249 setOperationAction(ISD::SUB, MVT::i16, Custom);
1250 setOperationAction(ISD::SUB, MVT::i32, Custom);
1251 }
1252
1253 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1254 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1255 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1257 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1258 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1259 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1260 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1261 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1262 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1263 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1264 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1265 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1266 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1267
1268 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1269 }
1270
1271 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1272 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1273 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1274 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1275 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1276 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1277 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1278 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1279
1280 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1281 setOperationAction(ISD::ABDS, VT, Custom);
1282 setOperationAction(ISD::ABDU, VT, Custom);
1283 }
1284
1285 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1286 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1287 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1288
1289 // FIXME: Do we need to handle scalar-to-vector here?
1290 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1292
1293 // We directly match byte blends in the backend as they match the VSELECT
1294 // condition form.
1295 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1296
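As a hedged aside (not code from this file): a v16i8 VSELECT can stay Legal because SSE4.1's PBLENDVB already selects per byte on the sign bit of the mask. The helper name below is invented:

  #include <immintrin.h>

  // Returns a where the mask byte's MSB is set, b otherwise (pblendvb).
  static __m128i select_bytes(__m128i mask, __m128i a, __m128i b) {
    return _mm_blendv_epi8(b, a, mask);
  }
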
1297 // SSE41 brings specific instructions for doing vector sign extend even in
1298 // cases where we don't have SRA.
1299 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1300 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1301 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1302 }
1303
1304 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1305 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1306 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1307 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1308 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1309 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1310 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1311 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1312 }
1313
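For illustration only (helper names invented, SSE4.1 assumed), the PMOV[SZ]X family extends the low elements directly, and fed from memory it becomes the extending-load form marked Legal above:

  #include <immintrin.h>

  // Sign-extend the low eight i8 lanes to i16 (pmovsxbw).
  static __m128i sext_v8i8_to_v8i16(__m128i v) { return _mm_cvtepi8_epi16(v); }

  // Zero-extend the low four i16 lanes to i32 (pmovzxwd).
  static __m128i zext_v4i16_to_v4i32(__m128i v) { return _mm_cvtepu16_epi32(v); }
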
1314 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1315      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1316 // do the pre and post work in the vector domain.
1317 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1318 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1319 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1320 // so that DAG combine doesn't try to turn it into uint_to_fp.
1321 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1322 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1323 }
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1327 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1328 }
1329
1330 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1331 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1332 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1333 setOperationAction(ISD::ROTL, VT, Custom);
1334 setOperationAction(ISD::ROTR, VT, Custom);
1335 }
1336
1337 // XOP can efficiently perform BITREVERSE with VPPERM.
1338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1339 setOperationAction(ISD::BITREVERSE, VT, Custom);
1340
1341 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1342 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1343 setOperationAction(ISD::BITREVERSE, VT, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1347 bool HasInt256 = Subtarget.hasInt256();
1348
1349 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1350 : &X86::VR256RegClass);
1351 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1352 : &X86::VR256RegClass);
1353 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1354 : &X86::VR256RegClass);
1355 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1356 : &X86::VR256RegClass);
1357 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1358 : &X86::VR256RegClass);
1359 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1360 : &X86::VR256RegClass);
1361 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1362 : &X86::VR256RegClass);
1363
1364 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1365 setOperationAction(ISD::FFLOOR, VT, Legal);
1366 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1367 setOperationAction(ISD::FCEIL, VT, Legal);
1368 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1369 setOperationAction(ISD::FTRUNC, VT, Legal);
1370 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1371 setOperationAction(ISD::FRINT, VT, Legal);
1372 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1373 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1374 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1375 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1376 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1377
1378 setOperationAction(ISD::FROUND, VT, Custom);
1379
1380 setOperationAction(ISD::FNEG, VT, Custom);
1381 setOperationAction(ISD::FABS, VT, Custom);
1382 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1383 }
1384
1385 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1386 // even though v8i16 is a legal type.
1387 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1388 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1389 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1390 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1391 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1392 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1394
1395 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1396 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1397 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1398 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1399 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1400 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1401
1402 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1403 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1404 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1405 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1406 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1407 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1408 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1409 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1410 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1411 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1412 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1413
1414 if (!Subtarget.hasAVX512())
1415 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1416
1417 // In the customized shift lowering, the legal v8i32/v4i64 cases
1418 // in AVX2 will be recognized.
1419 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1420 setOperationAction(ISD::SRL, VT, Custom);
1421 setOperationAction(ISD::SHL, VT, Custom);
1422 setOperationAction(ISD::SRA, VT, Custom);
1423 setOperationAction(ISD::ABDS, VT, Custom);
1424 setOperationAction(ISD::ABDU, VT, Custom);
1425 if (VT == MVT::v4i64) continue;
1426 setOperationAction(ISD::ROTL, VT, Custom);
1427 setOperationAction(ISD::ROTR, VT, Custom);
1428 setOperationAction(ISD::FSHL, VT, Custom);
1429 setOperationAction(ISD::FSHR, VT, Custom);
1430 }
1431
1432 // These types need custom splitting if their input is a 128-bit vector.
1433 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1434 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1435 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1437
1438 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1439 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1440 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1441 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1442 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1443 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1445
1446 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1447 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1449 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1450 }
1451
1452 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1453 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1454 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1455 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1456
1457 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1458 setOperationAction(ISD::SETCC, VT, Custom);
1459 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1460 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1461 setOperationAction(ISD::CTPOP, VT, Custom);
1462 setOperationAction(ISD::CTLZ, VT, Custom);
1463
1464 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1465 // setcc all the way to isel and prefer SETGT in some isel patterns.
1466 setCondCodeAction(ISD::SETLT, VT, Custom);
1467 setCondCodeAction(ISD::SETLE, VT, Custom);
1468 }
1469
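A minimal sketch (not code from this file; AVX2 assumed for the 256-bit compare, helper name invented) of why SETLT/SETLE are Custom and SETGT is preferred: the integer compare instructions only provide "greater than", so a less-than is formed by swapping the operands:

  #include <immintrin.h>

  // a < b (signed, per i32 lane) expressed as b > a with vpcmpgtd.
  static __m256i less_than_v8i32(__m256i a, __m256i b) {
    return _mm256_cmpgt_epi32(b, a);
  }
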
1470 if (Subtarget.hasAnyFMA()) {
1471 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1472 MVT::v2f64, MVT::v4f64 }) {
1473 setOperationAction(ISD::FMA, VT, Legal);
1474 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1475 }
1476 }
1477
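For context, a hedged sketch (not code from this file; the FMA feature is assumed and the helper name is invented) of what marking FMA Legal buys: one fused instruction instead of a separate multiply and add:

  #include <immintrin.h>

  static __m256 fused_madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c); // a vfmadd*ps form: a * b + c, single rounding
  }
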
1478 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1480 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1481 }
1482
1483 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1484 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1485 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1486 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1487
1488 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1489 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1490 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1493 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1494 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1496
1497 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1498 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1499
1500 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1501 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1502 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1503 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1504 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1505
1506 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1509 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1512 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1513 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1515 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1516 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1517 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1518
1519 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1520 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1521 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1522 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1525 }
1526
1527 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1529 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1530 }
1531
1532 if (HasInt256) {
1533 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1534      // when we have a 256-bit-wide blend with immediate.
1535 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1536 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1537
1538 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1539 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1540 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1541 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1542 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1543 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1544 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1545 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1546 }
1547 }
1548
1549 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1550 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1551 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1552 setOperationAction(ISD::MSTORE, VT, Legal);
1553 }
1554
1555 // Extract subvector is special because the value type
1556 // (result) is 128-bit but the source is 256-bit wide.
1557 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1558 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1559 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1560 }
1561
1562 // Custom lower several nodes for 256-bit types.
1563 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1564 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1565 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1566 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1567 setOperationAction(ISD::VSELECT, VT, Custom);
1568 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1569 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1570 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1571 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1572 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1573 setOperationAction(ISD::STORE, VT, Custom);
1574 }
1575 setF16Action(MVT::v16f16, Expand);
1576 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1577 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1578 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1579 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1580
1581 if (HasInt256) {
1582 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1583
1584 // Custom legalize 2x32 to get a little better code.
1585 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1586 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1587
1588 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1589 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1590 setOperationAction(ISD::MGATHER, VT, Custom);
1591 }
1592 }
1593
1594 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1595 Subtarget.hasF16C()) {
1596 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1597 setOperationAction(ISD::FP_ROUND, VT, Custom);
1598 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1599 }
1600 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1601 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1602 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1603 }
1604 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1605 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1606 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1607 }
1608
1609 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1610 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1611 }
1612
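For illustration (not code from this file; F16C assumed, helper names invented), the conversions the block above legalizes map onto VCVTPH2PS/VCVTPS2PH:

  #include <immintrin.h>

  // Widen four f16 values (in the low 64 bits) to four f32 (vcvtph2ps).
  static __m128 f16x4_to_f32x4(__m128i h) { return _mm_cvtph_ps(h); }

  // Narrow four f32 to f16 with round-to-nearest-even (vcvtps2ph).
  static __m128i f32x4_to_f16x4(__m128 f) {
    return _mm_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  }
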
1613 // This block controls legalization of the mask vector sizes that are
1614 // available with AVX512. 512-bit vectors are in a separate block controlled
1615 // by useAVX512Regs.
1616 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1617 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1618 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1619 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1620 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1621 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1622
1623 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1624 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1625 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1626
1627 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1628 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1629 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1630 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1631 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1632 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1633 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1634 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1635 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1636 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1637 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1638 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1639
1640 // There is no byte sized k-register load or store without AVX512DQ.
1641 if (!Subtarget.hasDQI()) {
1642 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1643 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1644 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1645 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1646
1647 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1648 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1649 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1650 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1651 }
1652
1653 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1654 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1655 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1656 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1657 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1658 }
1659
1660 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1661 setOperationAction(ISD::VSELECT, VT, Expand);
1662
1663 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1664 setOperationAction(ISD::SETCC, VT, Custom);
1665 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1666 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1667 setOperationAction(ISD::SELECT, VT, Custom);
1668 setOperationAction(ISD::TRUNCATE, VT, Custom);
1669
1670 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1671 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1672 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1673 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1674 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1675 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1676 }
1677
1678 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1679 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1680 }
1681
1682  // This block controls legalization for 512-bit operations with 32/64-bit
1683  // elements. 512-bit operations can be disabled based on the
1684  // prefer-vector-width and required-vector-width function attributes.
1685 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1686 bool HasBWI = Subtarget.hasBWI();
1687
1688 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1689 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1690 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1691 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1692 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1693 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1694 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1695
1696 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1697 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1698 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1699 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1700 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1701 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1702 if (HasBWI)
1703 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1704 }
1705
1706 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1707 setOperationAction(ISD::FNEG, VT, Custom);
1708 setOperationAction(ISD::FABS, VT, Custom);
1709 setOperationAction(ISD::FMA, VT, Legal);
1710 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1711 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1712 }
1713
1714 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1715 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1716 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1717 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1718 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1719 }
1720
1721 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1722 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1723 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1724 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1725 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1726 }
1727
1728 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1729 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1730 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1731 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1732 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1733 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1734
1735 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1736 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1737 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1738 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1739 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1740 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1741 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1742 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1743 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1744 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1745 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1746
1747 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1748 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1749 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1750 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1751 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1752 if (HasBWI)
1753 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1754
1755 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1756 // to 512-bit rather than use the AVX2 instructions so that we can use
1757 // k-masks.
1758 if (!Subtarget.hasVLX()) {
1759 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1760 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1761 setOperationAction(ISD::MLOAD, VT, Custom);
1762 setOperationAction(ISD::MSTORE, VT, Custom);
1763 }
1764 }
1765
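A hedged illustration of the trade-off described above (not code from this file; helper names invented, AVX and AVX-512F assumed): the AVX form keeps the mask in a vector register, while the 512-bit form takes a k-register mask:

  #include <immintrin.h>

  // AVX form: per-lane mask held in a ymm register (vmaskmovps).
  static __m256 masked_load_avx(const float *p, __m256i mask) {
    return _mm256_maskload_ps(p, mask);
  }

  // AVX-512 form: mask held in a k-register, masked-off lanes zeroed.
  static __m512 masked_load_kmask(const float *p, __mmask16 k) {
    return _mm512_maskz_loadu_ps(k, p);
  }
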
1766 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1767 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1768 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1769 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1770 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1771 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1772 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1773 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1774 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1775 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1776 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1777 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1778 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1779
1780 if (HasBWI) {
1781 // Extends from v64i1 masks to 512-bit vectors.
1782 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1783 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1784 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1785 }
1786
1787 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1788 setOperationAction(ISD::FFLOOR, VT, Legal);
1789 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1790 setOperationAction(ISD::FCEIL, VT, Legal);
1791 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1792 setOperationAction(ISD::FTRUNC, VT, Legal);
1793 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1794 setOperationAction(ISD::FRINT, VT, Legal);
1795 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1796 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1797 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1798 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1799 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1800
1801 setOperationAction(ISD::FROUND, VT, Custom);
1802 }
1803
1804 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1805 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1806 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1807 }
1808
1809 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1810 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1811 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1812 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1813
1814 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1815 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1816 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1817 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1818
1819 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1820 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1821 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1822 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1823 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1824 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1825 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1827
1828 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1829 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1830
1831 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1832
1833 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1834 setOperationAction(ISD::SRL, VT, Custom);
1835 setOperationAction(ISD::SHL, VT, Custom);
1836 setOperationAction(ISD::SRA, VT, Custom);
1837 setOperationAction(ISD::ROTL, VT, Custom);
1838 setOperationAction(ISD::ROTR, VT, Custom);
1839 setOperationAction(ISD::SETCC, VT, Custom);
1840 setOperationAction(ISD::ABDS, VT, Custom);
1841 setOperationAction(ISD::ABDU, VT, Custom);
1842
1843 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1844 // setcc all the way to isel and prefer SETGT in some isel patterns.
1845 setCondCodeAction(ISD::SETLT, VT, Custom);
1846 setCondCodeAction(ISD::SETLE, VT, Custom);
1847 }
1848 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SMAX, VT, Legal);
1850 setOperationAction(ISD::UMAX, VT, Legal);
1851 setOperationAction(ISD::SMIN, VT, Legal);
1852 setOperationAction(ISD::UMIN, VT, Legal);
1853 setOperationAction(ISD::ABS, VT, Legal);
1854 setOperationAction(ISD::CTPOP, VT, Custom);
1855 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1856 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1857 }
1858
1859 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1860 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1861 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1862 setOperationAction(ISD::CTLZ, VT, Custom);
1863 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1864 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1865 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1866 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1867 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1868 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1869 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1870 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1871 }
1872
1873 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1874 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1875 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1876 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1877 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1878 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1879
1880 if (Subtarget.hasDQI()) {
1881 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1882 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1883 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1884 setOperationAction(Opc, MVT::v8i64, Custom);
1885 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1886 }
1887
1888 if (Subtarget.hasCDI()) {
1889      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1890 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1891 setOperationAction(ISD::CTLZ, VT, Legal);
1892 }
1893 } // Subtarget.hasCDI()
1894
1895 if (Subtarget.hasVPOPCNTDQ()) {
1896 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1897 setOperationAction(ISD::CTPOP, VT, Legal);
1898 }
1899
1900 // Extract subvector is special because the value type
1901 // (result) is 256-bit but the source is 512-bit wide.
1902 // 128-bit was made Legal under AVX1.
1903 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1904 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1905 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1906
1907 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1908 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1909 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1910 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1911 setOperationAction(ISD::SELECT, VT, Custom);
1912 setOperationAction(ISD::VSELECT, VT, Custom);
1913 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1915 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1916 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1917 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1918 }
1919 setF16Action(MVT::v32f16, Expand);
1920 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1921 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1922 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1923 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1924 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1925 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1926 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1927 }
1928
1929 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::MLOAD, VT, Legal);
1931 setOperationAction(ISD::MSTORE, VT, Legal);
1932 setOperationAction(ISD::MGATHER, VT, Custom);
1933 setOperationAction(ISD::MSCATTER, VT, Custom);
1934 }
1935 if (HasBWI) {
1936 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1937 setOperationAction(ISD::MLOAD, VT, Legal);
1938 setOperationAction(ISD::MSTORE, VT, Legal);
1939 }
1940 } else {
1941 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1942 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1943 }
1944
1945 if (Subtarget.hasVBMI2()) {
1946 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1947 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1948 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1949 setOperationAction(ISD::FSHL, VT, Custom);
1950 setOperationAction(ISD::FSHR, VT, Custom);
1951 }
1952
1953 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1954 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1955 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1956 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1957 }
1958 }// useAVX512Regs
1959
1960 // This block controls legalization for operations that don't have
1961 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1962 // narrower widths.
1963 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1964 // These operations are handled on non-VLX by artificially widening in
1965 // isel patterns.
1966
1967 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1968 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1969 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1970
1971 if (Subtarget.hasDQI()) {
1972 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1973 // v2f32 UINT_TO_FP is already custom under SSE2.
1974      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1975             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1976             "Unexpected operation action!");
1977 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1978 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1979 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1980 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1981 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1982 }
1983
1984 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1985 setOperationAction(ISD::SMAX, VT, Legal);
1986 setOperationAction(ISD::UMAX, VT, Legal);
1987 setOperationAction(ISD::SMIN, VT, Legal);
1988 setOperationAction(ISD::UMIN, VT, Legal);
1989 setOperationAction(ISD::ABS, VT, Legal);
1990 }
1991
1992 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1993 setOperationAction(ISD::ROTL, VT, Custom);
1994 setOperationAction(ISD::ROTR, VT, Custom);
1995 }
1996
1997 // Custom legalize 2x32 to get a little better code.
1998 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1999 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2000
2001 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2002 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2003 setOperationAction(ISD::MSCATTER, VT, Custom);
2004
2005 if (Subtarget.hasDQI()) {
2006 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2007 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2008 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2009 setOperationAction(Opc, MVT::v2i64, Custom);
2010 setOperationAction(Opc, MVT::v4i64, Custom);
2011 }
2012 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2013 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2014 }
2015
2016 if (Subtarget.hasCDI()) {
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2018 setOperationAction(ISD::CTLZ, VT, Legal);
2019 }
2020 } // Subtarget.hasCDI()
2021
2022 if (Subtarget.hasVPOPCNTDQ()) {
2023 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2024 setOperationAction(ISD::CTPOP, VT, Legal);
2025 }
2026 }
2027
2028  // This block controls legalization of v32i1/v64i1, which are available with
2029 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2030 // useBWIRegs.
2031 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2032 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2033 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2034
2035 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2036 setOperationAction(ISD::VSELECT, VT, Expand);
2037 setOperationAction(ISD::TRUNCATE, VT, Custom);
2038 setOperationAction(ISD::SETCC, VT, Custom);
2039 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2040 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2041 setOperationAction(ISD::SELECT, VT, Custom);
2042 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2043 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2044 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2045 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2046 }
2047
2048 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2049 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2050
2051 // Extends from v32i1 masks to 256-bit vectors.
2052 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2053 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2054 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2055
2056 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2057 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2058 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2059 }
2060
2061 // These operations are handled on non-VLX by artificially widening in
2062 // isel patterns.
2063 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2064
2065 if (Subtarget.hasBITALG()) {
2066 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2067 setOperationAction(ISD::CTPOP, VT, Legal);
2068 }
2069 }
2070
2071 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2072 auto setGroup = [&] (MVT VT) {
2073 setOperationAction(ISD::FADD, VT, Legal);
2074 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2075 setOperationAction(ISD::FSUB, VT, Legal);
2076 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2077 setOperationAction(ISD::FMUL, VT, Legal);
2078 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2079 setOperationAction(ISD::FDIV, VT, Legal);
2080 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2081 setOperationAction(ISD::FSQRT, VT, Legal);
2082 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2083
2084 setOperationAction(ISD::FFLOOR, VT, Legal);
2085 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2086 setOperationAction(ISD::FCEIL, VT, Legal);
2087 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2088 setOperationAction(ISD::FTRUNC, VT, Legal);
2089 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2090 setOperationAction(ISD::FRINT, VT, Legal);
2091 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2092 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2093 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2094
2095 setOperationAction(ISD::FROUND, VT, Custom);
2096
2097 setOperationAction(ISD::LOAD, VT, Legal);
2098 setOperationAction(ISD::STORE, VT, Legal);
2099
2100 setOperationAction(ISD::FMA, VT, Legal);
2101 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2102 setOperationAction(ISD::VSELECT, VT, Legal);
2103 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2104 setOperationAction(ISD::SELECT, VT, Custom);
2105
2106 setOperationAction(ISD::FNEG, VT, Custom);
2107 setOperationAction(ISD::FABS, VT, Custom);
2108 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2110 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2111 };
2112
2113 // AVX512_FP16 scalar operations
2114 setGroup(MVT::f16);
2115 setOperationAction(ISD::FREM, MVT::f16, Promote);
2116 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2117 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2118 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2119 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2120 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2121 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2122 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2123 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2124 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2125 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2126 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2127 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2128 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2129
2130 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2131 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2132
2133 if (Subtarget.useAVX512Regs()) {
2134 setGroup(MVT::v32f16);
2135 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2136 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2137 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2138 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2139 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2142 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2143 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2144 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2146 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2147
2148 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2149 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2150 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2151 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2152 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2153 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2154 MVT::v32i16);
2155 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2156 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2157 MVT::v32i16);
2158 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2159 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2160 MVT::v32i16);
2161 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2162 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2163 MVT::v32i16);
2164
2165 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2166 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2167 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2168
2169 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2170 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2171
2172 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2173 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2174 }
2175
2176 if (Subtarget.hasVLX()) {
2177 setGroup(MVT::v8f16);
2178 setGroup(MVT::v16f16);
2179
2180 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2181 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2183 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2184 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2185 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2187 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2188 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2189 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2190
2191 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2192 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2193 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2194 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2195 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2196 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2197 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2198 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2199 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2200 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2201
2202 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2203 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2204 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2205
2206 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2207 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2209
2210 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2211 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2212 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2214
2215 // Need to custom widen these to prevent scalarization.
2216 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2217 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2218 }
2219 }
2220
2221 if (!Subtarget.useSoftFloat() &&
2222 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2223 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2224 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2225 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2226    // provide the method to promote BUILD_VECTOR, so set the operation action
2227    // to Custom and do the customization later.
2228 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2229 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2230 setF16Action(VT, Expand);
2231 setOperationAction(ISD::FADD, VT, Expand);
2232 setOperationAction(ISD::FSUB, VT, Expand);
2233 setOperationAction(ISD::FMUL, VT, Expand);
2234 setOperationAction(ISD::FDIV, VT, Expand);
2235 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2236 }
2237 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2238 }
2239
2240 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2241 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2242 setF16Action(MVT::v32bf16, Expand);
2243 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2244 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2245 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2246 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2247 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2251 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2252 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2253 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2254 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2255 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2256
2257 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2258 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2259 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2261 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2262
2263 if (Subtarget.hasBWI()) {
2264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2265 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2266 }
2267
2268 if (Subtarget.hasFP16()) {
2269 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2270 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2271 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2272 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2274 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2275 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2276 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2277 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2278 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2279 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2280 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2281 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2282 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2283 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2284 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2285 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2286 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2287 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2288 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2289 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2290 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2291 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2292 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2293 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2295 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2296 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2297 }
2298
2299 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2300 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2301 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2302 }
2303
2304 if (Subtarget.hasAMXTILE()) {
2305 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2306 }
2307
2308 // We want to custom lower some of our intrinsics.
2309 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2310 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2311 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2312 if (!Subtarget.is64Bit()) {
2313 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2314 }
2315
2316 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2317 // handle type legalization for these operations here.
2318 //
2319 // FIXME: We really should do custom legalization for addition and
2320 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2321 // than generic legalization for 64-bit multiplication-with-overflow, though.
2322 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2323 if (VT == MVT::i64 && !Subtarget.is64Bit())
2324 continue;
2325 // Add/Sub/Mul with overflow operations are custom lowered.
2326 setOperationAction(ISD::SADDO, VT, Custom);
2327 setOperationAction(ISD::UADDO, VT, Custom);
2328 setOperationAction(ISD::SSUBO, VT, Custom);
2329 setOperationAction(ISD::USUBO, VT, Custom);
2330 setOperationAction(ISD::SMULO, VT, Custom);
2331 setOperationAction(ISD::UMULO, VT, Custom);
2332
2333 // Support carry in as value rather than glue.
2334 setOperationAction(ISD::ADDCARRY, VT, Custom);
2335 setOperationAction(ISD::SUBCARRY, VT, Custom);
2336 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2337 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2338 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2339 }
2340
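For reference, a small standalone sketch (not code from this file; helper name invented, GCC/Clang builtins assumed) of the value-plus-overflow-flag pair that the SADDO/UADDO family describes:

  #include <cstdint>

  // True when the signed 64-bit addition overflows; *sum holds the wrapped
  // result either way.
  static bool sadd_overflows(int64_t a, int64_t b, int64_t *sum) {
    return __builtin_add_overflow(a, b, sum);
  }
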
2341 if (!Subtarget.is64Bit()) {
2342    // These libcalls are not available on 32-bit targets.
2343 setLibcallName(RTLIB::SHL_I128, nullptr);
2344 setLibcallName(RTLIB::SRL_I128, nullptr);
2345 setLibcallName(RTLIB::SRA_I128, nullptr);
2346 setLibcallName(RTLIB::MUL_I128, nullptr);
2347 // The MULO libcall is not part of libgcc, only compiler-rt.
2348 setLibcallName(RTLIB::MULO_I64, nullptr);
2349 }
2350 // The MULO libcall is not part of libgcc, only compiler-rt.
2351 setLibcallName(RTLIB::MULO_I128, nullptr);
2352
2353 // Combine sin / cos into _sincos_stret if it is available.
2354 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2355 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2356 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2357 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2358 }
2359
2360 if (Subtarget.isTargetWin64()) {
2361 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2362 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2363 setOperationAction(ISD::SREM, MVT::i128, Custom);
2364 setOperationAction(ISD::UREM, MVT::i128, Custom);
2365 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2366 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2367 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2368 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2369 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2370 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2371 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2372 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2373 }
2374
2375 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2376 // is. We should promote the value to 64-bits to solve this.
2377 // This is what the CRT headers do - `fmodf` is an inline header
2378 // function casting to f64 and calling `fmod`.
2379 if (Subtarget.is32Bit() &&
2380 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2381 for (ISD::NodeType Op :
2382 {ISD::FCEIL, ISD::STRICT_FCEIL,
2383 ISD::FCOS, ISD::STRICT_FCOS,
2384 ISD::FEXP, ISD::STRICT_FEXP,
2385 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2386 ISD::FREM, ISD::STRICT_FREM,
2387 ISD::FLOG, ISD::STRICT_FLOG,
2388 ISD::FLOG10, ISD::STRICT_FLOG10,
2389 ISD::FPOW, ISD::STRICT_FPOW,
2390 ISD::FSIN, ISD::STRICT_FSIN})
2391 if (isOperationExpand(Op, MVT::f32))
2392 setOperationAction(Op, MVT::f32, Promote);
2393
2394 // We have target-specific dag combine patterns for the following nodes:
2395 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2396 ISD::SCALAR_TO_VECTOR,
2397 ISD::INSERT_VECTOR_ELT,
2398 ISD::EXTRACT_VECTOR_ELT,
2399 ISD::CONCAT_VECTORS,
2400 ISD::INSERT_SUBVECTOR,
2401 ISD::EXTRACT_SUBVECTOR,
2402 ISD::BITCAST,
2403 ISD::VSELECT,
2404 ISD::SELECT,
2405 ISD::SHL,
2406 ISD::SRA,
2407 ISD::SRL,
2408 ISD::OR,
2409 ISD::AND,
2410 ISD::ADD,
2411 ISD::FADD,
2412 ISD::FSUB,
2413 ISD::FNEG,
2414 ISD::FMA,
2415 ISD::STRICT_FMA,
2416 ISD::FMINNUM,
2417 ISD::FMAXNUM,
2418 ISD::SUB,
2419 ISD::LOAD,
2420 ISD::MLOAD,
2421 ISD::STORE,
2422 ISD::MSTORE,
2423 ISD::TRUNCATE,
2424 ISD::ZERO_EXTEND,
2425 ISD::ANY_EXTEND,
2426 ISD::SIGN_EXTEND,
2427 ISD::SIGN_EXTEND_INREG,
2428 ISD::ANY_EXTEND_VECTOR_INREG,
2429 ISD::SIGN_EXTEND_VECTOR_INREG,
2430 ISD::ZERO_EXTEND_VECTOR_INREG,
2431 ISD::SINT_TO_FP,
2432 ISD::UINT_TO_FP,
2433 ISD::STRICT_SINT_TO_FP,
2434 ISD::STRICT_UINT_TO_FP,
2435 ISD::SETCC,
2436 ISD::MUL,
2437 ISD::XOR,
2438 ISD::MSCATTER,
2439 ISD::MGATHER,
2440 ISD::FP16_TO_FP,
2441 ISD::FP_EXTEND,
2442 ISD::STRICT_FP_EXTEND,
2443 ISD::FP_ROUND,
2444 ISD::STRICT_FP_ROUND});
2445
2446 computeRegisterProperties(Subtarget.getRegisterInfo());
2447
2448 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2449 MaxStoresPerMemsetOptSize = 8;
2450 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2451 MaxStoresPerMemcpyOptSize = 4;
2452 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2453 MaxStoresPerMemmoveOptSize = 4;
2454
2455 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2456  // that needs to be benchmarked and balanced with the potential use of vector
2457 // load/store types (PR33329, PR33914).
2458 MaxLoadsPerMemcmp = 2;
2459 MaxLoadsPerMemcmpOptSize = 2;
2460
2461 // Default loop alignment, which can be overridden by -align-loops.
2462 setPrefLoopAlignment(Align(16));
2463
2464 // An out-of-order CPU can speculatively execute past a predictable branch,
2465 // but a conditional move could be stalled by an expensive earlier operation.
2466 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2467 EnableExtLdPromotion = true;
2468 setPrefFunctionAlignment(Align(16));
2469
2470 verifyIntrinsicTables();
2471
2472 // Default to having -disable-strictnode-mutation on
2473 IsStrictFPEnabled = true;
2474}
2475
2476// This has so far only been implemented for 64-bit MachO.
2477bool X86TargetLowering::useLoadStackGuardNode() const {
2478 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2479}
2480
2481bool X86TargetLowering::useStackGuardXorFP() const {
2482 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2483 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2484}
2485
2486SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2487 const SDLoc &DL) const {
2488 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2489 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2490 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2491 return SDValue(Node, 0);
2492}
2493
2494TargetLoweringBase::LegalizeTypeAction
2495X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2496 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2497 !Subtarget.hasBWI())
2498 return TypeSplitVector;
2499
2500 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2501 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2502 return TypeSplitVector;
2503
2504 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2505 VT.getVectorElementType() != MVT::i1)
2506 return TypeWidenVector;
2507
2508 return TargetLoweringBase::getPreferredVectorAction(VT);
2509}
2510
2511static std::pair<MVT, unsigned>
2512handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2513 const X86Subtarget &Subtarget) {
2514 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2515 // convention is one that uses k registers.
2516 if (NumElts == 2)
2517 return {MVT::v2i64, 1};
2518 if (NumElts == 4)
2519 return {MVT::v4i32, 1};
2520 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2521 CC != CallingConv::Intel_OCL_BI)
2522 return {MVT::v8i16, 1};
2523 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2524 CC != CallingConv::Intel_OCL_BI)
2525 return {MVT::v16i8, 1};
2526 // v32i1 passes in ymm unless we have BWI and the calling convention is
2527 // regcall.
2528 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2529 return {MVT::v32i8, 1};
2530 // Split v64i1 vectors if we don't have v64i8 available.
2531 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2532 if (Subtarget.useAVX512Regs())
2533 return {MVT::v64i8, 1};
2534 return {MVT::v32i8, 2};
2535 }
2536
2537 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2538 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2539 NumElts > 64)
2540 return {MVT::i8, NumElts};
2541
2542 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2543}
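
// A minimal standalone sketch (hypothetical helpers, not part of the lowering
// itself) of the "wide or odd" fallback above. isPowerOf2_32(N) is the usual
// N != 0 && (N & (N - 1)) == 0 test, so e.g. v3i1 (odd lane count) and v128i1
// (more than 64 lanes) are broken into NumElts scalar i8 pieces, while v16i1
// stays a single v16i8-sized register for non-RegCall conventions.
static bool sketchIsPowerOf2_32(unsigned N) {
  return N != 0 && (N & (N - 1)) == 0;
}
static bool sketchBreaksMaskToScalars(unsigned NumElts, bool HasBWI) {
  return !sketchIsPowerOf2_32(NumElts) || (NumElts == 64 && !HasBWI) ||
         NumElts > 64;
}
// sketchBreaksMaskToScalars(3, true)    -> true  (odd element count)
// sketchBreaksMaskToScalars(64, false)  -> true  (no v64i8 without BWI)
// sketchBreaksMaskToScalars(16, false)  -> false (handled as one v16i8)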
2544
2545MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2546 CallingConv::ID CC,
2547 EVT VT) const {
2548 if (VT.isVector()) {
2549 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2550 unsigned NumElts = VT.getVectorNumElements();
2551
2552 MVT RegisterVT;
2553 unsigned NumRegisters;
2554 std::tie(RegisterVT, NumRegisters) =
2555 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2556 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2557 return RegisterVT;
2558 }
2559
2560 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2561 return MVT::v8f16;
2562 }
2563
2564 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2565 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2566 !Subtarget.hasX87())
2567 return MVT::i32;
2568
2569 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2570 return getRegisterTypeForCallingConv(Context, CC,
2571 VT.changeVectorElementTypeToInteger());
2572
2573 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2574}
2575
2576unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2577 CallingConv::ID CC,
2578 EVT VT) const {
2579 if (VT.isVector()) {
2580 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2581 unsigned NumElts = VT.getVectorNumElements();
2582
2583 MVT RegisterVT;
2584 unsigned NumRegisters;
2585 std::tie(RegisterVT, NumRegisters) =
2586 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2587 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2588 return NumRegisters;
2589 }
2590
2591 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2592 return 1;
2593 }
2594
2595 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2596 // x87 is disabled.
2597 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2598 if (VT == MVT::f64)
2599 return 2;
2600 if (VT == MVT::f80)
2601 return 3;
2602 }
2603
2604 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2605 return getNumRegistersForCallingConv(Context, CC,
2606 VT.changeVectorElementTypeToInteger());
2607
2608 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2609}
2610
2611unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2612 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2613 unsigned &NumIntermediates, MVT &RegisterVT) const {
2614 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2615 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2616 Subtarget.hasAVX512() &&
2617 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2618 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2619 VT.getVectorNumElements() > 64)) {
2620 RegisterVT = MVT::i8;
2621 IntermediateVT = MVT::i1;
2622 NumIntermediates = VT.getVectorNumElements();
2623 return NumIntermediates;
2624 }
2625
2626 // Split v64i1 vectors if we don't have v64i8 available.
2627 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2628 CC != CallingConv::X86_RegCall) {
2629 RegisterVT = MVT::v32i8;
2630 IntermediateVT = MVT::v32i1;
2631 NumIntermediates = 2;
2632 return 2;
2633 }
2634
2635 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2636 NumIntermediates, RegisterVT);
2637}
2638
2639EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2640 LLVMContext& Context,
2641 EVT VT) const {
2642 if (!VT.isVector())
2643 return MVT::i8;
2644
2645 if (Subtarget.hasAVX512()) {
2646 // Figure out what this type will be legalized to.
2647 EVT LegalVT = VT;
2648 while (getTypeAction(Context, LegalVT) != TypeLegal)
2649 LegalVT = getTypeToTransformTo(Context, LegalVT);
2650
2651 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2652 if (LegalVT.getSimpleVT().is512BitVector())
2653 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2654
2655 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2656 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2657 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2658 // vXi16/vXi8.
2659 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2660 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2661 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2662 }
2663 }
2664
2665 return VT.changeVectorElementTypeToInteger();
2666}
2667
2668/// Helper for getByValTypeAlignment to determine
2669/// the desired ByVal argument alignment.
2670static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2671 if (MaxAlign == 16)
2672 return;
2673 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2674 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2675 MaxAlign = Align(16);
2676 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2677 Align EltAlign;
2678 getMaxByValAlign(ATy->getElementType(), EltAlign);
2679 if (EltAlign > MaxAlign)
2680 MaxAlign = EltAlign;
2681 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2682 for (auto *EltTy : STy->elements()) {
2683 Align EltAlign;
2684 getMaxByValAlign(EltTy, EltAlign);
2685 if (EltAlign > MaxAlign)
2686 MaxAlign = EltAlign;
2687 if (MaxAlign == 16)
2688 break;
2689 }
2690 }
2691}
2692
2693/// Return the desired alignment for ByVal aggregate
2694/// function arguments in the caller parameter area. For X86, aggregates
2695/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2696/// are at 4-byte boundaries.
2697uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2698 const DataLayout &DL) const {
2699 if (Subtarget.is64Bit()) {
2700 // Max of 8 and alignment of type.
2701 Align TyAlign = DL.getABITypeAlign(Ty);
2702 if (TyAlign > 8)
2703 return TyAlign.value();
2704 return 8;
2705 }
2706
2707 Align Alignment(4);
2708 if (Subtarget.hasSSE1())
2709 getMaxByValAlign(Ty, Alignment);
2710 return Alignment.value();
2711}
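
// A standalone sketch (hypothetical helper, not part of the lowering) of the
// byval alignment rule implemented above, with plain integers standing in for
// Type/DataLayout: on x86-64 the result is max(8, ABI alignment); on 32-bit
// targets it is 16 when SSE1 is available and the aggregate (recursively)
// contains a 128-bit vector, and 4 otherwise.
static unsigned sketchByValAlign(bool Is64Bit, unsigned ABIAlign, bool HasSSE1,
                                 bool Contains128BitVector) {
  if (Is64Bit)
    return ABIAlign > 8 ? ABIAlign : 8;
  return (HasSSE1 && Contains128BitVector) ? 16 : 4;
}
// e.g. sketchByValAlign(/*Is64Bit=*/false, 4, true, true) == 16, while
//      sketchByValAlign(/*Is64Bit=*/true, 4, false, false) == 8.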
2712
2713/// It returns EVT::Other if the type should be determined using generic
2714/// target-independent logic.
2715/// For vector ops we check that the overall size isn't larger than our
2716/// preferred vector width.
2717EVT X86TargetLowering::getOptimalMemOpType(
2718 const MemOp &Op, const AttributeList &FuncAttributes) const {
2719 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2720 if (Op.size() >= 16 &&
2721 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2722 // FIXME: Check if unaligned 64-byte accesses are slow.
2723 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2724 (Subtarget.getPreferVectorWidth() >= 512)) {
2725 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2726 }
2727 // FIXME: Check if unaligned 32-byte accesses are slow.
2728 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2729 Subtarget.useLight256BitInstructions()) {
2730 // Although this isn't a well-supported type for AVX1, we'll let
2731 // legalization and shuffle lowering produce the optimal codegen. If we
2732 // choose an optimal type with a vector element larger than a byte,
2733 // getMemsetStores() may create an intermediate splat (using an integer
2734 // multiply) before we splat as a vector.
2735 return MVT::v32i8;
2736 }
2737 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2738 return MVT::v16i8;
2739 // TODO: Can SSE1 handle a byte vector?
2740 // If we have SSE1 registers we should be able to use them.
2741 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2742 (Subtarget.getPreferVectorWidth() >= 128))
2743 return MVT::v4f32;
2744 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2745 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2746 // Do not use f64 to lower memcpy if source is string constant. It's
2747 // better to use i32 to avoid the loads.
2748 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2749 // The gymnastics of splatting a byte value into an XMM register and then
2750 // only using 8-byte stores (because this is a CPU with slow unaligned
2751 // 16-byte accesses) makes that a loser.
2752 return MVT::f64;
2753 }
2754 }
2755 // This is a compromise. If we reach here, unaligned accesses may be slow on
2756 // this target. However, creating smaller, aligned accesses could be even
2757 // slower and would certainly be a lot more code.
2758 if (Subtarget.is64Bit() && Op.size() >= 8)
2759 return MVT::i64;
2760 return MVT::i32;
2761}
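
// A reduced standalone sketch (hypothetical helper, not part of the lowering)
// of the type-selection cascade above, keeping only the width/feature checks
// and dropping the NoImplicitFloat, preferred-vector-width and slow-unaligned
// guards: it shows which store width the lowering gravitates to for a given
// memory-op size.
static const char *sketchMemOpType(unsigned long long Size, bool HasAVX512,
                                   bool HasBWI, bool HasAVX, bool Light256,
                                   bool HasSSE2, bool Is64Bit) {
  if (Size >= 64 && HasAVX512)
    return HasBWI ? "v64i8" : "v16i32";
  if (Size >= 32 && HasAVX && Light256)
    return "v32i8";
  if (Size >= 16 && HasSSE2)
    return "v16i8";
  if (Is64Bit && Size >= 8)
    return "i64";
  return "i32";
}
// e.g. a 128-byte memset on an AVX-512BW target maps to "v64i8", while the
// same memset with only SSE2 maps to "v16i8".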
2762
2763bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2764 if (VT == MVT::f32)
2765 return Subtarget.hasSSE1();
2766 if (VT == MVT::f64)
2767 return Subtarget.hasSSE2();
2768 return true;
2769}
2770
2771static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2772 return (8 * Alignment.value()) % SizeInBits == 0;
2773}
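
// Worked example for isBitAligned above (a standalone sketch, not used by the
// lowering): an access is considered bit-aligned when 8 * alignment-in-bytes
// is a multiple of the access size in bits.
static bool sketchIsBitAligned(unsigned long long AlignBytes,
                               unsigned long long SizeInBits) {
  return (8 * AlignBytes) % SizeInBits == 0;
}
// sketchIsBitAligned(32, 256) == true   (32-byte aligned 256-bit access)
// sketchIsBitAligned(16, 256) == false  (falls through to the slow-unaligned
//                                        checks in isMemoryAccessFast below)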
2774
2775bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2776 if (isBitAligned(Alignment, VT.getSizeInBits()))
2777 return true;
2778 switch (VT.getSizeInBits()) {
2779 default:
2780 // 8-byte and under are always assumed to be fast.
2781 return true;
2782 case 128:
2783 return !Subtarget.isUnalignedMem16Slow();
2784 case 256:
2785 return !Subtarget.isUnalignedMem32Slow();
2786 // TODO: What about AVX-512 (512-bit) accesses?
2787 }
2788}
2789
2790bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2791 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793 if (Fast)
2794 *Fast = isMemoryAccessFast(VT, Alignment);
2795 // NonTemporal vector memory ops must be aligned.
2796 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2797     // NT loads can only be vector aligned, so if it's less aligned than the
2798 // minimum vector size (which we can split the vector down to), we might as
2799 // well use a regular unaligned vector load.
2800 // We don't have any NT loads pre-SSE41.
2801 if (!!(Flags & MachineMemOperand::MOLoad))
2802 return (Alignment < 16 || !Subtarget.hasSSE41());
2803 return false;
2804 }
2805 // Misaligned accesses of any size are always allowed.
2806 return true;
2807}
2808
2809bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2810 const DataLayout &DL, EVT VT,
2811 unsigned AddrSpace, Align Alignment,
2812 MachineMemOperand::Flags Flags,
2813 unsigned *Fast) const {
2814 if (Fast)
2815 *Fast = isMemoryAccessFast(VT, Alignment);
2816 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2817 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2818 /*Fast=*/nullptr))
2819 return true;
2820 // NonTemporal vector memory ops are special, and must be aligned.
2821 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2822 return false;
2823 switch (VT.getSizeInBits()) {
2824 case 128:
2825 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2826 return true;
2827 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2828 return true;
2829 return false;
2830 case 256:
2831 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2832 return true;
2833 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2834 return true;
2835 return false;
2836 case 512:
2837 if (Subtarget.hasAVX512())
2838 return true;
2839 return false;
2840 default:
2841 return false; // Don't have NonTemporal vector memory ops of this size.
2842 }
2843 }
2844 return true;
2845}
2846
2847/// Return the entry encoding for a jump table in the
2848/// current function. The returned value is a member of the
2849/// MachineJumpTableInfo::JTEntryKind enum.
2850unsigned X86TargetLowering::getJumpTableEncoding() const {
2851 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2852 // symbol.
2853 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2854 return MachineJumpTableInfo::EK_Custom32;
2855
2856 // Otherwise, use the normal jump table encoding heuristics.
2857 return TargetLowering::getJumpTableEncoding();
2858}
2859
2860bool X86TargetLowering::splitValueIntoRegisterParts(
2861 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2862 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2863 bool IsABIRegCopy = CC.has_value();
2864 EVT ValueVT = Val.getValueType();
2865 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2866 unsigned ValueBits = ValueVT.getSizeInBits();
2867 unsigned PartBits = PartVT.getSizeInBits();
2868 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2869 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2870 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2871 Parts[0] = Val;
2872 return true;
2873 }
2874 return false;
2875}
2876
2877SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2878 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2879 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2882 unsigned ValueBits = ValueVT.getSizeInBits();
2883 unsigned PartBits = PartVT.getSizeInBits();
2884 SDValue Val = Parts[0];
2885
2886 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2888 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2889 return Val;
2890 }
2891 return SDValue();
2892}
2893
2894bool X86TargetLowering::useSoftFloat() const {
2895 return Subtarget.useSoftFloat();
2896}
2897
2898void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2899 ArgListTy &Args) const {
2900
2901 // Only relabel X86-32 for C / Stdcall CCs.
2902 if (Subtarget.is64Bit())
2903 return;
2904 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2905 return;
2906 unsigned ParamRegs = 0;
2907 if (auto *M = MF->getFunction().getParent())
2908 ParamRegs = M->getNumberRegisterParameters();
2909
2910   // Mark the first N integer arguments as being passed in registers.
2911 for (auto &Arg : Args) {
2912 Type *T = Arg.Ty;
2913 if (T->isIntOrPtrTy())
2914 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2915 unsigned numRegs = 1;
2916 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2917 numRegs = 2;
2918 if (ParamRegs < numRegs)
2919 return;
2920 ParamRegs -= numRegs;
2921 Arg.IsInReg = true;
2922 }
2923 }
2924}
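
// A standalone sketch (hypothetical helper, not part of the lowering) of the
// register-parameter accounting above: each integer or pointer argument of at
// most 4 bytes consumes one register, a 5..8 byte one consumes two, and
// marking stops as soon as the remaining budget is too small.
static void sketchMarkInRegArgs(const unsigned *ArgSizes, bool *InReg,
                                unsigned NumArgs, unsigned ParamRegs) {
  for (unsigned I = 0; I != NumArgs; ++I) {
    unsigned NumRegs = ArgSizes[I] > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      return; // Budget exhausted; this and later arguments stay on the stack.
    ParamRegs -= NumRegs;
    InReg[I] = true;
  }
}
// With ParamRegs == 3 and argument sizes {4, 8, 4}, the first two arguments
// are marked in-reg (1 + 2 registers) and the third is left on the stack.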
2925
2926const MCExpr *
2927X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2928 const MachineBasicBlock *MBB,
2929 unsigned uid,MCContext &Ctx) const{
2930   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2931 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2932 // entries.
2933 return MCSymbolRefExpr::create(MBB->getSymbol(),
2934 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2935}
2936
2937/// Returns relocation base for the given PIC jumptable.
2938SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2939 SelectionDAG &DAG) const {
2940 if (!Subtarget.is64Bit())
2941 // This doesn't have SDLoc associated with it, but is not really the
2942 // same as a Register.
2943 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2944 getPointerTy(DAG.getDataLayout()));
2945 return Table;
2946}
2947
2948/// This returns the relocation base for the given PIC jumptable,
2949/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2950const MCExpr *X86TargetLowering::
2951getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2952 MCContext &Ctx) const {
2953 // X86-64 uses RIP relative addressing based on the jump table label.
2954 if (Subtarget.isPICStyleRIPRel())
2955 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2956
2957 // Otherwise, the reference is relative to the PIC base.
2958 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2959}
2960
2961std::pair<const TargetRegisterClass *, uint8_t>
2962X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2963 MVT VT) const {
2964 const TargetRegisterClass *RRC = nullptr;
2965 uint8_t Cost = 1;
2966 switch (VT.SimpleTy) {
2967 default:
2968 return TargetLowering::findRepresentativeClass(TRI, VT);
2969 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2970 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2971 break;
2972 case MVT::x86mmx:
2973 RRC = &X86::VR64RegClass;
2974 break;
2975 case MVT::f32: case MVT::f64:
2976 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2977 case MVT::v4f32: case MVT::v2f64:
2978 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2979 case MVT::v8f32: case MVT::v4f64:
2980 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2981 case MVT::v16f32: case MVT::v8f64:
2982 RRC = &X86::VR128XRegClass;
2983 break;
2984 }
2985 return std::make_pair(RRC, Cost);
2986}
2987
2988unsigned X86TargetLowering::getAddressSpace() const {
2989 if (Subtarget.is64Bit())
2990 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2991 return 256;
2992}
2993
2994static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2995 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2996 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2997}
2998
2999static Constant* SegmentOffset(IRBuilderBase &IRB,
3000 int Offset, unsigned AddressSpace) {
3001 return ConstantExpr::getIntToPtr(
3002 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3003 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3004}
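
// A standalone illustration (hypothetical helper, not part of the lowering) of
// what SegmentOffset builds: a constant address in one of the x86 segment
// address spaces. On X86, address space 256 corresponds to %gs, 257 to %fs and
// 258 to %ss, so the common 64-bit Linux stack guard %fs:0x28 is "offset 0x28
// in address space 257" and the i386 %gs:0x14 guard is "offset 0x14 in address
// space 256"; the real code materializes the same pair as an inttoptr
// ConstantExpr. This mirrors the defaults chosen in getIRStackGuard below.
struct SketchSegmentPtr {
  unsigned AddressSpace; // 256 = %gs, 257 = %fs, 258 = %ss
  int Offset;            // byte offset within the segment
};
static SketchSegmentPtr sketchDefaultStackGuardSlot(bool Is64Bit,
                                                    bool KernelCodeModel) {
  unsigned AS = Is64Bit ? (KernelCodeModel ? 256u : 257u) : 256u;
  int Offset = Is64Bit ? 0x28 : 0x14;
  return {AS, Offset};
}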
3005
3006Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3007 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3008 // tcbhead_t; use it instead of the usual global variable (see
3009 // sysdeps/{i386,x86_64}/nptl/tls.h)
3010 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3011 if (Subtarget.isTargetFuchsia()) {
3012 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3013 return SegmentOffset(IRB, 0x10, getAddressSpace());
3014 } else {
3015 unsigned AddressSpace = getAddressSpace();
3016 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3017      // Specifically, some users may customize the base register and offset.
3018 int Offset = M->getStackProtectorGuardOffset();
3019 // If we don't set -stack-protector-guard-offset value:
3020 // %fs:0x28, unless we're using a Kernel code model, in which case
3021 // it's %gs:0x28. gs:0x14 on i386.
3022      if (Offset == INT_MAX)
3023 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3024
3025 StringRef GuardReg = M->getStackProtectorGuardReg();
3026 if (GuardReg == "fs")
3027 AddressSpace = X86AS::FS;
3028 else if (GuardReg == "gs")
3029 AddressSpace = X86AS::GS;
3030
3031      // Use the symbol guard if the user specified one.
3032 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3033 if (!GuardSymb.empty()) {
3034 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3035 if (!GV) {
3036 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3037 : Type::getInt32Ty(M->getContext());
3038 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3039 nullptr, GuardSymb, nullptr,
3040 GlobalValue::NotThreadLocal, AddressSpace);
3041 }
3042 return GV;
3043 }
3044
3045 return SegmentOffset(IRB, Offset, AddressSpace);
3046 }
3047 }
3048 return TargetLowering::getIRStackGuard(IRB);
3049}
3050
3051void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3052 // MSVC CRT provides functionalities for stack protection.
3053 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3054 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3055 // MSVC CRT has a global variable holding security cookie.
3056 M.getOrInsertGlobal("__security_cookie",
3057 Type::getInt8PtrTy(M.getContext()));
3058
3059 // MSVC CRT has a function to validate security cookie.
3060 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3061 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3062 Type::getInt8PtrTy(M.getContext()));
3063 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3064 F->setCallingConv(CallingConv::X86_FastCall);
3065 F->addParamAttr(0, Attribute::AttrKind::InReg);
3066 }
3067 return;
3068 }
3069
3070 StringRef GuardMode = M.getStackProtectorGuard();
3071
3072 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3073 if ((GuardMode == "tls" || GuardMode.empty()) &&
3074 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3075 return;
3076 TargetLowering::insertSSPDeclarations(M);
3077}
3078
3079Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3080 // MSVC CRT has a global variable holding security cookie.
3081 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3082 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3083 return M.getGlobalVariable("__security_cookie");
3084 }
3085 return TargetLowering::getSDagStackGuard(M);
3086}
3087
3088Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3089 // MSVC CRT has a function to validate security cookie.
3090 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3091 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3092 return M.getFunction("__security_check_cookie");
3093 }
3094 return TargetLowering::getSSPStackGuardCheck(M);
3095}
3096
3097Value *
3098X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3099 if (Subtarget.getTargetTriple().isOSContiki())
3100 return getDefaultSafeStackPointerLocation(IRB, false);
3101
3102 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3103 // definition of TLS_SLOT_SAFESTACK in
3104 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3105 if (Subtarget.isTargetAndroid()) {
3106     // %fs:0x48, unless we're using a Kernel code model, in which case it's
3107     // %gs:0x48; %gs:0x24 on i386.
3108 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3109 return SegmentOffset(IRB, Offset, getAddressSpace());
3110 }
3111
3112 // Fuchsia is similar.
3113 if (Subtarget.isTargetFuchsia()) {
3114 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3115 return SegmentOffset(IRB, 0x18, getAddressSpace());
3116 }
3117
3118 return TargetLowering::getSafeStackPointerLocation(IRB);
3119}
3120
3121//===----------------------------------------------------------------------===//
3122// Return Value Calling Convention Implementation
3123//===----------------------------------------------------------------------===//
3124
3125bool X86TargetLowering::CanLowerReturn(
3126 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3127 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3128 SmallVector<CCValAssign, 16> RVLocs;
3129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3130 return CCInfo.CheckReturn(Outs, RetCC_X86);
3131}
3132
3133const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3134 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3135 return ScratchRegs;
3136}
3137
3138ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3139 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3140 // tests at the moment, which is not what we expected.
3141 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3142 return RCRegs;
3143}
3144
3145/// Lowers mask values (v*i1) to the local register values
3146/// \returns DAG node after lowering to register type
3147static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3148 const SDLoc &Dl, SelectionDAG &DAG) {
3149 EVT ValVT = ValArg.getValueType();
3150
3151 if (ValVT == MVT::v1i1)
3152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3153 DAG.getIntPtrConstant(0, Dl));
3154
3155 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3156 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3157 // Two stage lowering might be required
3158 // bitcast: v8i1 -> i8 / v16i1 -> i16
3159 // anyextend: i8 -> i32 / i16 -> i32
3160 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3161 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3162 if (ValLoc == MVT::i32)
3163 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3164 return ValToCopy;
3165 }
3166
3167 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3168 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3169 // One stage lowering is required
3170 // bitcast: v32i1 -> i32 / v64i1 -> i64
3171 return DAG.getBitcast(ValLoc, ValArg);
3172 }
3173
3174 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3175}
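
// A standalone sketch (hypothetical helper, not part of the lowering) of what
// the v8i1 -> i8 bitcast in lowerMasksToReg means at the bit level: each mask
// lane becomes one bit of the scalar, with lane 0 in bit 0.
static unsigned sketchPackV8i1(const bool Lanes[8]) {
  unsigned Bits = 0;
  for (unsigned I = 0; I != 8; ++I)
    Bits |= static_cast<unsigned>(Lanes[I]) << I;
  return Bits; // Fits in i8; a later ANY_EXTEND to i32 leaves the high bits
               // unspecified, matching the two-stage path above.
}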
3176
3177/// Breaks v64i1 value into two registers and adds the new node to the DAG
3178static void Passv64i1ArgInRegs(
3179 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3180 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3181 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3182   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3183   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3184   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3185   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3186          "The value should reside in two registers");
3187
3188 // Before splitting the value we cast it to i64
3189 Arg = DAG.getBitcast(MVT::i64, Arg);
3190
3191 // Splitting the value into two i32 types
3192 SDValue Lo, Hi;
3193 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3194
3195 // Attach the two i32 types into corresponding registers
3196 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3197 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3198}
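
// A standalone sketch (hypothetical helper, not part of the lowering) of the
// split performed by Passv64i1ArgInRegs: once the v64i1 value has been bitcast
// to i64, SplitScalar hands the low 32 bits to the first register and the high
// 32 bits to the second.
static void sketchSplitMask64(unsigned long long Mask64, unsigned &Lo,
                              unsigned &Hi) {
  Lo = static_cast<unsigned>(Mask64 & 0xffffffffu); // -> VA.getLocReg()
  Hi = static_cast<unsigned>(Mask64 >> 32);         // -> NextVA.getLocReg()
}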
3199
3200SDValue
3201X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3202 bool isVarArg,
3203 const SmallVectorImpl<ISD::OutputArg> &Outs,
3204 const SmallVectorImpl<SDValue> &OutVals,
3205 const SDLoc &dl, SelectionDAG &DAG) const {
3206 MachineFunction &MF = DAG.getMachineFunction();
3207 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3208
3209 // In some cases we need to disable registers from the default CSR list.
3210 // For example, when they are used as return registers (preserve_* and X86's
3211 // regcall) or for argument passing (X86's regcall).
3212 bool ShouldDisableCalleeSavedRegister =
3213 shouldDisableRetRegFromCSR(CallConv) ||
3214 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3215
3216 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3217 report_fatal_error("X86 interrupts may not return any value");
3218
3219 SmallVector<CCValAssign, 16> RVLocs;
3220 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3221 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3222
3223 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3224 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3225 ++I, ++OutsIndex) {
3226 CCValAssign &VA = RVLocs[I];
3227     assert(VA.isRegLoc() && "Can only return in registers!");
3228
3229 // Add the register to the CalleeSaveDisableRegs list.
3230 if (ShouldDisableCalleeSavedRegister)
3231 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3232
3233 SDValue ValToCopy = OutVals[OutsIndex];
3234 EVT ValVT = ValToCopy.getValueType();
3235
3236 // Promote values to the appropriate types.
3237 if (VA.getLocInfo() == CCValAssign::SExt)
3238 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3239 else if (VA.getLocInfo() == CCValAssign::ZExt)
3240 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3241 else if (VA.getLocInfo() == CCValAssign::AExt) {
3242 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3243 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3244 else
3245 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3246 }
3247 else if (VA.getLocInfo() == CCValAssign::BCvt)
3248 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3249
3250     assert(VA.getLocInfo() != CCValAssign::FPExt &&
3251            "Unexpected FP-extend for return value.");
3252
3253 // Report an error if we have attempted to return a value via an XMM
3254 // register and SSE was disabled.
3255 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3256 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3257 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3258 } else if (!Subtarget.hasSSE2() &&
3259 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3260 ValVT == MVT::f64) {
3261 // When returning a double via an XMM register, report an error if SSE2 is
3262 // not enabled.
3263 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3264 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3265 }
3266
3267 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3268 // the RET instruction and handled by the FP Stackifier.
3269 if (VA.getLocReg() == X86::FP0 ||
3270 VA.getLocReg() == X86::FP1) {
3271 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3272 // change the value to the FP stack register class.
3273 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3274 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3275 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3276 // Don't emit a copytoreg.
3277 continue;
3278 }
3279
3280 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3281 // which is returned in RAX / RDX.
3282 if (Subtarget.is64Bit()) {
3283 if (ValVT == MVT::x86mmx) {
3284 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3285 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3286 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3287 ValToCopy);
3288 // If we don't have SSE2 available, convert to v4f32 so the generated
3289 // register is legal.
3290 if (!Subtarget.hasSSE2())
3291 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3292 }
3293 }
3294 }
3295
3296 if (VA.needsCustom()) {
3297       assert(VA.getValVT() == MVT::v64i1 &&
3298              "Currently the only custom case is when we split v64i1 to 2 regs");
3299
3300 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3301 Subtarget);
3302
3303 // Add the second register to the CalleeSaveDisableRegs list.
3304 if (ShouldDisableCalleeSavedRegister)
3305 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3306 } else {
3307 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3308 }
3309 }
3310
3311 SDValue Glue;
3312 SmallVector<SDValue, 6> RetOps;
3313 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3314 // Operand #1 = Bytes To Pop
3315 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3316 MVT::i32));
3317
3318 // Copy the result values into the output registers.
3319 for (auto &RetVal : RetVals) {
3320 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3321 RetOps.push_back(RetVal.second);
3322 continue; // Don't emit a copytoreg.
3323 }
3324
3325 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3326 Glue = Chain.getValue(1);
3327 RetOps.push_back(
3328 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3329 }
3330
3331 // Swift calling convention does not require we copy the sret argument
3332 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3333
3334 // All x86 ABIs require that for returning structs by value we copy
3335 // the sret argument into %rax/%eax (depending on ABI) for the return.
3336 // We saved the argument into a virtual register in the entry block,
3337 // so now we copy the value out and into %rax/%eax.
3338 //
3339 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3340 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3341 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3342 // either case FuncInfo->setSRetReturnReg() will have been called.
3343 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3344 // When we have both sret and another return value, we should use the
3345 // original Chain stored in RetOps[0], instead of the current Chain updated
3346     // in the above loop. If we only have sret, RetOps[0] equals Chain.
3347
3348 // For the case of sret and another return value, we have
3349 // Chain_0 at the function entry
3350 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3351 // If we use Chain_1 in getCopyFromReg, we will have
3352 // Val = getCopyFromReg(Chain_1)
3353 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3354
3355 // getCopyToReg(Chain_0) will be glued together with
3356 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3357 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3358 // Data dependency from Unit B to Unit A due to usage of Val in
3359 // getCopyToReg(Chain_1, Val)
3360 // Chain dependency from Unit A to Unit B
3361
3362 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3363 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3364 getPointerTy(MF.getDataLayout()));
3365
3366 Register RetValReg
3367 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3368 X86::RAX : X86::EAX;
3369 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3370 Glue = Chain.getValue(1);
3371
3372 // RAX/EAX now acts like a return value.
3373 RetOps.push_back(
3374 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3375
3376 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3377 // this however for preserve_most/preserve_all to minimize the number of
3378 // callee-saved registers for these CCs.
3379 if (ShouldDisableCalleeSavedRegister &&
3380 CallConv != CallingConv::PreserveAll &&
3381 CallConv != CallingConv::PreserveMost)
3382 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3383 }
3384
3385 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3386 const MCPhysReg *I =
3387 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3388 if (I) {
3389 for (; *I; ++I) {
3390 if (X86::GR64RegClass.contains(*I))
3391 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3392 else
3393         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3394 }
3395 }
3396
3397 RetOps[0] = Chain; // Update chain.
3398
3399 // Add the glue if we have it.
3400 if (Glue.getNode())
3401 RetOps.push_back(Glue);
3402
3403 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3404 if (CallConv == CallingConv::X86_INTR)
3405 opcode = X86ISD::IRET;
3406 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3407}
3408
3409bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3410 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3411 return false;
3412
3413 SDValue TCChain = Chain;
3414 SDNode *Copy = *N->use_begin();
3415 if (Copy->getOpcode() == ISD::CopyToReg) {
3416 // If the copy has a glue operand, we conservatively assume it isn't safe to
3417 // perform a tail call.
3418 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3419 return false;
3420 TCChain = Copy->getOperand(0);
3421 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3422 return false;
3423
3424 bool HasRet = false;
3425 for (const SDNode *U : Copy->uses()) {
3426 if (U->getOpcode() != X86ISD::RET_GLUE)
3427 return false;
3428 // If we are returning more than one value, we can definitely
3429     // not make a tail call; see PR19530.
3430 if (U->getNumOperands() > 4)
3431 return false;
3432 if (U->getNumOperands() == 4 &&
3433 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3434 return false;
3435 HasRet = true;
3436 }
3437
3438 if (!HasRet)
3439 return false;
3440
3441 Chain = TCChain;
3442 return true;
3443}
3444
3445EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3446 ISD::NodeType ExtendKind) const {
3447 MVT ReturnMVT = MVT::i32;
3448
3449 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3450 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3451 // The ABI does not require i1, i8 or i16 to be extended.
3452 //
3453 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3454 // always extending i8/i16 return values, so keep doing that for now.
3455 // (PR26665).
3456 ReturnMVT = MVT::i8;
3457 }
3458
3459 EVT MinVT = getRegisterType(Context, ReturnMVT);
3460 return VT.bitsLT(MinVT) ? MinVT : VT;
3461}
3462
3463/// Reads two 32 bit registers and creates a 64 bit mask value.
3464/// \param VA The current 32 bit value that needs to be assigned.
3465/// \param NextVA The next 32 bit value that needs to be assigned.
3466/// \param Root The parent DAG node.
3467/// \param [in,out] InGlue Represents the SDValue in the parent DAG node for
3468///                        glue purposes. In case the DAG is already using a
3469///                        physical register instead of a virtual one, we
3470///                        should glue our new SDValue to the InGlue SDValue.
3471/// \return a new SDValue of size 64 bits.
3472static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3473 SDValue &Root, SelectionDAG &DAG,
3474 const SDLoc &Dl, const X86Subtarget &Subtarget,
3475 SDValue *InGlue = nullptr) {
3476   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3477   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3478   assert(VA.getValVT() == MVT::v64i1 &&
3479          "Expecting first location of 64 bit width type");
3480   assert(NextVA.getValVT() == VA.getValVT() &&
3481          "The locations should have the same type");
3482   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3483          "The values should reside in two registers");
3484
3485 SDValue Lo, Hi;
3486 SDValue ArgValueLo, ArgValueHi;
3487
3488 MachineFunction &MF = DAG.getMachineFunction();
3489 const TargetRegisterClass *RC = &X86::GR32RegClass;
3490
3491 // Read a 32 bit value from the registers.
3492 if (nullptr == InGlue) {
3493 // When no physical register is present,
3494 // create an intermediate virtual register.
3495 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3496 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3497 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3498 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3499 } else {
3500 // When a physical register is available read the value from it and glue
3501 // the reads together.
3502 ArgValueLo =
3503 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3504 *InGlue = ArgValueLo.getValue(2);
3505 ArgValueHi =
3506 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3507 *InGlue = ArgValueHi.getValue(2);
3508 }
3509
3510 // Convert the i32 type into v32i1 type.
3511 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3512
3513 // Convert the i32 type into v32i1 type.
3514 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3515
3516 // Concatenate the two values together.
3517 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3518}
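
// The inverse of the split sketched after Passv64i1ArgInRegs (again a
// hypothetical helper, not part of the lowering): getv64i1Argument's two i32
// reads recombine into one 64-lane mask, with the first register providing the
// low 32 lanes.
static unsigned long long sketchJoinMask64(unsigned Lo, unsigned Hi) {
  return (static_cast<unsigned long long>(Hi) << 32) | Lo;
}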
3519
3520/// The function will lower a register of various sizes (8/16/32/64)
3521/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3522/// \returns a DAG node containing the operand after lowering to a mask type.
3523static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3524 const EVT &ValLoc, const SDLoc &Dl,
3525 SelectionDAG &DAG) {
3526 SDValue ValReturned = ValArg;
3527
3528 if (ValVT == MVT::v1i1)
3529 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3530
3531 if (ValVT == MVT::v64i1) {
3532     // On a 32-bit machine this case is handled by getv64i1Argument.
3533     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3534     // On a 64-bit machine there is no need to truncate the value, only bitcast it.
3535 } else {
3536 MVT maskLen;
3537 switch (ValVT.getSimpleVT().SimpleTy) {
3538 case MVT::v8i1:
3539 maskLen = MVT::i8;
3540 break;
3541 case MVT::v16i1:
3542 maskLen = MVT::i16;
3543 break;
3544 case MVT::v32i1:
3545 maskLen = MVT::i32;
3546 break;
3547 default:
3548       llvm_unreachable("Expecting a vector of i1 types");
3549 }
3550
3551 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3552 }
3553 return DAG.getBitcast(ValVT, ValReturned);
3554}
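
// The inverse of the packing sketch after lowerMasksToReg (a hypothetical
// helper, not part of the lowering): lowerRegToMasks truncates the returned
// scalar and bitcasts it back to a mask, i.e. each lane is read from the
// corresponding low bit.
static void sketchUnpackToV8i1(unsigned Bits, bool Lanes[8]) {
  for (unsigned I = 0; I != 8; ++I)
    Lanes[I] = (Bits >> I) & 1u; // Bits above bit 7 are dropped, which is what
                                 // the TRUNCATE above does.
}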
3555
3556/// Lower the result values of a call into the
3557/// appropriate copies out of appropriate physical registers.
3558///
3559SDValue X86TargetLowering::LowerCallResult(
3560 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3561 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3562 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3563 uint32_t *RegMask) const {
3564
3565 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3566 // Assign locations to each value returned by this call.
3567 SmallVector<CCValAssign, 16> RVLocs;
3568 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3569 *DAG.getContext());
3570 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3571
3572 // Copy all of the result registers out of their specified physreg.
3573 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3574 ++I, ++InsIndex) {
3575 CCValAssign &VA = RVLocs[I];
3576 EVT CopyVT = VA.getLocVT();
3577
3578 // In some calling conventions we need to remove the used registers
3579 // from the register mask.
3580 if (RegMask) {
3581 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3582 SubRegs.isValid(); ++SubRegs)
3583 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3584 }
3585
3586 // Report an error if there was an attempt to return FP values via XMM
3587 // registers.
3588 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3589 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3590 if (VA.getLocReg() == X86::XMM1)
3591 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3592 else
3593 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3594 } else if (!Subtarget.hasSSE2() &&
3595 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3596 CopyVT == MVT::f64) {
3597 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3598 if (VA.getLocReg() == X86::XMM1)
3599 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3600 else
3601 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3602 }
3603
3604 // If we prefer to use the value in xmm registers, copy it out as f80 and
3605 // use a truncate to move it from fp stack reg to xmm reg.
3606 bool RoundAfterCopy = false;
3607 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3608 isScalarFPTypeInSSEReg(VA.getValVT())) {
3609 if (!Subtarget.hasX87())
3610 report_fatal_error("X87 register return with X87 disabled");
3611 CopyVT = MVT::f80;
3612 RoundAfterCopy = (CopyVT != VA.getLocVT());
3613 }
3614
3615 SDValue Val;
3616 if (VA.needsCustom()) {
3617       assert(VA.getValVT() == MVT::v64i1 &&
3618              "Currently the only custom case is when we split v64i1 to 2 regs");
3619 Val =
3620 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3621 } else {
3622 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3623 .getValue(1);
3624 Val = Chain.getValue(0);
3625 InGlue = Chain.getValue(2);
3626 }
3627
3628 if (RoundAfterCopy)
3629 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3630 // This truncation won't change the value.
3631 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3632
3633 if (VA.isExtInLoc()) {
3634 if (VA.getValVT().isVector() &&
3635 VA.getValVT().getScalarType() == MVT::i1 &&
3636 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3637 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3638 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3639 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3640 } else
3641 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3642 }
3643
3644 if (VA.getLocInfo() == CCValAssign::BCvt)
3645 Val = DAG.getBitcast(VA.getValVT(), Val);
3646
3647 InVals.push_back(Val);
3648 }
3649
3650 return Chain;
3651}
3652
3653//===----------------------------------------------------------------------===//
3654// C & StdCall & Fast Calling Convention implementation
3655//===----------------------------------------------------------------------===//
3656 // The StdCall calling convention is the standard for many Windows API
3657 // routines. It differs from the C calling convention only slightly: the
3658 // callee cleans up the stack instead of the caller, and symbols are
3659 // decorated in a special way. It doesn't support any vector arguments.
3660// For info on fast calling convention see Fast Calling Convention (tail call)
3661// implementation LowerX86_32FastCCCallTo.
3662
3663/// Determines whether Args, either a set of outgoing arguments to a call, or a
3664/// set of incoming args of a call, contains an sret pointer that the callee
3665/// pops
3666template <typename T>
3667static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3668 const X86Subtarget &Subtarget) {
3669 // Not C++20 (yet), so no concepts available.
3670 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3671 std::is_same_v<T, ISD::InputArg>,
3672 "requires ISD::OutputArg or ISD::InputArg");
3673
3674 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3675 // for most compilations.
3676 if (!Subtarget.is32Bit())
3677 return false;
3678
3679 if (Args.empty())
3680 return false;
3681
3682 // Most calls do not have an sret argument, check the arg next.
3683 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3684 if (!Flags.isSRet() || Flags.isInReg())
3685 return false;
3686
3687  // The MSVC ABI does not pop the sret.
3688 if (Subtarget.getTargetTriple().isOSMSVCRT())
3689 return false;
3690
3691 // MCUs don't pop the sret
3692 if (Subtarget.isTargetMCU())
3693 return false;
3694
3695 // Callee pops argument
3696 return true;
3697}
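// For example (illustrative): on a 32-bit ELF target, a call whose first
// argument carries the sret flag (and is not inreg) makes hasCalleePopSRet
// return true, so the callee is expected to pop the 4-byte hidden pointer
// (a "ret 4"). On MSVC and MCU targets, and on any 64-bit target, it returns
// false and the pointer is left for the caller.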
3698
3699/// Make a copy of an aggregate at address specified by "Src" to address
3700/// "Dst" with size and alignment information specified by the specific
3701/// parameter attribute. The copy will be passed as a byval function parameter.
3702static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3703 SDValue Chain, ISD::ArgFlagsTy Flags,
3704 SelectionDAG &DAG, const SDLoc &dl) {
3705 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3706
3707 return DAG.getMemcpy(
3708 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3709 /*isVolatile*/ false, /*AlwaysInline=*/true,
3710 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3711}
3712
3713/// Return true if the calling convention is one that we can guarantee TCO for.
3714static bool canGuaranteeTCO(CallingConv::ID CC) {
3715 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3716 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3717 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3718}
3719
3720/// Return true if we might ever do TCO for calls with this calling convention.
3721static bool mayTailCallThisCC(CallingConv::ID CC) {
3722 switch (CC) {
3723 // C calling conventions:
3724 case CallingConv::C:
3725 case CallingConv::Win64:
3726 case CallingConv::X86_64_SysV:
3727 // Callee pop conventions:
3728 case CallingConv::X86_ThisCall:
3729 case CallingConv::X86_StdCall:
3730 case CallingConv::X86_VectorCall:
3731 case CallingConv::X86_FastCall:
3732 // Swift:
3733 case CallingConv::Swift:
3734 return true;
3735 default:
3736 return canGuaranteeTCO(CC);
3737 }
3738}
3739
3740/// Return true if the function is being made into a tailcall target by
3741/// changing its ABI.
3742static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3743 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3744 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3745}
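// Putting the predicates above together (illustrative values):
//   shouldGuaranteeTCO(CallingConv::Fast,      false) == false  // needs -tailcallopt
//   shouldGuaranteeTCO(CallingConv::Fast,      true)  == true
//   shouldGuaranteeTCO(CallingConv::Tail,      false) == true   // tailcc always
//   shouldGuaranteeTCO(CallingConv::SwiftTail, false) == true
//   shouldGuaranteeTCO(CallingConv::C,         true)  == false  // C can't guarantee TCO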
3746
3747bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3748 if (!CI->isTailCall())
3749 return false;
3750
3751 CallingConv::ID CalleeCC = CI->getCallingConv();
3752 if (!mayTailCallThisCC(CalleeCC))
3753 return false;
3754
3755 return true;
3756}
3757
3758SDValue
3759X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3760 const SmallVectorImpl<ISD::InputArg> &Ins,
3761 const SDLoc &dl, SelectionDAG &DAG,
3762 const CCValAssign &VA,
3763 MachineFrameInfo &MFI, unsigned i) const {
3764 // Create the nodes corresponding to a load from this parameter slot.
3765 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3766 bool AlwaysUseMutable = shouldGuaranteeTCO(
3767 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3768 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3769 EVT ValVT;
3770 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3771
3772  // If the value is passed by pointer, we receive the address instead of the
3773  // value itself. No need to extend if the mask value and location share the
3774  // same absolute size.
3775 bool ExtendedInMem =
3776 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3777 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3778
3779 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3780 ValVT = VA.getLocVT();
3781 else
3782 ValVT = VA.getValVT();
3783
3784 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3785 // changed with more analysis.
3786  // In case of tail call optimization, mark all arguments mutable, since they
3787  // could be overwritten by the lowering of arguments of a tail call.
3788 if (Flags.isByVal()) {
3789 unsigned Bytes = Flags.getByValSize();
3790 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3791
3792 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3793 // can be improved with deeper analysis.
3794 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3795 /*isAliased=*/true);
3796 return DAG.getFrameIndex(FI, PtrVT);
3797 }
3798
3799 EVT ArgVT = Ins[i].ArgVT;
3800
3801 // If this is a vector that has been split into multiple parts, and the
3802  // scalar size of the parts doesn't match the vector element size, then we can't
3803 // elide the copy. The parts will have padding between them instead of being
3804 // packed like a vector.
3805 bool ScalarizedAndExtendedVector =
3806 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3807 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3808
3809 // This is an argument in memory. We might be able to perform copy elision.
3810 // If the argument is passed directly in memory without any extension, then we
3811 // can perform copy elision. Large vector types, for example, may be passed
3812 // indirectly by pointer.
3813 if (Flags.isCopyElisionCandidate() &&
3814 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3815 !ScalarizedAndExtendedVector) {
3816 SDValue PartAddr;
3817 if (Ins[i].PartOffset == 0) {
3818 // If this is a one-part value or the first part of a multi-part value,
3819 // create a stack object for the entire argument value type and return a
3820 // load from our portion of it. This assumes that if the first part of an
3821 // argument is in memory, the rest will also be in memory.
3822 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3823 /*IsImmutable=*/false);
3824 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3825 return DAG.getLoad(
3826 ValVT, dl, Chain, PartAddr,
3827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3828 } else {
3829 // This is not the first piece of an argument in memory. See if there is
3830 // already a fixed stack object including this offset. If so, assume it
3831 // was created by the PartOffset == 0 branch above and create a load from
3832 // the appropriate offset into it.
3833 int64_t PartBegin = VA.getLocMemOffset();
3834 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3835 int FI = MFI.getObjectIndexBegin();
3836 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3837 int64_t ObjBegin = MFI.getObjectOffset(FI);
3838 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3839 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3840 break;
3841 }
3842 if (MFI.isFixedObjectIndex(FI)) {
3843 SDValue Addr =
3844 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3845 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3846 return DAG.getLoad(
3847 ValVT, dl, Chain, Addr,
3848 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3849 Ins[i].PartOffset));
3850 }
3851 }
3852 }
3853
3854 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3855 VA.getLocMemOffset(), isImmutable);
3856
3857 // Set SExt or ZExt flag.
3858 if (VA.getLocInfo() == CCValAssign::ZExt) {
3859 MFI.setObjectZExt(FI, true);
3860 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3861 MFI.setObjectSExt(FI, true);
3862 }
3863
3864 MaybeAlign Alignment;
3865 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3866 ValVT != MVT::f80)
3867 Alignment = MaybeAlign(4);
3868 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3869 SDValue Val = DAG.getLoad(
3870 ValVT, dl, Chain, FIN,
3871 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3872 Alignment);
3873 return ExtendedInMem
3874 ? (VA.getValVT().isVector()
3875 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3876 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3877 : Val;
3878}
3879
3880// FIXME: Get this from tablegen.
3881static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3882 const X86Subtarget &Subtarget) {
3883  assert(Subtarget.is64Bit());
3884
3885 if (Subtarget.isCallingConvWin64(CallConv)) {
3886 static const MCPhysReg GPR64ArgRegsWin64[] = {
3887 X86::RCX, X86::RDX, X86::R8, X86::R9
3888 };
3889 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3890 }
3891
3892 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3893 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3894 };
3895 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3896}
3897
3898// FIXME: Get this from tablegen.
3899static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3900 CallingConv::ID CallConv,
3901 const X86Subtarget &Subtarget) {
3902  assert(Subtarget.is64Bit());
3903 if (Subtarget.isCallingConvWin64(CallConv)) {
3904 // The XMM registers which might contain var arg parameters are shadowed
3905 // in their paired GPR. So we only need to save the GPR to their home
3906 // slots.
3907 // TODO: __vectorcall will change this.
3908 return std::nullopt;
3909 }
3910
3911 bool isSoftFloat = Subtarget.useSoftFloat();
3912 if (isSoftFloat || !Subtarget.hasSSE1())
3913 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3914 // registers.
3915 return std::nullopt;
3916
3917 static const MCPhysReg XMMArgRegs64Bit[] = {
3918 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3919 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3920 };
3921 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3922}
3923
3924#ifndef NDEBUG
3925static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3926 return llvm::is_sorted(
3927 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3928 return A.getValNo() < B.getValNo();
3929 });
3930}
3931#endif
3932
3933namespace {
3934/// This is a helper class for lowering variable arguments parameters.
3935class VarArgsLoweringHelper {
3936public:
3937 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3938 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3939 CallingConv::ID CallConv, CCState &CCInfo)
3940 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3941 TheMachineFunction(DAG.getMachineFunction()),
3942 TheFunction(TheMachineFunction.getFunction()),
3943 FrameInfo(TheMachineFunction.getFrameInfo()),
3944 FrameLowering(*Subtarget.getFrameLowering()),
3945 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3946 CCInfo(CCInfo) {}
3947
3948 // Lower variable arguments parameters.
3949 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3950
3951private:
3952 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3953
3954 void forwardMustTailParameters(SDValue &Chain);
3955
3956 bool is64Bit() const { return Subtarget.is64Bit(); }
3957 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3958
3959 X86MachineFunctionInfo *FuncInfo;
3960 const SDLoc &DL;
3961 SelectionDAG &DAG;
3962 const X86Subtarget &Subtarget;
3963 MachineFunction &TheMachineFunction;
3964 const Function &TheFunction;
3965 MachineFrameInfo &FrameInfo;
3966 const TargetFrameLowering &FrameLowering;
3967 const TargetLowering &TargLowering;
3968 CallingConv::ID CallConv;
3969 CCState &CCInfo;
3970};
3971} // namespace
3972
3973void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3974 SDValue &Chain, unsigned StackSize) {
3975  // If the function takes a variable number of arguments, make a frame index for
3976 // the start of the first vararg value... for expansion of llvm.va_start. We
3977 // can skip this if there are no va_start calls.
3978 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3979 CallConv != CallingConv::X86_ThisCall)) {
3980 FuncInfo->setVarArgsFrameIndex(
3981 FrameInfo.CreateFixedObject(1, StackSize, true));
3982 }
3983
3984 // 64-bit calling conventions support varargs and register parameters, so we
3985 // have to do extra work to spill them in the prologue.
3986 if (is64Bit()) {
3987 // Find the first unallocated argument registers.
3988 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3989 ArrayRef<MCPhysReg> ArgXMMs =
3990 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3991 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3992 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3993
3994    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3995           "SSE register cannot be used when SSE is disabled!");
3996
3997 if (isWin64()) {
3998 // Get to the caller-allocated home save location. Add 8 to account
3999 // for the return address.
4000 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4001 FuncInfo->setRegSaveFrameIndex(
4002 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4003 // Fixup to set vararg frame on shadow area (4 x i64).
4004 if (NumIntRegs < 4)
4005 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4006 } else {
4007 // For X86-64, if there are vararg parameters that are passed via
4008 // registers, then we must store them to their spots on the stack so
4009 // they may be loaded by dereferencing the result of va_next.
4010 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4011 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4012 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4013 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4014 }
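    // Worked example (SysV x86-64, illustrative): ArgGPRs has 6 entries and
    // ArgXMMs has 8, so the register save area is 6*8 + 8*16 = 176 bytes. If,
    // say, 2 GPRs and 3 XMMs were consumed by named arguments, the offsets are
    // VarArgsGPOffset = 2*8 = 16 and VarArgsFPOffset = 48 + 3*16 = 96.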
4015
4016 SmallVector<SDValue, 6>
4017 LiveGPRs; // list of SDValue for GPR registers keeping live input value
4018 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4019 // keeping live input value
4020 SDValue ALVal; // if applicable keeps SDValue for %al register
4021
4022 // Gather all the live in physical registers.
4023 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4024 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4025 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4026 }
4027 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4028 if (!AvailableXmms.empty()) {
4029 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4030 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4031 for (MCPhysReg Reg : AvailableXmms) {
4032        // FastRegisterAllocator spills virtual registers at basic
4033        // block boundaries, which leads to uses of XMM registers
4034        // outside of the check for %al. Pass physical registers to
4035        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4036 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4037 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4038 }
4039 }
4040
4041 // Store the integer parameter registers.
4042 SmallVector<SDValue, 8> MemOps;
4043 SDValue RSFIN =
4044 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4045 TargLowering.getPointerTy(DAG.getDataLayout()));
4046 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4047 for (SDValue Val : LiveGPRs) {
4048 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4049 TargLowering.getPointerTy(DAG.getDataLayout()),
4050 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4051 SDValue Store =
4052 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4053 MachinePointerInfo::getFixedStack(
4054 DAG.getMachineFunction(),
4055 FuncInfo->getRegSaveFrameIndex(), Offset));
4056 MemOps.push_back(Store);
4057 Offset += 8;
4058 }
4059
4060 // Now store the XMM (fp + vector) parameter registers.
4061 if (!LiveXMMRegs.empty()) {
4062 SmallVector<SDValue, 12> SaveXMMOps;
4063 SaveXMMOps.push_back(Chain);
4064 SaveXMMOps.push_back(ALVal);
4065 SaveXMMOps.push_back(RSFIN);
4066 SaveXMMOps.push_back(
4067 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4068 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4069 MachineMemOperand *StoreMMO =
4070 DAG.getMachineFunction().getMachineMemOperand(
4071 MachinePointerInfo::getFixedStack(
4072 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4073 Offset),
4074 MachineMemOperand::MOStore, 128, Align(16));
4075 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4076 DL, DAG.getVTList(MVT::Other),
4077 SaveXMMOps, MVT::i8, StoreMMO));
4078 }
4079
4080 if (!MemOps.empty())
4081 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4082 }
4083}
4084
4085void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4086 // Find the largest legal vector type.
4087 MVT VecVT = MVT::Other;
4088 // FIXME: Only some x86_32 calling conventions support AVX512.
4089 if (Subtarget.useAVX512Regs() &&
4090 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4091 CallConv == CallingConv::Intel_OCL_BI)))
4092 VecVT = MVT::v16f32;
4093 else if (Subtarget.hasAVX())
4094 VecVT = MVT::v8f32;
4095 else if (Subtarget.hasSSE2())
4096 VecVT = MVT::v4f32;
4097
4098 // We forward some GPRs and some vector types.
4099 SmallVector<MVT, 2> RegParmTypes;
4100 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4101 RegParmTypes.push_back(IntVT);
4102 if (VecVT != MVT::Other)
4103 RegParmTypes.push_back(VecVT);
4104
4105 // Compute the set of forwarded registers. The rest are scratch.
4106 SmallVectorImpl<ForwardedRegister> &Forwards =
4107 FuncInfo->getForwardedMustTailRegParms();
4108 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4109
4110 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4111 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4112 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4113 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4114 }
4115
4116 // Copy all forwards from physical to virtual registers.
4117 for (ForwardedRegister &FR : Forwards) {
4118 // FIXME: Can we use a less constrained schedule?
4119 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4120 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4121 TargLowering.getRegClassFor(FR.VT));
4122 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4123 }
4124}
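// Roughly what this amounts to (illustrative, assuming SysV x86-64 with AVX and
// no named arguments consuming registers): RegParmTypes = {i64, v8f32}, so the
// unallocated RDI/RSI/RDX/RCX/R8/R9, YMM0-YMM7 and AL are copied into fresh
// virtual registers here and re-materialized at the musttail call site.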
4125
4126void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4127 unsigned StackSize) {
4128 // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
4129  // If necessary, it will be set to the correct value later.
4130 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4131 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4132
4133 if (FrameInfo.hasVAStart())
4134 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4135
4136 if (FrameInfo.hasMustTailInVarArgFunc())
4137 forwardMustTailParameters(Chain);
4138}
4139
4140SDValue X86TargetLowering::LowerFormalArguments(
4141 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4142 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4143 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4144 MachineFunction &MF = DAG.getMachineFunction();
4145 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4146
4147 const Function &F = MF.getFunction();
4148 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4149 F.getName() == "main")
4150 FuncInfo->setForceFramePointer(true);
4151
4152 MachineFrameInfo &MFI = MF.getFrameInfo();
4153 bool Is64Bit = Subtarget.is64Bit();
4154 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4155
4156  assert(
4157      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4158      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4159
4160 // Assign locations to all of the incoming arguments.
4161 SmallVector<CCValAssign, 16> ArgLocs;
4162 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4163
4164 // Allocate shadow area for Win64.
4165 if (IsWin64)
4166 CCInfo.AllocateStack(32, Align(8));
4167
4168 CCInfo.AnalyzeArguments(Ins, CC_X86);
4169
4170 // In vectorcall calling convention a second pass is required for the HVA
4171 // types.
4172 if (CallingConv::X86_VectorCall == CallConv) {
4173 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4174 }
4175
4176  // The next loop assumes that the locations are in the same order as the
4177 // input arguments.
4178  assert(isSortedByValueNo(ArgLocs) &&
4179         "Argument Location list must be sorted before lowering");
4180
4181 SDValue ArgValue;
4182 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4183 ++I, ++InsIndex) {
4184    assert(InsIndex < Ins.size() && "Invalid Ins index");
4185 CCValAssign &VA = ArgLocs[I];
4186
4187 if (VA.isRegLoc()) {
4188 EVT RegVT = VA.getLocVT();
4189 if (VA.needsCustom()) {
4190        assert(
4191            VA.getValVT() == MVT::v64i1 &&
4192            "Currently the only custom case is when we split v64i1 to 2 regs");
4193
4194 // v64i1 values, in regcall calling convention, that are
4195 // compiled to 32 bit arch, are split up into two registers.
4196 ArgValue =
4197 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4198 } else {
4199 const TargetRegisterClass *RC;
4200 if (RegVT == MVT::i8)
4201 RC = &X86::GR8RegClass;
4202 else if (RegVT == MVT::i16)
4203 RC = &X86::GR16RegClass;
4204 else if (RegVT == MVT::i32)
4205 RC = &X86::GR32RegClass;
4206 else if (Is64Bit && RegVT == MVT::i64)
4207 RC = &X86::GR64RegClass;
4208 else if (RegVT == MVT::f16)
4209 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4210 else if (RegVT == MVT::f32)
4211 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4212 else if (RegVT == MVT::f64)
4213 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4214 else if (RegVT == MVT::f80)
4215 RC = &X86::RFP80RegClass;
4216 else if (RegVT == MVT::f128)
4217 RC = &X86::VR128RegClass;
4218 else if (RegVT.is512BitVector())
4219 RC = &X86::VR512RegClass;
4220 else if (RegVT.is256BitVector())
4221 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4222 else if (RegVT.is128BitVector())
4223 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4224 else if (RegVT == MVT::x86mmx)
4225 RC = &X86::VR64RegClass;
4226 else if (RegVT == MVT::v1i1)
4227 RC = &X86::VK1RegClass;
4228 else if (RegVT == MVT::v8i1)
4229 RC = &X86::VK8RegClass;
4230 else if (RegVT == MVT::v16i1)
4231 RC = &X86::VK16RegClass;
4232 else if (RegVT == MVT::v32i1)
4233 RC = &X86::VK32RegClass;
4234 else if (RegVT == MVT::v64i1)
4235 RC = &X86::VK64RegClass;
4236 else
4237        llvm_unreachable("Unknown argument type!");
4238
4239 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4240 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4241 }
4242
4243 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4244 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4245 // right size.
4246 if (VA.getLocInfo() == CCValAssign::SExt)
4247 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4248 DAG.getValueType(VA.getValVT()));
4249 else if (VA.getLocInfo() == CCValAssign::ZExt)
4250 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4251 DAG.getValueType(VA.getValVT()));
4252 else if (VA.getLocInfo() == CCValAssign::BCvt)
4253 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4254
4255 if (VA.isExtInLoc()) {
4256 // Handle MMX values passed in XMM regs.
4257 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4258 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4259 else if (VA.getValVT().isVector() &&
4260 VA.getValVT().getScalarType() == MVT::i1 &&
4261 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4262 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4263 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4264 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4265 } else
4266 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4267 }
4268 } else {
4269      assert(VA.isMemLoc());
4270 ArgValue =
4271 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4272 }
4273
4274    // If the value is passed via a pointer, do a load.
4275 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4276 ArgValue =
4277 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4278
4279 InVals.push_back(ArgValue);
4280 }
4281
4282 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4283 if (Ins[I].Flags.isSwiftAsync()) {
4284 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4285 if (Subtarget.is64Bit())
4286 X86FI->setHasSwiftAsyncContext(true);
4287 else {
4288 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4289 X86FI->setSwiftAsyncContextFrameIdx(FI);
4290 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4291 DAG.getFrameIndex(FI, MVT::i32),
4292 MachinePointerInfo::getFixedStack(MF, FI));
4293 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4294 }
4295 }
4296
4297 // Swift calling convention does not require we copy the sret argument
4298 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4299 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4300 continue;
4301
4302 // All x86 ABIs require that for returning structs by value we copy the
4303 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4304 // the argument into a virtual register so that we can access it from the
4305 // return points.
4306 if (Ins[I].Flags.isSRet()) {
4307      assert(!FuncInfo->getSRetReturnReg() &&
4308             "SRet return has already been set");
4309 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4310 Register Reg =
4311 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4312 FuncInfo->setSRetReturnReg(Reg);
4313 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4314 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4315 break;
4316 }
4317 }
4318
4319 unsigned StackSize = CCInfo.getNextStackOffset();
4320 // Align stack specially for tail calls.
4321 if (shouldGuaranteeTCO(CallConv,
4322 MF.getTarget().Options.GuaranteedTailCallOpt))
4323 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4324
4325 if (IsVarArg)
4326 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4327 .lowerVarArgsParameters(Chain, StackSize);
4328
4329 // Some CCs need callee pop.
4330 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4331 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4332 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4333 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4334 // X86 interrupts must pop the error code (and the alignment padding) if
4335 // present.
4336 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
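    // E.g. (illustrative breakdown): in 64-bit mode the 8-byte error code plus
    // 8 bytes of alignment padding are popped (16 total); in 32-bit mode only
    // the 4-byte error code is popped.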
4337 } else {
4338 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4339 // If this is an sret function, the return should pop the hidden pointer.
4340 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4341 FuncInfo->setBytesToPopOnReturn(4);
4342 }
4343
4344 if (!Is64Bit) {
4345 // RegSaveFrameIndex is X86-64 only.
4346 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4347 }
4348
4349 FuncInfo->setArgumentStackSize(StackSize);
4350
4351 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4352 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4353 if (Personality == EHPersonality::CoreCLR) {
4354      assert(Is64Bit);
4355 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4356 // that we'd prefer this slot be allocated towards the bottom of the frame
4357 // (i.e. near the stack pointer after allocating the frame). Every
4358 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4359 // offset from the bottom of this and each funclet's frame must be the
4360 // same, so the size of funclets' (mostly empty) frames is dictated by
4361 // how far this slot is from the bottom (since they allocate just enough
4362 // space to accommodate holding this slot at the correct offset).
4363 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4364 EHInfo->PSPSymFrameIdx = PSPSymFI;
4365 }
4366 }
4367
4368 if (shouldDisableArgRegFromCSR(CallConv) ||
4369 F.hasFnAttribute("no_caller_saved_registers")) {
4370 MachineRegisterInfo &MRI = MF.getRegInfo();
4371 for (std::pair<Register, Register> Pair : MRI.liveins())
4372 MRI.disableCalleeSavedRegister(Pair.first);
4373 }
4374
4375 return Chain;
4376}
4377
4378SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4379 SDValue Arg, const SDLoc &dl,
4380 SelectionDAG &DAG,
4381 const CCValAssign &VA,
4382 ISD::ArgFlagsTy Flags,
4383 bool isByVal) const {
4384 unsigned LocMemOffset = VA.getLocMemOffset();
4385 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4386 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4387 StackPtr, PtrOff);
4388 if (isByVal)
4389 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4390
4391 MaybeAlign Alignment;
4392 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4393 Arg.getSimpleValueType() != MVT::f80)
4394 Alignment = MaybeAlign(4);
4395 return DAG.getStore(
4396 Chain, dl, Arg, PtrOff,
4397 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4398 Alignment);
4399}
4400
4401/// Emit a load of return address if tail call
4402/// optimization is performed and it is required.
4403SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4404 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4405 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4406 // Adjust the Return address stack slot.
4407 EVT VT = getPointerTy(DAG.getDataLayout());
4408 OutRetAddr = getReturnAddressFrameIndex(DAG);
4409
4410 // Load the "old" Return address.
4411 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4412 return SDValue(OutRetAddr.getNode(), 1);
4413}
4414
4415/// Emit a store of the return address if tail call
4416/// optimization is performed and it is required (FPDiff!=0).
4417static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4418 SDValue Chain, SDValue RetAddrFrIdx,
4419 EVT PtrVT, unsigned SlotSize,
4420 int FPDiff, const SDLoc &dl) {
4421 // Store the return address to the appropriate stack slot.
4422 if (!FPDiff) return Chain;
4423 // Calculate the new stack slot for the return address.
4424 int NewReturnAddrFI =
4425 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4426 false);
4427 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4428 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4429 MachinePointerInfo::getFixedStack(
4430 DAG.getMachineFunction(), NewReturnAddrFI));
4431 return Chain;
4432}
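// For instance (illustrative numbers): with an 8-byte slot and FPDiff == 16,
// the return address read by EmitTailCallLoadRetAddr is re-stored into a fixed
// object at offset 16 - 8 == 8, where the tail-called function expects to find
// it.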
4433
4434 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4435 /// operation of the specified width.
4436static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4437 SDValue V2) {
4438 unsigned NumElems = VT.getVectorNumElements();
4439 SmallVector<int, 8> Mask;
4440 Mask.push_back(NumElems);
4441 for (unsigned i = 1; i != NumElems; ++i)
4442 Mask.push_back(i);
4443 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4444}
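// For example, with VT == MVT::v4f32 the mask built above is {4, 1, 2, 3}:
// element 0 of the result comes from V2 and elements 1-3 come from V1, which
// is exactly the movss pattern.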
4445
4446SDValue
4447X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4448 SmallVectorImpl<SDValue> &InVals) const {
4449 SelectionDAG &DAG = CLI.DAG;
4450 SDLoc &dl = CLI.DL;
4451 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4452 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4453 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4454 SDValue Chain = CLI.Chain;
4455 SDValue Callee = CLI.Callee;
4456 CallingConv::ID CallConv = CLI.CallConv;
4457 bool &isTailCall = CLI.IsTailCall;
4458 bool isVarArg = CLI.IsVarArg;
4459 const auto *CB = CLI.CB;
4460
4461 MachineFunction &MF = DAG.getMachineFunction();
4462 bool Is64Bit = Subtarget.is64Bit();
4463 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4464 bool IsSibcall = false;
4465 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4466 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4467 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4468 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4469 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4470 CB->hasFnAttr("no_caller_saved_registers"));
4471 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4472 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4473 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4474 const Module *M = MF.getMMI().getModule();
4475 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4476
4477 MachineFunction::CallSiteInfo CSInfo;
4478 if (CallConv == CallingConv::X86_INTR)
4479 report_fatal_error("X86 interrupts may not be called directly");
4480
4481 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4482 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4483 // If we are using a GOT, disable tail calls to external symbols with
4484 // default visibility. Tail calling such a symbol requires using a GOT
4485 // relocation, which forces early binding of the symbol. This breaks code
4486    // that requires lazy function symbol resolution. Using musttail or
4487 // GuaranteedTailCallOpt will override this.
4488 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4489 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4490 G->getGlobal()->hasDefaultVisibility()))
4491 isTailCall = false;
4492 }
4493
4494 if (isTailCall && !IsMustTail) {
4495 // Check if it's really possible to do a tail call.
4496 isTailCall = IsEligibleForTailCallOptimization(
4497 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4498 Ins, DAG);
4499
4500 // Sibcalls are automatically detected tailcalls which do not require
4501 // ABI changes.
4502 if (!IsGuaranteeTCO && isTailCall)
4503 IsSibcall = true;
4504
4505 if (isTailCall)
4506 ++NumTailCalls;
4507 }
4508
4509 if (IsMustTail && !isTailCall)
4510 report_fatal_error("failed to perform tail call elimination on a call "
4511 "site marked musttail");
4512
4513  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4514         "Var args not supported with calling convention fastcc, ghc or hipe");
4515
4516 // Analyze operands of the call, assigning locations to each operand.
4517 SmallVector<CCValAssign, 16> ArgLocs;
4518 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4519
4520 // Allocate shadow area for Win64.
4521 if (IsWin64)
4522 CCInfo.AllocateStack(32, Align(8));
4523
4524 CCInfo.AnalyzeArguments(Outs, CC_X86);
4525
4526 // In vectorcall calling convention a second pass is required for the HVA
4527 // types.
4528 if (CallingConv::X86_VectorCall == CallConv) {
4529 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4530 }
4531
4532 // Get a count of how many bytes are to be pushed on the stack.
4533 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4534 if (IsSibcall)
4535 // This is a sibcall. The memory operands are available in caller's
4536 // own caller's stack.
4537 NumBytes = 0;
4538 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4539 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4540
4541 int FPDiff = 0;
4542 if (isTailCall &&
4543 shouldGuaranteeTCO(CallConv,
4544 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4545 // Lower arguments at fp - stackoffset + fpdiff.
4546 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4547
4548 FPDiff = NumBytesCallerPushed - NumBytes;
4549
4550 // Set the delta of movement of the returnaddr stackslot.
4551 // But only set if delta is greater than previous delta.
4552 if (FPDiff < X86Info->getTCReturnAddrDelta())
4553 X86Info->setTCReturnAddrDelta(FPDiff);
4554 }
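    // Worked example (illustrative): if the caller itself pops 24 bytes of its
    // incoming arguments on return and this call only needs NumBytes == 8, then
    // FPDiff == 24 - 8 == 16, which is the delta applied to the tail-call
    // arguments and the return-address slot.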
4555
4556 unsigned NumBytesToPush = NumBytes;
4557 unsigned NumBytesToPop = NumBytes;
4558
4559 // If we have an inalloca argument, all stack space has already been allocated
4560  // for us and is right at the top of the stack. We don't support multiple
4561 // arguments passed in memory when using inalloca.
4562 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4563 NumBytesToPush = 0;
4564 if (!ArgLocs.back().isMemLoc())
4565 report_fatal_error("cannot use inalloca attribute on a register "
4566 "parameter");
4567 if (ArgLocs.back().getLocMemOffset() != 0)
4568 report_fatal_error("any parameter with the inalloca attribute must be "
4569 "the only memory argument");
4570 } else if (CLI.IsPreallocated) {
4571    assert(ArgLocs.back().isMemLoc() &&
4572           "cannot use preallocated attribute on a register "
4573           "parameter");
4574 SmallVector<size_t, 4> PreallocatedOffsets;
4575 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4576 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4577 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4578 }
4579 }
4580 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4581 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4582 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4583 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4584 NumBytesToPush = 0;
4585 }
4586
4587 if (!IsSibcall && !IsMustTail)
4588 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4589 NumBytes - NumBytesToPush, dl);
4590
4591 SDValue RetAddrFrIdx;
4592 // Load return address for tail calls.
4593 if (isTailCall && FPDiff)
4594 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4595 Is64Bit, FPDiff, dl);
4596
4597 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4598 SmallVector<SDValue, 8> MemOpChains;
4599 SDValue StackPtr;
4600
4601    // The next loop assumes that the locations are in the same order as the
4602 // input arguments.
4603  assert(isSortedByValueNo(ArgLocs) &&
4604         "Argument Location list must be sorted before lowering");
4605
4606 // Walk the register/memloc assignments, inserting copies/loads. In the case
4607  // of tail call optimization, arguments are handled later.
4608 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4609 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4610 ++I, ++OutIndex) {
4611    assert(OutIndex < Outs.size() && "Invalid Out index");
4612 // Skip inalloca/preallocated arguments, they have already been written.
4613 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4614 if (Flags.isInAlloca() || Flags.isPreallocated())
4615 continue;
4616
4617 CCValAssign &VA = ArgLocs[I];
4618 EVT RegVT = VA.getLocVT();
4619 SDValue Arg = OutVals[OutIndex];
4620 bool isByVal = Flags.isByVal();
4621
4622 // Promote the value if needed.
4623 switch (VA.getLocInfo()) {
4624    default: llvm_unreachable("Unknown loc info!");
4625 case CCValAssign::Full: break;
4626 case CCValAssign::SExt:
4627 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4628 break;
4629 case CCValAssign::ZExt:
4630 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4631 break;
4632 case CCValAssign::AExt:
4633 if (Arg.getValueType().isVector() &&
4634 Arg.getValueType().getVectorElementType() == MVT::i1)
4635 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4636 else if (RegVT.is128BitVector()) {
4637 // Special case: passing MMX values in XMM registers.
4638 Arg = DAG.getBitcast(MVT::i64, Arg);
4639 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4640 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4641 } else
4642 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4643 break;
4644 case CCValAssign::BCvt:
4645 Arg = DAG.getBitcast(RegVT, Arg);
4646 break;
4647 case CCValAssign::Indirect: {
4648 if (isByVal) {
4649 // Memcpy the argument to a temporary stack slot to prevent
4650 // the caller from seeing any modifications the callee may make
4651 // as guaranteed by the `byval` attribute.
4652 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4653 Flags.getByValSize(),
4654 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4655 SDValue StackSlot =
4656 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4657 Chain =
4658 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4659 // From now on treat this as a regular pointer
4660 Arg = StackSlot;
4661 isByVal = false;
4662 } else {
4663 // Store the argument.
4664 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4665 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4666 Chain = DAG.getStore(
4667 Chain, dl, Arg, SpillSlot,
4668 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4669 Arg = SpillSlot;
4670 }
4671 break;
4672 }
4673 }
4674
4675 if (VA.needsCustom()) {
4676      assert(VA.getValVT() == MVT::v64i1 &&
4677             "Currently the only custom case is when we split v64i1 to 2 regs");
4678 // Split v64i1 value into two registers
4679 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4680 } else if (VA.isRegLoc()) {
4681 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4682 const TargetOptions &Options = DAG.getTarget().Options;
4683 if (Options.EmitCallSiteInfo)
4684 CSInfo.emplace_back(VA.getLocReg(), I);
4685 if (isVarArg && IsWin64) {
4686        // The Win64 ABI requires an argument XMM reg to be copied to the
4687        // corresponding shadow reg if the callee is a varargs function.
4688 Register ShadowReg;
4689 switch (VA.getLocReg()) {
4690 case X86::XMM0: ShadowReg = X86::RCX; break;
4691 case X86::XMM1: ShadowReg = X86::RDX; break;
4692 case X86::XMM2: ShadowReg = X86::R8; break;
4693 case X86::XMM3: ShadowReg = X86::R9; break;
4694 }
4695 if (ShadowReg)
4696 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4697 }
4698 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4699      assert(VA.isMemLoc());
4700 if (!StackPtr.getNode())
4701 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4702 getPointerTy(DAG.getDataLayout()));
4703 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4704 dl, DAG, VA, Flags, isByVal));
4705 }
4706 }
4707
4708 if (!MemOpChains.empty())
4709 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4710
4711 if (Subtarget.isPICStyleGOT()) {
4712    // ELF / PIC requires the GOT pointer to be in the EBX register before
4713    // function calls made via the PLT (except for regcall).
4714 if (!isTailCall) {
4715      // An indirect call with the RegCall calling convention may use up all the
4716      // general-purpose registers, so it is not suitable to bind the EBX register
4717      // for the GOT address; just let the register allocator handle it.
4718 if (CallConv != CallingConv::X86_RegCall)
4719 RegsToPass.push_back(std::make_pair(
4720 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4721 getPointerTy(DAG.getDataLayout()))));
4722 } else {
4723 // If we are tail calling and generating PIC/GOT style code load the
4724 // address of the callee into ECX. The value in ecx is used as target of
4725 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4726 // for tail calls on PIC/GOT architectures. Normally we would just put the
4727 // address of GOT into ebx and then call target@PLT. But for tail calls
4728 // ebx would be restored (since ebx is callee saved) before jumping to the
4729 // target@PLT.
4730
4731 // Note: The actual moving to ECX is done further down.
4732 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4733 if (G && !G->getGlobal()->hasLocalLinkage() &&
4734 G->getGlobal()->hasDefaultVisibility())
4735 Callee = LowerGlobalAddress(Callee, DAG);
4736 else if (isa<ExternalSymbolSDNode>(Callee))
4737 Callee = LowerExternalSymbol(Callee, DAG);
4738 }
4739 }
4740
4741 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4742 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4743 // From AMD64 ABI document:
4744 // For calls that may call functions that use varargs or stdargs
4745 // (prototype-less calls or calls to functions containing ellipsis (...) in
4746    // the declaration) %al is used as a hidden argument to specify the number
4747    // of SSE registers used. The contents of %al do not need to match exactly
4748    // the number of registers, but must be an upper bound on the number of SSE
4749    // registers used and is in the range 0 - 8 inclusive.
4750
4751 // Count the number of XMM registers allocated.
4752 static const MCPhysReg XMMArgRegs[] = {
4753 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4754 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4755 };
4756 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4757    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4758           && "SSE registers cannot be used when SSE is disabled");
4759 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4760 DAG.getConstant(NumXMMRegs, dl,
4761 MVT::i8)));
4762 }
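  // E.g. (illustrative): a prototype-less call passing one double in XMM0 has
  // NumXMMRegs == 1, so the sequence above ends up materializing %al = 1 before
  // the call; a call that uses no SSE registers gets %al = 0.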
4763
4764 if (isVarArg && IsMustTail) {
4765 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4766 for (const auto &F : Forwards) {
4767 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4768 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4769 }
4770 }
4771
4772 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4773 // don't need this because the eligibility check rejects calls that require
4774 // shuffling arguments passed in memory.
4775 if (!IsSibcall && isTailCall) {
4776 // Force all the incoming stack arguments to be loaded from the stack
4777 // before any new outgoing arguments are stored to the stack, because the
4778 // outgoing stack slots may alias the incoming argument stack slots, and
4779 // the alias isn't otherwise explicit. This is slightly more conservative
4780 // than necessary, because it means that each store effectively depends
4781 // on every argument instead of just those arguments it would clobber.
4782 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4783
4784 SmallVector<SDValue, 8> MemOpChains2;
4785 SDValue FIN;
4786 int FI = 0;
4787 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4788 ++I, ++OutsIndex) {
4789 CCValAssign &VA = ArgLocs[I];
4790
4791 if (VA.isRegLoc()) {
4792 if (VA.needsCustom()) {
4793          assert((CallConv == CallingConv::X86_RegCall) &&
4794                 "Expecting custom case only in regcall calling convention");
4795 // This means that we are in a special case where one argument was
4796 // passed through two register locations - skip the next location.
4797 ++I;
4798 }
4799
4800 continue;
4801 }
4802
4803 assert(VA.isMemLoc());
4804 SDValue Arg = OutVals[OutsIndex];
4805 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4806 // Skip inalloca/preallocated arguments. They don't require any work.
4807 if (Flags.isInAlloca() || Flags.isPreallocated())
4808 continue;
4809 // Create frame index.
4810 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4811 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4812 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4813 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4814
4815 if (Flags.isByVal()) {
4816 // Copy relative to framepointer.
4817 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4818 if (!StackPtr.getNode())
4819 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4820 getPointerTy(DAG.getDataLayout()));
4821 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4822 StackPtr, Source);
4823
4824 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4825 ArgChain,
4826 Flags, DAG, dl));
4827 } else {
4828 // Store relative to framepointer.
4829 MemOpChains2.push_back(DAG.getStore(
4830 ArgChain, dl, Arg, FIN,
4831 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4832 }
4833 }
4834
4835 if (!MemOpChains2.empty())
4836 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4837
4838 // Store the return address to the appropriate stack slot.
4839 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4840 getPointerTy(DAG.getDataLayout()),
4841 RegInfo->getSlotSize(), FPDiff, dl);
4842 }
4843
4844 // Build a sequence of copy-to-reg nodes chained together with token chain
4845 // and glue operands which copy the outgoing args into registers.
4846 SDValue InGlue;
4847 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4848 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4849 RegsToPass[i].second, InGlue);
4850 InGlue = Chain.getValue(1);
4851 }
4852
4853 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4854 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4855 // In the 64-bit large code model, we have to make all calls
4856 // through a register, since the call instruction's 32-bit
4857 // pc-relative offset may not be large enough to hold the whole
4858 // address.
4859 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4860 Callee->getOpcode() == ISD::ExternalSymbol) {
4861 // Lower direct calls to global addresses and external symbols. Setting
4862 // ForCall to true here has the effect of removing WrapperRIP when possible
4863 // to allow direct calls to be selected without first materializing the
4864 // address into a register.
4865 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4866 } else if (Subtarget.isTarget64BitILP32() &&
4867 Callee.getValueType() == MVT::i32) {
4868 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI
4869 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4870 }
4871
4872 // Returns a chain & a glue for retval copy to use.
4873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4874 SmallVector<SDValue, 8> Ops;
4875
4876 if (!IsSibcall && isTailCall && !IsMustTail) {
4877 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4878 InGlue = Chain.getValue(1);
4879 }
4880
4881 Ops.push_back(Chain);
4882 Ops.push_back(Callee);
4883
4884 if (isTailCall)
4885 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4886
4887 // Add argument registers to the end of the list so that they are known live
4888 // into the call.
4889 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4890 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4891 RegsToPass[i].second.getValueType()));
4892
4893 // Add a register mask operand representing the call-preserved registers.
4894 const uint32_t *Mask = [&]() {
4895 auto AdaptedCC = CallConv;
4896 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4897 // use X86_INTR calling convention because it has the same CSR mask
4898 // (same preserved registers).
4899 if (HasNCSR)
4900 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4901 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4902 // to use the CSR_NoRegs_RegMask.
4903 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4904 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4905 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4906 }();
4907 assert(Mask && "Missing call preserved mask for calling convention");
4908
4909 // If this is an invoke in a 32-bit function using a funclet-based
4910 // personality, assume the function clobbers all registers. If an exception
4911 // is thrown, the runtime will not restore CSRs.
4912 // FIXME: Model this more precisely so that we can register allocate across
4913 // the normal edge and spill and fill across the exceptional edge.
4914 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4915 const Function &CallerFn = MF.getFunction();
4916 EHPersonality Pers =
4917 CallerFn.hasPersonalityFn()
4918 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4919 : EHPersonality::Unknown;
4920 if (isFuncletEHPersonality(Pers))
4921 Mask = RegInfo->getNoPreservedMask();
4922 }
4923
4924 // Define a new register mask from the existing mask.
4925 uint32_t *RegMask = nullptr;
4926
4927 // In some calling conventions we need to remove the used physical registers
4928 // from the reg mask. Create a new RegMask for such calling conventions.
4929 // RegMask for calling conventions that disable only return registers (e.g.
4930 // preserve_most) will be modified later in LowerCallResult.
4931 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4932 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4933 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4934
4935 // Allocate a new Reg Mask and copy Mask.
4936 RegMask = MF.allocateRegMask();
4937 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4938 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4939
4940 // Make sure all sub registers of the argument registers are reset
4941 // in the RegMask.
4942 if (ShouldDisableArgRegs) {
4943 for (auto const &RegPair : RegsToPass)
4944 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4945 SubRegs.isValid(); ++SubRegs)
4946 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4947 }
4948
4949 // Create the RegMask Operand according to our updated mask.
4950 Ops.push_back(DAG.getRegisterMask(RegMask));
4951 } else {
4952 // Create the RegMask Operand according to the static mask.
4953 Ops.push_back(DAG.getRegisterMask(Mask));
4954 }
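// Standalone sketch of the bit manipulation used above to drop argument
// registers from the mask: a register mask is a packed array of uint32_t in
// which a set bit R means register R is preserved across the call, so
// clearing the bit tells the register allocator that the call may clobber it.
#include <cstdint>

static void clearRegInMask(uint32_t *Mask, unsigned Reg) {
  Mask[Reg / 32] &= ~(1u << (Reg % 32)); // select the word, then the bit in it
}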
4955
4956 if (InGlue.getNode())
4957 Ops.push_back(InGlue);
4958
4959 if (isTailCall) {
4960 // We used to do:
4961 //// If this is the first return lowered for this function, add the regs
4962 //// to the liveout set for the function.
4963 // This isn't right, although it's probably harmless on x86; liveouts
4964 // should be computed from returns not tail calls. Consider a void
4965 // function making a tail call to a function returning int.
4966 MF.getFrameInfo().setHasTailCall();
4967 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4968
4969 if (IsCFICall)
4970 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4971
4972 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4973 return Ret;
4974 }
4975
4976 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4977 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4978 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4979 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4980 // expanded to the call, directly followed by a special marker sequence and
4981 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4982 assert(!isTailCall &&
4983 "tail calls cannot be marked with clang.arc.attachedcall");
4984 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4985
4986 // Add a target global address for the retainRV/claimRV runtime function
4987 // just before the call target.
4988 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4989 auto PtrVT = getPointerTy(DAG.getDataLayout());
4990 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4991 Ops.insert(Ops.begin() + 1, GA);
4992 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4993 } else {
4994 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4995 }
4996
4997 if (IsCFICall)
4998 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4999
5000 InGlue = Chain.getValue(1);
5001 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5002 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5003
5004 // Save heapallocsite metadata.
5005 if (CLI.CB)
5006 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5007 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5008
5009 // Create the CALLSEQ_END node.
5010 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5011 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5012 DAG.getTarget().Options.GuaranteedTailCallOpt))
5013 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5014 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5015 // If this call passes a struct-return pointer, the callee
5016 // pops that struct pointer.
5017 NumBytesForCalleeToPop = 4;
5018
5019 // Returns a glue for retval copy to use.
5020 if (!IsSibcall) {
5021 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5022 InGlue, dl);
5023 InGlue = Chain.getValue(1);
5024 }
5025
5026 // Handle result values, copying them out of physregs into vregs that we
5027 // return.
5028 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5029 InVals, RegMask);
5030}
5031
5032//===----------------------------------------------------------------------===//
5033// Fast Calling Convention (tail call) implementation
5034//===----------------------------------------------------------------------===//
5035
5036 // Like stdcall, the callee cleans up the arguments, except that ECX is
5037// reserved for storing the tail called function address. Only 2 registers are
5038// free for argument passing (inreg). Tail call optimization is performed
5039// provided:
5040// * tailcallopt is enabled
5041// * caller/callee are fastcc
5042// On X86_64 architecture with GOT-style position independent code only local
5043// (within module) calls are supported at the moment.
5044 // To keep the stack aligned according to the platform ABI, the function
5045 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
5046 // of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
5047// If a tail called function callee has more arguments than the caller the
5048// caller needs to make sure that there is room to move the RETADDR to. This is
5049// achieved by reserving an area the size of the argument delta right after the
5050// original RETADDR, but before the saved framepointer or the spilled registers
5051// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5052// stack layout:
5053// arg1
5054// arg2
5055// RETADDR
5056// [ new RETADDR
5057// move area ]
5058// (possible EBP)
5059// ESI
5060// EDI
5061// local1 ..
5062
5063 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
5064 /// requirement.
5065unsigned
5066X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5067 SelectionDAG &DAG) const {
5068 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5069 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5070 assert(StackSize % SlotSize == 0 &&
5071 "StackSize must be a multiple of SlotSize");
5072 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5073}
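// Self-contained sketch of the computation above (assumes the stack alignment
// is a power of two; the local alignTo mirrors llvm::alignTo): reserve one
// slot for the return address, round up to the alignment, then drop the slot
// again so the total including the pushed return address stays aligned.
static unsigned alignedArgStackSize(unsigned StackSize, unsigned SlotSize,
                                    unsigned StackAlignment) {
  auto alignTo = [](unsigned V, unsigned A) { return (V + A - 1) / A * A; };
  return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
// e.g. StackSize = 20, SlotSize = 4, StackAlignment = 16 yields 28, which is
// the "16n + 12" shape mentioned in the doc comment above.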
5074
5075/// Return true if the given stack call argument is already available in the
5076/// same position (relatively) of the caller's incoming argument stack.
5077static
5078bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5079 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5080 const X86InstrInfo *TII, const CCValAssign &VA) {
5081 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5082
5083 for (;;) {
5084 // Look through nodes that don't alter the bits of the incoming value.
5085 unsigned Op = Arg.getOpcode();
5086 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5087 Arg = Arg.getOperand(0);
5088 continue;
5089 }
5090 if (Op == ISD::TRUNCATE) {
5091 const SDValue &TruncInput = Arg.getOperand(0);
5092 if (TruncInput.getOpcode() == ISD::AssertZext &&
5093 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5094 Arg.getValueType()) {
5095 Arg = TruncInput.getOperand(0);
5096 continue;
5097 }
5098 }
5099 break;
5100 }
5101
5102 int FI = INT_MAX;
5103 if (Arg.getOpcode() == ISD::CopyFromReg) {
5104 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5105 if (!VR.isVirtual())
5106 return false;
5107 MachineInstr *Def = MRI->getVRegDef(VR);
5108 if (!Def)
5109 return false;
5110 if (!Flags.isByVal()) {
5111 if (!TII->isLoadFromStackSlot(*Def, FI))
5112 return false;
5113 } else {
5114 unsigned Opcode = Def->getOpcode();
5115 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5116 Opcode == X86::LEA64_32r) &&
5117 Def->getOperand(1).isFI()) {
5118 FI = Def->getOperand(1).getIndex();
5119 Bytes = Flags.getByValSize();
5120 } else
5121 return false;
5122 }
5123 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5124 if (Flags.isByVal())
5125 // ByVal argument is passed in as a pointer but it's now being
5126 // dereferenced. e.g.
5127 // define @foo(%struct.X* %A) {
5128 // tail call @bar(%struct.X* byval %A)
5129 // }
5130 return false;
5131 SDValue Ptr = Ld->getBasePtr();
5132 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5133 if (!FINode)
5134 return false;
5135 FI = FINode->getIndex();
5136 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5137 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5138 FI = FINode->getIndex();
5139 Bytes = Flags.getByValSize();
5140 } else
5141 return false;
5142
5143 assert(FI != INT_MAX);
5144 if (!MFI.isFixedObjectIndex(FI))
5145 return false;
5146
5147 if (Offset != MFI.getObjectOffset(FI))
5148 return false;
5149
5150 // If this is not byval, check that the argument stack object is immutable.
5151 // inalloca and argument copy elision can create mutable argument stack
5152 // objects. Byval objects can be mutated, but a byval call intends to pass the
5153 // mutated memory.
5154 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5155 return false;
5156
5157 if (VA.getLocVT().getFixedSizeInBits() >
5158 Arg.getValueSizeInBits().getFixedValue()) {
5159 // If the argument location is wider than the argument type, check that any
5160 // extension flags match.
5161 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5162 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5163 return false;
5164 }
5165 }
5166
5167 return Bytes == MFI.getObjectSize(FI);
5168}
5169
5170/// Check whether the call is eligible for tail call optimization. Targets
5171/// that want to do tail call optimization should implement this function.
5172bool X86TargetLowering::IsEligibleForTailCallOptimization(
5173 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5174 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5175 const SmallVectorImpl<SDValue> &OutVals,
5176 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5177 if (!mayTailCallThisCC(CalleeCC))
5178 return false;
5179
5180 // If -tailcallopt is specified, make fastcc functions tail-callable.
5181 MachineFunction &MF = DAG.getMachineFunction();
5182 const Function &CallerF = MF.getFunction();
5183
5184 // If the function return type is x86_fp80 and the callee return type is not,
5185 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5186 // perform a tailcall optimization here.
5187 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5188 return false;
5189
5190 CallingConv::ID CallerCC = CallerF.getCallingConv();
5191 bool CCMatch = CallerCC == CalleeCC;
5192 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5193 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5194 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5195 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5196
5197 // Win64 functions have extra shadow space for argument homing. Don't do the
5198 // sibcall if the caller and callee have mismatched expectations for this
5199 // space.
5200 if (IsCalleeWin64 != IsCallerWin64)
5201 return false;
5202
5203 if (IsGuaranteeTCO) {
5204 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5205 return true;
5206 return false;
5207 }
5208
5209 // Look for obvious safe cases to perform tail call optimization that do not
5210 // require ABI changes. This is what gcc calls sibcall.
5211
5212 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5213 // emit a special epilogue.
5214 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5215 if (RegInfo->hasStackRealignment(MF))
5216 return false;
5217
5218 // Also avoid sibcall optimization if we're an sret return fn and the callee
5219 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5220 // insufficient.
5221 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5222 // For a compatible tail call the callee must return our sret pointer. So it
5223 // needs to be (a) an sret function itself and (b) we pass our sret as its
5224 // sret. Condition #b is harder to determine.
5225 return false;
5226 } else if (IsCalleePopSRet)
5227 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5228 // expect that.
5229 return false;
5230
5231 // Do not sibcall optimize vararg calls unless all arguments are passed via
5232 // registers.
5233 LLVMContext &C = *DAG.getContext();
5234 if (isVarArg && !Outs.empty()) {
5235 // Optimizing for varargs on Win64 is unlikely to be safe without
5236 // additional testing.
5237 if (IsCalleeWin64 || IsCallerWin64)
5238 return false;
5239
5240 SmallVector<CCValAssign, 16> ArgLocs;
5241 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5242
5243 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5244 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5245 if (!ArgLocs[i].isRegLoc())
5246 return false;
5247 }
5248
5249 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5250 // stack. Therefore, if it's not used by the call it is not safe to optimize
5251 // this into a sibcall.
5252 bool Unused = false;
5253 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5254 if (!Ins[i].Used) {
5255 Unused = true;
5256 break;
5257 }
5258 }
5259 if (Unused) {
5260 SmallVector<CCValAssign, 16> RVLocs;
5261 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5262 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5263 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5264 CCValAssign &VA = RVLocs[i];
5265 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5266 return false;
5267 }
5268 }
5269
5270 // Check that the call results are passed in the same way.
5271 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5272 RetCC_X86, RetCC_X86))
5273 return false;
5274 // The callee has to preserve all registers the caller needs to preserve.
5275 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5276 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5277 if (!CCMatch) {
5278 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5279 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5280 return false;
5281 }
5282
5283 unsigned StackArgsSize = 0;
5284
5285 // If the callee takes no arguments then go on to check the results of the
5286 // call.
5287 if (!Outs.empty()) {
5288 // Check if stack adjustment is needed. For now, do not do this if any
5289 // argument is passed on the stack.
5290 SmallVector<CCValAssign, 16> ArgLocs;
5291 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5292
5293 // Allocate shadow area for Win64
5294 if (IsCalleeWin64)
5295 CCInfo.AllocateStack(32, Align(8));
5296
5297 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5298 StackArgsSize = CCInfo.getNextStackOffset();
5299
5300 if (CCInfo.getNextStackOffset()) {
5301 // Check if the arguments are already laid out in the right way as
5302 // the caller's fixed stack objects.
5303 MachineFrameInfo &MFI = MF.getFrameInfo();
5304 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5305 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5306 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5307 CCValAssign &VA = ArgLocs[i];
5308 SDValue Arg = OutVals[i];
5309 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5310 if (VA.getLocInfo() == CCValAssign::Indirect)
5311 return false;
5312 if (!VA.isRegLoc()) {
5313 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5314 MFI, MRI, TII, VA))
5315 return false;
5316 }
5317 }
5318 }
5319
5320 bool PositionIndependent = isPositionIndependent();
5321 // If the tailcall address may be in a register, then make sure it's
5322 // possible to register allocate for it. In 32-bit, the call address can
5323 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5324 // callee-saved registers are restored. These happen to be the same
5325 // registers used to pass 'inreg' arguments so watch out for those.
5326 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5327 !isa<ExternalSymbolSDNode>(Callee)) ||
5328 PositionIndependent)) {
5329 unsigned NumInRegs = 0;
5330 // In PIC we need an extra register to formulate the address computation
5331 // for the callee.
5332 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5333
5334 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5335 CCValAssign &VA = ArgLocs[i];
5336 if (!VA.isRegLoc())
5337 continue;
5338 Register Reg = VA.getLocReg();
5339 switch (Reg) {
5340 default: break;
5341 case X86::EAX: case X86::EDX: case X86::ECX:
5342 if (++NumInRegs == MaxInRegs)
5343 return false;
5344 break;
5345 }
5346 }
5347 }
5348
5349 const MachineRegisterInfo &MRI = MF.getRegInfo();
5350 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5351 return false;
5352 }
5353
5354 bool CalleeWillPop =
5355 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5356 MF.getTarget().Options.GuaranteedTailCallOpt);
5357
5358 if (unsigned BytesToPop =
5359 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5360 // If we have bytes to pop, the callee must pop them.
5361 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5362 if (!CalleePopMatches)
5363 return false;
5364 } else if (CalleeWillPop && StackArgsSize > 0) {
5365 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5366 return false;
5367 }
5368
5369 return true;
5370}
5371
5372FastISel *
5373X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5374 const TargetLibraryInfo *libInfo) const {
5375 return X86::createFastISel(funcInfo, libInfo);
5376}
5377
5378//===----------------------------------------------------------------------===//
5379// Other Lowering Hooks
5380//===----------------------------------------------------------------------===//
5381
5382bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5383 bool AssumeSingleUse) {
5384 if (!AssumeSingleUse && !Op.hasOneUse())
5385 return false;
5386 if (!ISD::isNormalLoad(Op.getNode()))
5387 return false;
5388
5389 // If this is an unaligned vector, make sure the target supports folding it.
5390 auto *Ld = cast<LoadSDNode>(Op.getNode());
5391 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5392 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5393 return false;
5394
5395 // TODO: If this is a non-temporal load and the target has an instruction
5396 // for it, it should not be folded. See "useNonTemporalLoad()".
5397
5398 return true;
5399}
5400
5401bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5402 const X86Subtarget &Subtarget,
5403 bool AssumeSingleUse) {
5404 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5405 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5406 return false;
5407
5408 // We cannot replace a wide volatile load with a broadcast-from-memory,
5409 // because that would narrow the load, which isn't legal for volatiles.
5410 auto *Ld = cast<LoadSDNode>(Op.getNode());
5411 return !Ld->isVolatile() ||
5412 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5413}
5414
5415bool X86::mayFoldIntoStore(SDValue Op) {
5416 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5417}
5418
5419bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5420 if (Op.hasOneUse()) {
5421 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5422 return (ISD::ZERO_EXTEND == Opcode);
5423 }
5424 return false;
5425}
5426
5427static bool isTargetShuffle(unsigned Opcode) {
5428 switch(Opcode) {
5429 default: return false;
5430 case X86ISD::BLENDI:
5431 case X86ISD::PSHUFB:
5432 case X86ISD::PSHUFD:
5433 case X86ISD::PSHUFHW:
5434 case X86ISD::PSHUFLW:
5435 case X86ISD::SHUFP:
5436 case X86ISD::INSERTPS:
5437 case X86ISD::EXTRQI:
5438 case X86ISD::INSERTQI:
5439 case X86ISD::VALIGN:
5440 case X86ISD::PALIGNR:
5441 case X86ISD::VSHLDQ:
5442 case X86ISD::VSRLDQ:
5443 case X86ISD::MOVLHPS:
5444 case X86ISD::MOVHLPS:
5445 case X86ISD::MOVSHDUP:
5446 case X86ISD::MOVSLDUP:
5447 case X86ISD::MOVDDUP:
5448 case X86ISD::MOVSS:
5449 case X86ISD::MOVSD:
5450 case X86ISD::MOVSH:
5451 case X86ISD::UNPCKL:
5452 case X86ISD::UNPCKH:
5453 case X86ISD::VBROADCAST:
5454 case X86ISD::VPERMILPI:
5455 case X86ISD::VPERMILPV:
5456 case X86ISD::VPERM2X128:
5457 case X86ISD::SHUF128:
5458 case X86ISD::VPERMIL2:
5459 case X86ISD::VPERMI:
5460 case X86ISD::VPPERM:
5461 case X86ISD::VPERMV:
5462 case X86ISD::VPERMV3:
5463 case X86ISD::VZEXT_MOVL:
5464 return true;
5465 }
5466}
5467
5468static bool isTargetShuffleVariableMask(unsigned Opcode) {
5469 switch (Opcode) {
5470 default: return false;
5471 // Target Shuffles.
5472 case X86ISD::PSHUFB:
5473 case X86ISD::VPERMILPV:
5474 case X86ISD::VPERMIL2:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 return true;
5479 // 'Faux' Target Shuffles.
5480 case ISD::OR:
5481 case ISD::AND:
5482 case X86ISD::ANDNP:
5483 return true;
5484 }
5485}
5486
5487SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5488 MachineFunction &MF = DAG.getMachineFunction();
5489 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5490 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5491 int ReturnAddrIndex = FuncInfo->getRAIndex();
5492
5493 if (ReturnAddrIndex == 0) {
5494 // Set up a frame object for the return address.
5495 unsigned SlotSize = RegInfo->getSlotSize();
5496 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5497 -(int64_t)SlotSize,
5498 false);
5499 FuncInfo->setRAIndex(ReturnAddrIndex);
5500 }
5501
5502 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5503}
5504
5505bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5506 bool hasSymbolicDisplacement) {
5507 // Offset should fit into 32 bit immediate field.
5508 if (!isInt<32>(Offset))
5509 return false;
5510
5511 // If we don't have a symbolic displacement - we don't have any extra
5512 // restrictions.
5513 if (!hasSymbolicDisplacement)
5514 return true;
5515
5516 // FIXME: Some tweaks might be needed for medium code model.
5517 if (M != CodeModel::Small && M != CodeModel::Kernel)
5518 return false;
5519
5520 // For the small code model we assume that the last object is 16MB before the
5521 // end of the 31-bit boundary. We may also accept pretty large negative constants
5522 // knowing that all objects are in the positive half of the address space.
5523 if (M == CodeModel::Small && Offset < 16*1024*1024)
5524 return true;
5525
5526 // For the kernel code model we know that all objects reside in the negative half
5527 // of the 32-bit address space. We may not accept negative offsets, since they may
5528 // be just off, but we may accept pretty large positive ones.
5529 if (M == CodeModel::Kernel && Offset >= 0)
5530 return true;
5531
5532 return false;
5533}
5534
5535/// Determines whether the callee is required to pop its own arguments.
5536/// Callee pop is necessary to support tail calls.
5537bool X86::isCalleePop(CallingConv::ID CallingConv,
5538 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5539 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5540 // can guarantee TCO.
5541 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5542 return true;
5543
5544 switch (CallingConv) {
5545 default:
5546 return false;
5547 case CallingConv::X86_StdCall:
5548 case CallingConv::X86_FastCall:
5549 case CallingConv::X86_ThisCall:
5550 case CallingConv::X86_VectorCall:
5551 return !is64Bit;
5552 }
5553}
5554
5555 /// Return true if the condition is a signed comparison operation.
5556static bool isX86CCSigned(unsigned X86CC) {
5557 switch (X86CC) {
5558 default:
5559 llvm_unreachable("Invalid integer condition!");
5560 case X86::COND_E:
5561 case X86::COND_NE:
5562 case X86::COND_B:
5563 case X86::COND_A:
5564 case X86::COND_BE:
5565 case X86::COND_AE:
5566 return false;
5567 case X86::COND_G:
5568 case X86::COND_GE:
5569 case X86::COND_L:
5570 case X86::COND_LE:
5571 return true;
5572 }
5573}
5574
5575static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5576 switch (SetCCOpcode) {
5577 default: llvm_unreachable("Invalid integer condition!");
5578 case ISD::SETEQ: return X86::COND_E;
5579 case ISD::SETGT: return X86::COND_G;
5580 case ISD::SETGE: return X86::COND_GE;
5581 case ISD::SETLT: return X86::COND_L;
5582 case ISD::SETLE: return X86::COND_LE;
5583 case ISD::SETNE: return X86::COND_NE;
5584 case ISD::SETULT: return X86::COND_B;
5585 case ISD::SETUGT: return X86::COND_A;
5586 case ISD::SETULE: return X86::COND_BE;
5587 case ISD::SETUGE: return X86::COND_AE;
5588 }
5589}
5590
5591 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5592/// condition code, returning the condition code and the LHS/RHS of the
5593/// comparison to make.
5594static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5595 bool isFP, SDValue &LHS, SDValue &RHS,
5596 SelectionDAG &DAG) {
5597 if (!isFP) {
5598 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5599 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5600 // X > -1 -> X == 0, jump !sign.
5601 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5602 return X86::COND_NS;
5603 }
5604 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5605 // X < 0 -> X == 0, jump on sign.
5606 return X86::COND_S;
5607 }
5608 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5609 // X >= 0 -> X == 0, jump on !sign.
5610 return X86::COND_NS;
5611 }
5612 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5613 // X < 1 -> X <= 0
5614 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5615 return X86::COND_LE;
5616 }
5617 }
5618
5619 return TranslateIntegerX86CC(SetCCOpcode);
5620 }
5621
5622 // First determine if it is required or is profitable to flip the operands.
5623
5624 // If LHS is a foldable load, but RHS is not, flip the condition.
5625 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5626 !ISD::isNON_EXTLoad(RHS.getNode())) {
5627 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5628 std::swap(LHS, RHS);
5629 }
5630
5631 switch (SetCCOpcode) {
5632 default: break;
5633 case ISD::SETOLT:
5634 case ISD::SETOLE:
5635 case ISD::SETUGT:
5636 case ISD::SETUGE:
5637 std::swap(LHS, RHS);
5638 break;
5639 }
5640
5641 // On a floating point condition, the flags are set as follows:
5642 // ZF PF CF op
5643 // 0 | 0 | 0 | X > Y
5644 // 0 | 0 | 1 | X < Y
5645 // 1 | 0 | 0 | X == Y
5646 // 1 | 1 | 1 | unordered
5647 switch (SetCCOpcode) {
5648 default: llvm_unreachable("Condcode should be pre-legalized away");
5649 case ISD::SETUEQ:
5650 case ISD::SETEQ: return X86::COND_E;
5651 case ISD::SETOLT: // flipped
5652 case ISD::SETOGT:
5653 case ISD::SETGT: return X86::COND_A;
5654 case ISD::SETOLE: // flipped
5655 case ISD::SETOGE:
5656 case ISD::SETGE: return X86::COND_AE;
5657 case ISD::SETUGT: // flipped
5658 case ISD::SETULT:
5659 case ISD::SETLT: return X86::COND_B;
5660 case ISD::SETUGE: // flipped
5661 case ISD::SETULE:
5662 case ISD::SETLE: return X86::COND_BE;
5663 case ISD::SETONE:
5664 case ISD::SETNE: return X86::COND_NE;
5665 case ISD::SETUO: return X86::COND_P;
5666 case ISD::SETO: return X86::COND_NP;
5667 case ISD::SETOEQ:
5668 case ISD::SETUNE: return X86::COND_INVALID;
5669 }
5670}
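// Worked example (standalone C++, not LLVM code) of why SETOLT sits in the
// "flipped" group above: an ordered "x < y" cannot test CF directly, because
// UCOMISS also sets CF when the operands are unordered, so the operands are
// swapped and the strictly-above condition COND_A (CF = 0 and ZF = 0) of
// "compare y, x" is used instead. The scalar logic being modelled is:
static bool orderedLessThan(float X, float Y) {
  return Y > X; // equivalent ordered form; false when either operand is NaN
}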
5671
5672/// Is there a floating point cmov for the specific X86 condition code?
5673 /// Current x86 ISA includes the following FP cmov instructions:
5674 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5675static bool hasFPCMov(unsigned X86CC) {
5676 switch (X86CC) {
5677 default:
5678 return false;
5679 case X86::COND_B:
5680 case X86::COND_BE:
5681 case X86::COND_E:
5682 case X86::COND_P:
5683 case X86::COND_A:
5684 case X86::COND_AE:
5685 case X86::COND_NE:
5686 case X86::COND_NP:
5687 return true;
5688 }
5689}
5690
5691static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5692 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5693 VT.is512BitVector();
5694}
5695
5696bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5697 const CallInst &I,
5698 MachineFunction &MF,
5699 unsigned Intrinsic) const {
5700 Info.flags = MachineMemOperand::MONone;
5701 Info.offset = 0;
5702
5703 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5704 if (!IntrData) {
5705 switch (Intrinsic) {
5706 case Intrinsic::x86_aesenc128kl:
5707 case Intrinsic::x86_aesdec128kl:
5708 Info.opc = ISD::INTRINSIC_W_CHAIN;
5709 Info.ptrVal = I.getArgOperand(1);
5710 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5711 Info.align = Align(1);
5712 Info.flags |= MachineMemOperand::MOLoad;
5713 return true;
5714 case Intrinsic::x86_aesenc256kl:
5715 case Intrinsic::x86_aesdec256kl:
5716 Info.opc = ISD::INTRINSIC_W_CHAIN;
5717 Info.ptrVal = I.getArgOperand(1);
5718 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5719 Info.align = Align(1);
5720 Info.flags |= MachineMemOperand::MOLoad;
5721 return true;
5722 case Intrinsic::x86_aesencwide128kl:
5723 case Intrinsic::x86_aesdecwide128kl:
5724 Info.opc = ISD::INTRINSIC_W_CHAIN;
5725 Info.ptrVal = I.getArgOperand(0);
5726 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5727 Info.align = Align(1);
5728 Info.flags |= MachineMemOperand::MOLoad;
5729 return true;
5730 case Intrinsic::x86_aesencwide256kl:
5731 case Intrinsic::x86_aesdecwide256kl:
5732 Info.opc = ISD::INTRINSIC_W_CHAIN;
5733 Info.ptrVal = I.getArgOperand(0);
5734 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5735 Info.align = Align(1);
5736 Info.flags |= MachineMemOperand::MOLoad;
5737 return true;
5738 case Intrinsic::x86_cmpccxadd32:
5739 case Intrinsic::x86_cmpccxadd64:
5740 case Intrinsic::x86_atomic_bts:
5741 case Intrinsic::x86_atomic_btc:
5742 case Intrinsic::x86_atomic_btr: {
5743 Info.opc = ISD::INTRINSIC_W_CHAIN;
5744 Info.ptrVal = I.getArgOperand(0);
5745 unsigned Size = I.getType()->getScalarSizeInBits();
5746 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5747 Info.align = Align(Size);
5748 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5749 MachineMemOperand::MOVolatile;
5750 return true;
5751 }
5752 case Intrinsic::x86_atomic_bts_rm:
5753 case Intrinsic::x86_atomic_btc_rm:
5754 case Intrinsic::x86_atomic_btr_rm: {
5755 Info.opc = ISD::INTRINSIC_W_CHAIN;
5756 Info.ptrVal = I.getArgOperand(0);
5757 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5758 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5759 Info.align = Align(Size);
5760 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5761 MachineMemOperand::MOVolatile;
5762 return true;
5763 }
5764 case Intrinsic::x86_aadd32:
5765 case Intrinsic::x86_aadd64:
5766 case Intrinsic::x86_aand32:
5767 case Intrinsic::x86_aand64:
5768 case Intrinsic::x86_aor32:
5769 case Intrinsic::x86_aor64:
5770 case Intrinsic::x86_axor32:
5771 case Intrinsic::x86_axor64:
5772 case Intrinsic::x86_atomic_add_cc:
5773 case Intrinsic::x86_atomic_sub_cc:
5774 case Intrinsic::x86_atomic_or_cc:
5775 case Intrinsic::x86_atomic_and_cc:
5776 case Intrinsic::x86_atomic_xor_cc: {
5777 Info.opc = ISD::INTRINSIC_W_CHAIN;
5778 Info.ptrVal = I.getArgOperand(0);
5779 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5780 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5781 Info.align = Align(Size);
5782 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5783 MachineMemOperand::MOVolatile;
5784 return true;
5785 }
5786 }
5787 return false;
5788 }
5789
5790 switch (IntrData->Type) {
5791 case TRUNCATE_TO_MEM_VI8:
5792 case TRUNCATE_TO_MEM_VI16:
5793 case TRUNCATE_TO_MEM_VI32: {
5794 Info.opc = ISD::INTRINSIC_VOID;
5795 Info.ptrVal = I.getArgOperand(0);
5796 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5797 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5798 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5799 ScalarVT = MVT::i8;
5800 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5801 ScalarVT = MVT::i16;
5802 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5803 ScalarVT = MVT::i32;
5804
5805 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5806 Info.align = Align(1);
5807 Info.flags |= MachineMemOperand::MOStore;
5808 break;
5809 }
5810 case GATHER:
5811 case GATHER_AVX2: {
5812 Info.opc = ISD::INTRINSIC_W_CHAIN;
5813 Info.ptrVal = nullptr;
5814 MVT DataVT = MVT::getVT(I.getType());
5815 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5816 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5817 IndexVT.getVectorNumElements());
5818 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5819 Info.align = Align(1);
5820 Info.flags |= MachineMemOperand::MOLoad;
5821 break;
5822 }
5823 case SCATTER: {
5824 Info.opc = ISD::INTRINSIC_VOID;
5825 Info.ptrVal = nullptr;
5826 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5827 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5828 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5829 IndexVT.getVectorNumElements());
5830 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5831 Info.align = Align(1);
5832 Info.flags |= MachineMemOperand::MOStore;
5833 break;
5834 }
5835 default:
5836 return false;
5837 }
5838
5839 return true;
5840}
5841
5842/// Returns true if the target can instruction select the
5843/// specified FP immediate natively. If false, the legalizer will
5844/// materialize the FP immediate as a load from a constant pool.
5845bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5846 bool ForCodeSize) const {
5847 for (const APFloat &FPImm : LegalFPImmediates)
5848 if (Imm.bitwiseIsEqual(FPImm))
5849 return true;
5850 return false;
5851}
5852
5853bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5854 ISD::LoadExtType ExtTy,
5855 EVT NewVT) const {
5856 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5857
5858 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5859 // relocation must target a movq or addq instruction: don't let the load shrink.
5860 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5861 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5862 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5863 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5864
5865 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
5866 // those uses are extracted directly into a store, then the extract + store
5867 // can be store-folded. Therefore, it's probably not worth splitting the load.
5868 EVT VT = Load->getValueType(0);
5869 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5870 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5871 // Skip uses of the chain value. Result 0 of the node is the load value.
5872 if (UI.getUse().getResNo() != 0)
5873 continue;
5874
5875 // If this use is not an extract + store, it's probably worth splitting.
5876 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5877 UI->use_begin()->getOpcode() != ISD::STORE)
5878 return true;
5879 }
5880 // All non-chain uses are extract + store.
5881 return false;
5882 }
5883
5884 return true;
5885}
5886
5887/// Returns true if it is beneficial to convert a load of a constant
5888/// to just the constant itself.
5889bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5890 Type *Ty) const {
5891 assert(Ty->isIntegerTy());
5892
5893 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5894 if (BitSize == 0 || BitSize > 64)
5895 return false;
5896 return true;
5897}
5898
5899bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5900 // If we are using XMM registers in the ABI and the condition of the select is
5901 // a floating-point compare and we have blendv or conditional move, then it is
5902 // cheaper to select instead of doing a cross-register move and creating a
5903 // load that depends on the compare result.
5904 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5905 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5906}
5907
5908bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5909 // TODO: It might be a win to ease or lift this restriction, but the generic
5910 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5911 if (VT.isVector() && Subtarget.hasAVX512())
5912 return false;
5913
5914 return true;
5915}
5916
5917bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5918 SDValue C) const {
5919 // TODO: We handle scalars using custom code, but generic combining could make
5920 // that unnecessary.
5921 APInt MulC;
5922 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5923 return false;
5924
5925 // Find the type this will be legalized to. Otherwise we might prematurely
5926 // convert this to shl+add/sub and then still have to type legalize those ops.
5927 // Another choice would be to defer the decision for illegal types until
5928 // after type legalization. But constant splat vectors of i64 can't make it
5929 // through type legalization on 32-bit targets so we would need to special
5930 // case vXi64.
5931 while (getTypeAction(Context, VT) != TypeLegal)
5932 VT = getTypeToTransformTo(Context, VT);
5933
5934 // If vector multiply is legal, assume that's faster than shl + add/sub.
5935 // Multiply is a complex op with higher latency and lower throughput in
5936 // most implementations; sub-vXi32 vector multiplies are always fast,
5937 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5938 // is always going to be slow.
5939 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5940 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5941 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5942 return false;
5943
5944 // shl+add, shl+sub, shl+add+neg
5945 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5946 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5947}
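// Illustrative decompositions (not LLVM code) of the constant multiplies the
// check above keeps around: a multiply by C is worth turning into shift plus
// add/sub when C - 1, C + 1, 1 - C or -(C + 1) is a power of two.
#include <cstdint>

static uint64_t mulBy5(uint64_t X) { return (X << 2) + X; }            // C - 1 == 4
static uint64_t mulBy7(uint64_t X) { return (X << 3) - X; }            // C + 1 == 8
static uint64_t mulByMinus3(uint64_t X) { return 0 - ((X << 2) - X); } // 1 - C == 4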
5948
5949bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5950 unsigned Index) const {
5951 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5952 return false;
5953
5954 // Mask vectors support all subregister combinations and operations that
5955 // extract half of a vector.
5956 if (ResVT.getVectorElementType() == MVT::i1)
5957 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5958 (Index == ResVT.getVectorNumElements()));
5959
5960 return (Index % ResVT.getVectorNumElements()) == 0;
5961}
5962
5963bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5964 unsigned Opc = VecOp.getOpcode();
5965
5966 // Assume target opcodes can't be scalarized.
5967 // TODO - do we have any exceptions?
5968 if (Opc >= ISD::BUILTIN_OP_END)
5969 return false;
5970
5971 // If the vector op is not supported, try to convert to scalar.
5972 EVT VecVT = VecOp.getValueType();
5973 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5974 return true;
5975
5976 // If the vector op is supported, but the scalar op is not, the transform may
5977 // not be worthwhile.
5978 EVT ScalarVT = VecVT.getScalarType();
5979 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5980}
5981
5982bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5983 bool) const {
5984 // TODO: Allow vectors?
5985 if (VT.isVector())
5986 return false;
5987 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5988}
5989
5990bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5991 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5992 return Subtarget.hasBMI() ||
5993 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5994}
5995
5996bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
5997 // Speculate ctlz only if we can directly use LZCNT.
5998 return Subtarget.hasLZCNT();
5999}
6000
6001bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6002 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6003 // expensive than a straight movsd. On the other hand, it's important to
6004 // shrink long double fp constants since fldt is very slow.
6005 return !Subtarget.hasSSE2() || VT == MVT::f80;
6006}
6007
6008bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6009 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6010 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6011}
6012
6013bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6014 const SelectionDAG &DAG,
6015 const MachineMemOperand &MMO) const {
6016 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6017 BitcastVT.getVectorElementType() == MVT::i1)
6018 return false;
6019
6020 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6021 return false;
6022
6023 // If both types are legal vectors, it's always ok to convert them.
6024 if (LoadVT.isVector() && BitcastVT.isVector() &&
6025 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6026 return true;
6027
6028 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6029}
6030
6031bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6032 const MachineFunction &MF) const {
6033 // Do not merge to float value size (128 bytes) if no implicit
6034 // float attribute is set.
6035 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6036
6037 if (NoFloat) {
6038 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6039 return (MemVT.getSizeInBits() <= MaxIntSize);
6040 }
6041 // Make sure we don't merge greater than our preferred vector
6042 // width.
6043 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6044 return false;
6045
6046 return true;
6047}
6048
6049bool X86TargetLowering::isCtlzFast() const {
6050 return Subtarget.hasFastLZCNT();
6051}
6052
6053bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6054 const Instruction &AndI) const {
6055 return true;
6056}
6057
6058bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6059 EVT VT = Y.getValueType();
6060
6061 if (VT.isVector())
6062 return false;
6063
6064 if (!Subtarget.hasBMI())
6065 return false;
6066
6067 // There are only 32-bit and 64-bit forms for 'andn'.
6068 if (VT != MVT::i32 && VT != MVT::i64)
6069 return false;
6070
6071 return !isa<ConstantSDNode>(Y);
6072}
6073
6074bool X86TargetLowering::hasAndNot(SDValue Y) const {
6075 EVT VT = Y.getValueType();
6076
6077 if (!VT.isVector())
6078 return hasAndNotCompare(Y);
6079
6080 // Vector.
6081
6082 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6083 return false;
6084
6085 if (VT == MVT::v4i32)
6086 return true;
6087
6088 return Subtarget.hasSSE2();
6089}
6090
6091bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6092 return X.getValueType().isScalarInteger(); // 'bt'
6093}
6094
6095bool X86TargetLowering::
6096 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6097 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6098 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6099 SelectionDAG &DAG) const {
6100 // Does baseline recommend not to perform the fold by default?
6101 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6102 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6103 return false;
6104 // For scalars this transform is always beneficial.
6105 if (X.getValueType().isScalarInteger())
6106 return true;
6107 // If all the shift amounts are identical, then transform is beneficial even
6108 // with rudimentary SSE2 shifts.
6109 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6110 return true;
6111 // If we have AVX2 with its powerful shift operations, then it's also good.
6112 if (Subtarget.hasAVX2())
6113 return true;
6114 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6115 return NewShiftOpcode == ISD::SHL;
6116}
6117
6118bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6119 return N->getOpcode() != ISD::FP_EXTEND;
6120}
6121
6122bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6123 const SDNode *N, CombineLevel Level) const {
6124 assert(((N->getOpcode() == ISD::SHL &&
6125 N->getOperand(0).getOpcode() == ISD::SRL) ||
6126 (N->getOpcode() == ISD::SRL &&
6127 N->getOperand(0).getOpcode() == ISD::SHL)) &&
6128 "Expected shift-shift mask");
6129 // TODO: Should we always create i64 masks? Or only folded immediates?
6130 EVT VT = N->getValueType(0);
6131 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6132 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6133 // Only fold if the shift values are equal - so it folds to AND.
6134 // TODO - we should fold if either is a non-uniform vector but we don't do
6135 // the fold for non-splats yet.
6136 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6137 }
6138 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6139}
6140
6141bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6142 EVT VT = Y.getValueType();
6143
6144 // For vectors, we don't have a preference, but we probably want a mask.
6145 if (VT.isVector())
6146 return false;
6147
6148 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6149 if (VT == MVT::i64 && !Subtarget.is64Bit())
6150 return false;
6151
6152 return true;
6153}
6154
6155TargetLowering::ShiftLegalizationStrategy
6156X86TargetLowering::preferredShiftLegalizationStrategy(
6157 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6158 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6159 !Subtarget.isOSWindows())
6160 return ShiftLegalizationStrategy::LowerToLibcall;
6161 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6162 ExpansionFactor);
6163}
6164
6165bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6166 // Any legal vector type can be splatted more efficiently than
6167 // loading/spilling from memory.
6168 return isTypeLegal(VT);
6169}
6170
6171MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6172 MVT VT = MVT::getIntegerVT(NumBits);
6173 if (isTypeLegal(VT))
6174 return VT;
6175
6176 // PMOVMSKB can handle this.
6177 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6178 return MVT::v16i8;
6179
6180 // VPMOVMSKB can handle this.
6181 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6182 return MVT::v32i8;
6183
6184 // TODO: Allow 64-bit type for 32-bit target.
6185 // TODO: 512-bit types should be allowed, but make sure that those
6186 // cases are handled in combineVectorSizedSetCCEquality().
6187
6188 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6189}
6190
6191/// Val is the undef sentinel value or equal to the specified value.
6192static bool isUndefOrEqual(int Val, int CmpVal) {
6193 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6194}
6195
6196/// Return true if every element in Mask is the undef sentinel value or equal to
6197/// the specified value.
6198static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6199 return llvm::all_of(Mask, [CmpVal](int M) {
6200 return (M == SM_SentinelUndef) || (M == CmpVal);
6201 });
6202}
6203
6204/// Val is either the undef or zero sentinel value.
6205static bool isUndefOrZero(int Val) {
6206 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6207}
6208
6209/// Return true if every element in Mask, beginning from position Pos and ending
6210/// in Pos+Size is the undef sentinel value.
6211static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6212 return llvm::all_of(Mask.slice(Pos, Size),
6213 [](int M) { return M == SM_SentinelUndef; });
6214}
6215
6216/// Return true if the mask creates a vector whose lower half is undefined.
6217static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6218 unsigned NumElts = Mask.size();
6219 return isUndefInRange(Mask, 0, NumElts / 2);
6220}
6221
6222/// Return true if the mask creates a vector whose upper half is undefined.
6223static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6224 unsigned NumElts = Mask.size();
6225 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6226}
6227
6228/// Return true if Val falls within the specified range [Low, Hi).
6229static bool isInRange(int Val, int Low, int Hi) {
6230 return (Val >= Low && Val < Hi);
6231}
6232
6233/// Return true if the value of any element in Mask falls within the specified
6234/// range [Low, Hi).
6235static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6236 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6237}
6238
6239/// Return true if the value of any element in Mask is the zero sentinel value.
6240static bool isAnyZero(ArrayRef<int> Mask) {
6241 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6242}
6243
6244/// Return true if the value of any element in Mask is the zero or undef
6245/// sentinel values.
6246static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6247 return llvm::any_of(Mask, [](int M) {
6248 return M == SM_SentinelZero || M == SM_SentinelUndef;
6249 });
6250}
6251
6252/// Return true if Val is undef or if its value falls within the
6253/// specified range [Low, Hi).
6254static bool isUndefOrInRange(int Val, int Low, int Hi) {
6255 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6256}
6257
6258/// Return true if every element in Mask is undef or if its value
6259/// falls within the specified range [Low, Hi).
6260static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6261 return llvm::all_of(
6262 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6263}
6264
6265/// Return true if Val is undef, zero or if its value falls within the
6266/// specified range [Low, Hi).
6267static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6268 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6269}
6270
6271/// Return true if every element in Mask is undef, zero or if its value
6272/// falls within the specified range [Low, Hi).
6273static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6274 return llvm::all_of(
6275 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6276}
6277
6278/// Return true if every element in Mask, beginning
6279/// from position Pos and ending in Pos + Size, falls within the specified
6280/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6281static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6282 unsigned Size, int Low, int Step = 1) {
6283 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6284 if (!isUndefOrEqual(Mask[i], Low))
6285 return false;
6286 return true;
6287}
6288
6289/// Return true if every element in Mask, beginning
6290/// from position Pos and ending in Pos+Size, falls within the specified
6291/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
6292static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6293 unsigned Size, int Low,
6294 int Step = 1) {
6295 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6296 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6297 return false;
6298 return true;
6299}
6300
6301/// Return true if every element in Mask, beginning
6302/// from position Pos and ending in Pos+Size is undef or is zero.
6303static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6304 unsigned Size) {
6305 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6306}
6307
6308/// Helper function to test whether a shuffle mask could be
6309/// simplified by widening the elements being shuffled.
6310///
6311/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6312/// leaves it in an unspecified state.
6313///
6314/// NOTE: This must handle normal vector shuffle masks and *target* vector
6315/// shuffle masks. The latter have the special property of a '-2' representing
6316/// a zeroed lane of a vector.
6317static bool canWidenShuffleElements(ArrayRef<int> Mask,
6318 SmallVectorImpl<int> &WidenedMask) {
6319 WidenedMask.assign(Mask.size() / 2, 0);
6320 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6321 int M0 = Mask[i];
6322 int M1 = Mask[i + 1];
6323
6324    // If both elements are undef, it's trivial.
6325 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6326 WidenedMask[i / 2] = SM_SentinelUndef;
6327 continue;
6328 }
6329
6330 // Check for an undef mask and a mask value properly aligned to fit with
6331 // a pair of values. If we find such a case, use the non-undef mask's value.
6332 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6333 WidenedMask[i / 2] = M1 / 2;
6334 continue;
6335 }
6336 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6337 WidenedMask[i / 2] = M0 / 2;
6338 continue;
6339 }
6340
6341 // When zeroing, we need to spread the zeroing across both lanes to widen.
6342 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6343 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6344 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6345 WidenedMask[i / 2] = SM_SentinelZero;
6346 continue;
6347 }
6348 return false;
6349 }
6350
6351 // Finally check if the two mask values are adjacent and aligned with
6352 // a pair.
6353 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6354 WidenedMask[i / 2] = M0 / 2;
6355 continue;
6356 }
6357
6358 // Otherwise we can't safely widen the elements used in this shuffle.
6359 return false;
6360 }
6361  assert(WidenedMask.size() == Mask.size() / 2 &&
6362         "Incorrect size of mask after widening the elements!");
6363
6364 return true;
6365}
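
// Worked illustration of the widening rules above (added for exposition; not
// part of the analyzed file), using the sentinel convention noted in the
// comment (-1 == SM_SentinelUndef, -2 == SM_SentinelZero):
//   { 0, 1, 6, 7, -1, -1, 2, 3 }    widens to { 0, 3, -1, 1 }
//   { -1, 5, 4, -1, -2, -2, 0, 1 }  widens to { 2, 2, -2, 0 }
//   A pair such as { 1, 2 } is not aligned to an even/odd element pair, so
//   canWidenShuffleElements returns false for masks containing it.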
6366
6367static bool canWidenShuffleElements(ArrayRef<int> Mask,
6368 const APInt &Zeroable,
6369 bool V2IsZero,
6370 SmallVectorImpl<int> &WidenedMask) {
6371 // Create an alternative mask with info about zeroable elements.
6372 // Here we do not set undef elements as zeroable.
6373 SmallVector<int, 64> ZeroableMask(Mask);
6374 if (V2IsZero) {
6375    assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6376 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6377 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6378 ZeroableMask[i] = SM_SentinelZero;
6379 }
6380 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6381}
6382
6383static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6384 SmallVector<int, 32> WidenedMask;
6385 return canWidenShuffleElements(Mask, WidenedMask);
6386}
6387
6388// Attempt to narrow/widen shuffle mask until it matches the target number of
6389// elements.
6390static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6391 SmallVectorImpl<int> &ScaledMask) {
6392 unsigned NumSrcElts = Mask.size();
6393  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6394         "Illegal shuffle scale factor");
6395
6396 // Narrowing is guaranteed to work.
6397 if (NumDstElts >= NumSrcElts) {
6398 int Scale = NumDstElts / NumSrcElts;
6399 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6400 return true;
6401 }
6402
6403 // We have to repeat the widening until we reach the target size, but we can
6404 // split out the first widening as it sets up ScaledMask for us.
6405 if (canWidenShuffleElements(Mask, ScaledMask)) {
6406 while (ScaledMask.size() > NumDstElts) {
6407 SmallVector<int, 16> WidenedMask;
6408 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6409 return false;
6410 ScaledMask = std::move(WidenedMask);
6411 }
6412 return true;
6413 }
6414
6415 return false;
6416}
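
// Worked illustration of scaleShuffleElements (added for exposition; not part
// of the analyzed file). For the 4-element mask { 0, 2, 1, 3 }:
//   NumDstElts == 8 narrows each lane by Scale == 2, giving
//     { 0, 1, 4, 5, 2, 3, 6, 7 }.
//   NumDstElts == 2 requires pairwise widening, but (0, 2) is not an adjacent
//     aligned pair, so the call returns false; a mask of { 0, 1, 2, 3 } would
//     widen to { 0, 1 }.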
6417
6418/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6419bool X86::isZeroNode(SDValue Elt) {
6420 return isNullConstant(Elt) || isNullFPConstant(Elt);
6421}
6422
6423// Build a vector of constants.
6424// Use an UNDEF node if MaskElt == -1.
6425// Split 64-bit constants in 32-bit mode.
6426static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6427 const SDLoc &dl, bool IsMask = false) {
6428
6429 SmallVector<SDValue, 32> Ops;
6430 bool Split = false;
6431
6432 MVT ConstVecVT = VT;
6433 unsigned NumElts = VT.getVectorNumElements();
6434 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6435 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6436 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6437 Split = true;
6438 }
6439
6440 MVT EltVT = ConstVecVT.getVectorElementType();
6441 for (unsigned i = 0; i < NumElts; ++i) {
6442 bool IsUndef = Values[i] < 0 && IsMask;
6443 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6444 DAG.getConstant(Values[i], dl, EltVT);
6445 Ops.push_back(OpNode);
6446 if (Split)
6447 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6448 DAG.getConstant(0, dl, EltVT));
6449 }
6450 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6451 if (Split)
6452 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6453 return ConstsNode;
6454}
6455
6456static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6457 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6458  assert(Bits.size() == Undefs.getBitWidth() &&
6459         "Unequal constant and undef arrays");
6460 SmallVector<SDValue, 32> Ops;
6461 bool Split = false;
6462
6463 MVT ConstVecVT = VT;
6464 unsigned NumElts = VT.getVectorNumElements();
6465 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6466 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6467 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6468 Split = true;
6469 }
6470
6471 MVT EltVT = ConstVecVT.getVectorElementType();
6472 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6473 if (Undefs[i]) {
6474 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6475 continue;
6476 }
6477 const APInt &V = Bits[i];
6478    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6479 if (Split) {
6480 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6481 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6482 } else if (EltVT == MVT::f32) {
6483 APFloat FV(APFloat::IEEEsingle(), V);
6484 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6485 } else if (EltVT == MVT::f64) {
6486 APFloat FV(APFloat::IEEEdouble(), V);
6487 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6488 } else {
6489 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6490 }
6491 }
6492
6493 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6494 return DAG.getBitcast(VT, ConstsNode);
6495}
6496
6497/// Returns a vector of specified type with all zero elements.
6498static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6499 SelectionDAG &DAG, const SDLoc &dl) {
6500  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6501          VT.getVectorElementType() == MVT::i1) &&
6502         "Unexpected vector type");
6503
6504 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6505 // type. This ensures they get CSE'd. But if the integer type is not
6506 // available, use a floating-point +0.0 instead.
6507 SDValue Vec;
6508 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6509 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6510 } else if (VT.isFloatingPoint()) {
6511 Vec = DAG.getConstantFP(+0.0, dl, VT);
6512 } else if (VT.getVectorElementType() == MVT::i1) {
6513    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6514           "Unexpected vector type");
6515 Vec = DAG.getConstant(0, dl, VT);
6516 } else {
6517 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6518 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6519 }
6520 return DAG.getBitcast(VT, Vec);
6521}
6522
6523// Helper to determine whether the ops are all extracted subvectors that come from
6524// a single source. If commuting is allowed, they don't have to be in order (Lo/Hi).
6525static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6526 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6527 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6528 LHS.getValueType() != RHS.getValueType() ||
6529 LHS.getOperand(0) != RHS.getOperand(0))
6530 return SDValue();
6531
6532 SDValue Src = LHS.getOperand(0);
6533 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6534 return SDValue();
6535
6536 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6537 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6538 RHS.getConstantOperandAPInt(1) == NumElts) ||
6539 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6540 LHS.getConstantOperandAPInt(1) == NumElts))
6541 return Src;
6542
6543 return SDValue();
6544}
6545
6546static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6547 const SDLoc &dl, unsigned vectorWidth) {
6548 EVT VT = Vec.getValueType();
6549 EVT ElVT = VT.getVectorElementType();
6550 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6551 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6552 VT.getVectorNumElements() / Factor);
6553
6554 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6555 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6556  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6557
6558  // This is the index of the first element of the vectorWidth-bit chunk we
6559  // want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6560 IdxVal &= ~(ElemsPerChunk - 1);
6561
6562 // If the input is a buildvector just emit a smaller one.
6563 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6564 return DAG.getBuildVector(ResultVT, dl,
6565 Vec->ops().slice(IdxVal, ElemsPerChunk));
6566
6567 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6568 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6569}
6570
6571/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6572/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6573/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6574/// instructions or a simple subregister reference. Idx is an index in the
6575/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6576/// lowering EXTRACT_VECTOR_ELT operations easier.
6577static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6578 SelectionDAG &DAG, const SDLoc &dl) {
6579  assert((Vec.getValueType().is256BitVector() ||
6580          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6581 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6582}
6583
6584/// Generate a DAG to grab 256-bits from a 512-bit vector.
6585static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6586 SelectionDAG &DAG, const SDLoc &dl) {
6587  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6588 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6589}
6590
6591static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6592 SelectionDAG &DAG, const SDLoc &dl,
6593 unsigned vectorWidth) {
6594  assert((vectorWidth == 128 || vectorWidth == 256) &&
6595         "Unsupported vector width");
6596  // Inserting an UNDEF subvector leaves Result unchanged.
6597 if (Vec.isUndef())
6598 return Result;
6599 EVT VT = Vec.getValueType();
6600 EVT ElVT = VT.getVectorElementType();
6601 EVT ResultVT = Result.getValueType();
6602
6603 // Insert the relevant vectorWidth bits.
6604 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6605  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6606
6607  // This is the index of the first element of the vectorWidth-bit chunk we
6608  // want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6609 IdxVal &= ~(ElemsPerChunk - 1);
6610
6611 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6613}
6614
6615/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6616/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6617/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6618/// simple superregister reference. Idx is an index in the 128 bits
6619/// we want. It need not be aligned to a 128-bit boundary. That makes
6620/// lowering INSERT_VECTOR_ELT operations easier.
6621static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6622 SelectionDAG &DAG, const SDLoc &dl) {
6623  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6624 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6625}
6626
6627/// Widen a vector to a larger size with the same scalar type, with the new
6628/// elements either zero or undef.
6629static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6630 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6631 const SDLoc &dl) {
6632  assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6633         Vec.getValueType().getScalarType() == VT.getScalarType() &&
6634         "Unsupported vector widening type");
6635 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6636 : DAG.getUNDEF(VT);
6637 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6638 DAG.getIntPtrConstant(0, dl));
6639}
6640
6641/// Widen a vector to a larger size with the same scalar type, with the new
6642/// elements either zero or undef.
6643static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6644 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6645 const SDLoc &dl, unsigned WideSizeInBits) {
6646  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6647         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6648         "Unsupported vector widening type");
6649 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6650 MVT SVT = Vec.getSimpleValueType().getScalarType();
6651 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6652 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6653}
6654
6655// Helper function to collect subvector ops that are concatenated together,
6656// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6657// The subvectors in Ops are guaranteed to be the same type.
6658static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6659 SelectionDAG &DAG) {
6660  assert(Ops.empty() && "Expected an empty ops vector");
6661
6662 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6663 Ops.append(N->op_begin(), N->op_end());
6664 return true;
6665 }
6666
6667 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6668 SDValue Src = N->getOperand(0);
6669 SDValue Sub = N->getOperand(1);
6670 const APInt &Idx = N->getConstantOperandAPInt(2);
6671 EVT VT = Src.getValueType();
6672 EVT SubVT = Sub.getValueType();
6673
6674 // TODO - Handle more general insert_subvector chains.
6675 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6676 // insert_subvector(undef, x, lo)
6677 if (Idx == 0 && Src.isUndef()) {
6678 Ops.push_back(Sub);
6679 Ops.push_back(DAG.getUNDEF(SubVT));
6680 return true;
6681 }
6682 if (Idx == (VT.getVectorNumElements() / 2)) {
6683 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6684 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6685 Src.getOperand(1).getValueType() == SubVT &&
6686 isNullConstant(Src.getOperand(2))) {
6687 Ops.push_back(Src.getOperand(1));
6688 Ops.push_back(Sub);
6689 return true;
6690 }
6691 // insert_subvector(x, extract_subvector(x, lo), hi)
6692 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6693 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6694 Ops.append(2, Sub);
6695 return true;
6696 }
6697 // insert_subvector(undef, x, hi)
6698 if (Src.isUndef()) {
6699 Ops.push_back(DAG.getUNDEF(SubVT));
6700 Ops.push_back(Sub);
6701 return true;
6702 }
6703 }
6704 }
6705 }
6706
6707 return false;
6708}
6709
6710static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6711 const SDLoc &dl) {
6712 EVT VT = Op.getValueType();
6713 unsigned NumElems = VT.getVectorNumElements();
6714 unsigned SizeInBits = VT.getSizeInBits();
6715  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6716         "Can't split odd sized vector");
6717
6718  // If this is a splat value (with no undefs), then use the lower subvector,
6719 // which should be a free extraction.
6720 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6721 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6722 return std::make_pair(Lo, Lo);
6723
6724 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6725 return std::make_pair(Lo, Hi);
6726}
6727
6728/// Break an operation into 2 half sized ops and then concatenate the results.
6729static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6730 unsigned NumOps = Op.getNumOperands();
6731 EVT VT = Op.getValueType();
6732 SDLoc dl(Op);
6733
6734  // Split each operand into Lo/Hi halves (scalar operands are passed through unsplit).
6735 SmallVector<SDValue> LoOps(NumOps, SDValue());
6736 SmallVector<SDValue> HiOps(NumOps, SDValue());
6737 for (unsigned I = 0; I != NumOps; ++I) {
6738 SDValue SrcOp = Op.getOperand(I);
6739 if (!SrcOp.getValueType().isVector()) {
6740 LoOps[I] = HiOps[I] = SrcOp;
6741 continue;
6742 }
6743 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6744 }
6745
6746 EVT LoVT, HiVT;
6747 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6748 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6749 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6750 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6751}
6752
6753/// Break a unary integer operation into 2 half sized ops and then
6754/// concatenate the result back.
6755static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6756 // Make sure we only try to split 256/512-bit types to avoid creating
6757 // narrow vectors.
6758 EVT VT = Op.getValueType();
6759 (void)VT;
6760  assert((Op.getOperand(0).getValueType().is256BitVector() ||
6761          Op.getOperand(0).getValueType().is512BitVector()) &&
6762         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6763  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6764             VT.getVectorNumElements() &&
6765         "Unexpected VTs!");
6766 return splitVectorOp(Op, DAG);
6767}
6768
6769/// Break a binary integer operation into 2 half sized ops and then
6770/// concatenate the result back.
6771static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6772 // Assert that all the types match.
6773 EVT VT = Op.getValueType();
6774 (void)VT;
6775  assert(Op.getOperand(0).getValueType() == VT &&
6776         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6777  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6778 return splitVectorOp(Op, DAG);
6779}
6780
6781// Helper for splitting the operands of an operation to the legal target size
6782// and applying a function to each part.
6783// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6784// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6785// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6786// The argument Builder is a function that will be applied on each split part:
6787// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6788template <typename F>
6789SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6790 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6791 F Builder, bool CheckBWI = true) {
6792  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6793 unsigned NumSubs = 1;
6794 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6795 (!CheckBWI && Subtarget.useAVX512Regs())) {
6796 if (VT.getSizeInBits() > 512) {
6797 NumSubs = VT.getSizeInBits() / 512;
6798      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6799 }
6800 } else if (Subtarget.hasAVX2()) {
6801 if (VT.getSizeInBits() > 256) {
6802 NumSubs = VT.getSizeInBits() / 256;
6803      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6804 }
6805 } else {
6806 if (VT.getSizeInBits() > 128) {
6807 NumSubs = VT.getSizeInBits() / 128;
6808      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6809 }
6810 }
6811
6812 if (NumSubs == 1)
6813 return Builder(DAG, DL, Ops);
6814
6815 SmallVector<SDValue, 4> Subs;
6816 for (unsigned i = 0; i != NumSubs; ++i) {
6817 SmallVector<SDValue, 2> SubOps;
6818 for (SDValue Op : Ops) {
6819 EVT OpVT = Op.getValueType();
6820 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6821 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6822 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6823 }
6824 Subs.push_back(Builder(DAG, DL, SubOps));
6825 }
6826 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6827}
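
// Hypothetical usage sketch of SplitOpsAndApply (added for exposition; the
// builder lambda and the operand names are illustrative, not taken from this
// file):
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT OpVT = Ops[0].getSimpleValueType();
//     MVT ResVT = MVT::getVectorVT(MVT::i32, OpVT.getVectorNumElements() / 2);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, ResultVT, {LHS, RHS},
//                                  PMADDWDBuilder);
// Each builder call sees operands no wider than the widest legal register size
// for the subtarget, and the partial results are concatenated back together.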
6828
6829// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6830// targets.
6831static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6832 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6833 const X86Subtarget &Subtarget) {
6834  assert(Subtarget.hasAVX512() && "AVX512 target expected");
6835 MVT SVT = VT.getScalarType();
6836
6837 // If we have a 32/64 splatted constant, splat it to DstTy to
6838 // encourage a foldable broadcast'd operand.
6839 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6840 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6841 // AVX512 broadcasts 32/64-bit operands.
6842 // TODO: Support float once getAVX512Node is used by fp-ops.
6843 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6844 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6845 return SDValue();
6846 // If we're not widening, don't bother if we're not bitcasting.
6847 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6848 return SDValue();
6849 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6850 APInt SplatValue, SplatUndef;
6851 unsigned SplatBitSize;
6852 bool HasAnyUndefs;
6853 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6854 HasAnyUndefs, OpEltSizeInBits) &&
6855 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6856 return DAG.getConstant(SplatValue, DL, DstVT);
6857 }
6858 return SDValue();
6859 };
6860
6861 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6862
6863 MVT DstVT = VT;
6864 if (Widen)
6865 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6866
6867 // Canonicalize src operands.
6868 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6869 for (SDValue &Op : SrcOps) {
6870 MVT OpVT = Op.getSimpleValueType();
6871 // Just pass through scalar operands.
6872 if (!OpVT.isVector())
6873 continue;
6874    assert(OpVT == VT && "Vector type mismatch");
6875
6876 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6877 Op = BroadcastOp;
6878 continue;
6879 }
6880
6881 // Just widen the subvector by inserting into an undef wide vector.
6882 if (Widen)
6883 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6884 }
6885
6886 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6887
6888 // Perform the 512-bit op then extract the bottom subvector.
6889 if (Widen)
6890 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6891 return Res;
6892}
6893
6894/// Insert i1-subvector to i1-vector.
6895static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897
6898 SDLoc dl(Op);
6899 SDValue Vec = Op.getOperand(0);
6900 SDValue SubVec = Op.getOperand(1);
6901 SDValue Idx = Op.getOperand(2);
6902 unsigned IdxVal = Op.getConstantOperandVal(2);
6903
6904 // Inserting undef is a nop. We can just return the original vector.
6905 if (SubVec.isUndef())
6906 return Vec;
6907
6908 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6909 return Op;
6910
6911 MVT OpVT = Op.getSimpleValueType();
6912 unsigned NumElems = OpVT.getVectorNumElements();
6913 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6914
6915 // Extend to natively supported kshift.
6916 MVT WideOpVT = OpVT;
6917 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6918 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6919
6920 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6921 // if necessary.
6922 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6923 // May need to promote to a legal type.
6924 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6925 DAG.getConstant(0, dl, WideOpVT),
6926 SubVec, Idx);
6927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6928 }
6929
6930 MVT SubVecVT = SubVec.getSimpleValueType();
6931 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6932  assert(IdxVal + SubVecNumElems <= NumElems &&
6933         IdxVal % SubVecVT.getSizeInBits() == 0 &&
6934         "Unexpected index value in INSERT_SUBVECTOR");
6935
6936 SDValue Undef = DAG.getUNDEF(WideOpVT);
6937
6938 if (IdxVal == 0) {
6939 // Zero lower bits of the Vec
6940 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6941 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6942 ZeroIdx);
6943 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6944 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6945    // Merge them together; SubVec should be zero extended.
6946 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6947 DAG.getConstant(0, dl, WideOpVT),
6948 SubVec, ZeroIdx);
6949 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6951 }
6952
6953 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6954 Undef, SubVec, ZeroIdx);
6955
6956 if (Vec.isUndef()) {
6957    assert(IdxVal != 0 && "Unexpected index");
6958 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6959 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6961 }
6962
6963 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6964    assert(IdxVal != 0 && "Unexpected index");
6965 // If upper elements of Vec are known undef, then just shift into place.
6966 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6967 [](SDValue V) { return V.isUndef(); })) {
6968 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6969 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6970 } else {
6971 NumElems = WideOpVT.getVectorNumElements();
6972 unsigned ShiftLeft = NumElems - SubVecNumElems;
6973 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6974 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6975 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6976 if (ShiftRight != 0)
6977 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6978 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6979 }
6980 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6981 }
6982
6983  // Simple case when we put the subvector in the upper part.
6984 if (IdxVal + SubVecNumElems == NumElems) {
6985 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6986 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6987 if (SubVecNumElems * 2 == NumElems) {
6988      // Special case: use a legal zero-extending insert_subvector. This allows
6989 // isel to optimize when bits are known zero.
6990 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6991 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6992 DAG.getConstant(0, dl, WideOpVT),
6993 Vec, ZeroIdx);
6994 } else {
6995 // Otherwise use explicit shifts to zero the bits.
6996 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6997 Undef, Vec, ZeroIdx);
6998 NumElems = WideOpVT.getVectorNumElements();
6999 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7000 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7001 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7002 }
7003 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7005 }
7006
7007 // Inserting into the middle is more complicated.
7008
7009 NumElems = WideOpVT.getVectorNumElements();
7010
7011 // Widen the vector if needed.
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7013
7014 unsigned ShiftLeft = NumElems - SubVecNumElems;
7015 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7016
7017  // Do an optimization for the most frequently used types.
7018 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7019 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7020 Mask0.flipAllBits();
7021 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7022 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7023 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7024 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7025 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7026 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7027 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7028 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7029
7030 // Reduce to original width if needed.
7031 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7032 }
7033
7034 // Clear the upper bits of the subvector and move it to its insert position.
7035 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7036 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7037 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7038 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7039
7040 // Isolate the bits below the insertion point.
7041 unsigned LowShift = NumElems - IdxVal;
7042 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7043 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7044 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7045 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7046
7047 // Isolate the bits after the last inserted bit.
7048 unsigned HighShift = IdxVal + SubVecNumElems;
7049 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7050 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7051 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7052 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7053
7054 // Now OR all 3 pieces together.
7055 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7056 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7057
7058 // Reduce to original width if needed.
7059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7060}
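
// Worked illustration of the final code path above (added for exposition): for
// WideOpVT == v64i1 on a 32-bit target, inserting an 8-element subvector at
// IdxVal == 16 gives ShiftLeft == 56 and ShiftRight == 40, placing SubVec in
// bit positions [16, 24). LowShift == 48 keeps Vec bits [0, 16) and
// HighShift == 24 keeps Vec bits [24, 64); ORing the three pieces produces the
// result that is then truncated back to OpVT.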
7061
7062static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7063 const SDLoc &dl) {
7064  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7065 EVT SubVT = V1.getValueType();
7066 EVT SubSVT = SubVT.getScalarType();
7067 unsigned SubNumElts = SubVT.getVectorNumElements();
7068 unsigned SubVectorWidth = SubVT.getSizeInBits();
7069 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7070 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7071 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7072}
7073
7074/// Returns a vector of specified type with all bits set.
7075/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7076/// Then bitcast to their original type, ensuring they get CSE'd.
7077static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7078  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7079         "Expected a 128/256/512-bit vector type");
7080
7081 APInt Ones = APInt::getAllOnes(32);
7082 unsigned NumElts = VT.getSizeInBits() / 32;
7083 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7084 return DAG.getBitcast(VT, Vec);
7085}
7086
7087static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7088 SDValue In, SelectionDAG &DAG) {
7089 EVT InVT = In.getValueType();
7090 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7091 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7092         ISD::ZERO_EXTEND == Opcode) &&
7093        "Unknown extension opcode");
7094
7095 // For 256-bit vectors, we only need the lower (128-bit) input half.
7096 // For 512-bit vectors, we only need the lower input half or quarter.
7097 if (InVT.getSizeInBits() > 128) {
7098 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7099        "Expected VTs to be the same size!");
7100 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7101 In = extractSubVector(In, 0, DAG, DL,
7102 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7103 InVT = In.getValueType();
7104 }
7105
7106 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7107 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7108
7109 return DAG.getNode(Opcode, DL, VT, In);
7110}
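// Example: Opcode == ISD::ZERO_EXTEND, VT == v8i32, In == v32i8: the input
// is first narrowed to its low 128 bits (v16i8), and because the element
// counts still differ (8 vs 16) the node is emitted as a
// ZERO_EXTEND_VECTOR_INREG of the v16i8 half.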
7111
7112// Match (xor X, -1) -> X.
7113// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7114// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7115static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7116 V = peekThroughBitcasts(V);
7117 if (V.getOpcode() == ISD::XOR &&
7118 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7119 isAllOnesConstant(V.getOperand(1))))
7120 return V.getOperand(0);
7121 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7122 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7123 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7124 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7125 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7126 Not, V.getOperand(1));
7127 }
7128 }
7129 SmallVector<SDValue, 2> CatOps;
7130 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7131 for (SDValue &CatOp : CatOps) {
7132 SDValue NotCat = IsNOT(CatOp, DAG);
7133 if (!NotCat) return SDValue();
7134 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7135 }
7136 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7137 }
7138 return SDValue();
7139}
7140
7141void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7142 bool Lo, bool Unary) {
7143 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7144        "Illegal vector type to unpack");
7145 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7146 int NumElts = VT.getVectorNumElements();
7147 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7148 for (int i = 0; i < NumElts; ++i) {
7149 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7150 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7151 Pos += (Unary ? 0 : NumElts * (i % 2));
7152 Pos += (Lo ? 0 : NumEltsInLane / 2);
7153 Mask.push_back(Pos);
7154 }
7155}
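// Example: VT == v8i32 with Lo == true and Unary == false produces
// <0, 8, 1, 9, 4, 12, 5, 13>, the per-128-bit-lane interleave of both
// inputs performed by VPUNPCKLDQ.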
7156
7157/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7158/// imposed by AVX and specific to the unary pattern. Example:
7159/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7160/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7161void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7162 bool Lo) {
7163 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7164 int NumElts = VT.getVectorNumElements();
7165 for (int i = 0; i < NumElts; ++i) {
7166 int Pos = i / 2;
7167 Pos += (Lo ? 0 : NumElts / 2);
7168 Mask.push_back(Pos);
7169 }
7170}
7171
7172// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7173static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7174 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7175 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7176 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7177 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7178 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7179 int M = Mask[I];
7180 if (M < 0)
7181 continue;
7182 SDValue V = (M < NumElts) ? V1 : V2;
7183 if (V.isUndef())
7184 continue;
7185 Ops[I] = V.getOperand(M % NumElts);
7186 }
7187 return DAG.getBuildVector(VT, dl, Ops);
7188 }
7189
7190 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7191}
7192
7193/// Returns a vector_shuffle node for an unpackl operation.
7194static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7195 SDValue V1, SDValue V2) {
7196 SmallVector<int, 8> Mask;
7197 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7198 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7199}
7200
7201/// Returns a vector_shuffle node for an unpackh operation.
7202static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7203 SDValue V1, SDValue V2) {
7204 SmallVector<int, 8> Mask;
7205 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7206 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7207}
7208
7209/// Returns a node that packs the LHS + RHS nodes together at half width.
7210/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7211/// TODO: Add subvector splitting if/when we have a need for it.
7212static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7213 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7214 bool PackHiHalf = false) {
7215 MVT OpVT = LHS.getSimpleValueType();
7216 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7217 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7218 assert(OpVT == RHS.getSimpleValueType() &&
7219        VT.getSizeInBits() == OpVT.getSizeInBits() &&
7220        (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7221        "Unexpected PACK operand types");
7222 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7223        "Unexpected PACK result type");
7224
7225 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7226 if (EltSizeInBits == 32) {
7227 SmallVector<int> PackMask;
7228 int Offset = PackHiHalf ? 1 : 0;
7229 int NumElts = VT.getVectorNumElements();
7230 for (int I = 0; I != NumElts; I += 4) {
7231 PackMask.push_back(I + Offset);
7232 PackMask.push_back(I + Offset + 2);
7233 PackMask.push_back(I + Offset + NumElts);
7234 PackMask.push_back(I + Offset + NumElts + 2);
7235 }
7236 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7237 DAG.getBitcast(VT, RHS), PackMask);
7238 }
7239
7240 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7241 if (!PackHiHalf) {
7242 if (UsePackUS &&
7243 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7244 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7245 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7246
7247 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7248 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7249 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7250 }
7251
7252 // Fallback to sign/zero extending the requested half and pack.
7253 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7254 if (UsePackUS) {
7255 if (PackHiHalf) {
7256 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7257 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7258 } else {
7259 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7260 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7261 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7262 }
7263 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7264 }
7265
7266 if (!PackHiHalf) {
7267 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7268 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7269 }
7270 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7271 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7272 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7273}
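// Example: VT == v8i32 (so OpVT == v4i64) with PackHiHalf == false takes the
// vXi32 shuffle path above with mask <0, 2, 8, 10, 4, 6, 12, 14>, picking
// the low i32 of each i64 from LHS and then RHS within each 128-bit lane.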
7274
7275 /// Return a vector_shuffle of the specified vector combined with a zero or undef vector.
7276/// This produces a shuffle where the low element of V2 is swizzled into the
7277/// zero/undef vector, landing at element Idx.
7278/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7279static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7280 bool IsZero,
7281 const X86Subtarget &Subtarget,
7282 SelectionDAG &DAG) {
7283 MVT VT = V2.getSimpleValueType();
7284 SDValue V1 = IsZero
7285 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7286 int NumElems = VT.getVectorNumElements();
7287 SmallVector<int, 16> MaskVec(NumElems);
7288 for (int i = 0; i != NumElems; ++i)
7289 // If this is the insertion idx, put the low elt of V2 here.
7290 MaskVec[i] = (i == Idx) ? NumElems : i;
7291 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7292}
7293
7294static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7295 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7296 Ptr.getOpcode() == X86ISD::WrapperRIP)
7297 Ptr = Ptr.getOperand(0);
7298
7299 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7300 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7301 return nullptr;
7302
7303 return CNode->getConstVal();
7304}
7305
7306static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7307 if (!Load || !ISD::isNormalLoad(Load))
7308 return nullptr;
7309 return getTargetConstantFromBasePtr(Load->getBasePtr());
7310}
7311
7312static const Constant *getTargetConstantFromNode(SDValue Op) {
7313 Op = peekThroughBitcasts(Op);
7314 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7315}
7316
7317const Constant *
7318X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7319 assert(LD && "Unexpected null LoadSDNode");
7320 return getTargetConstantFromNode(LD);
7321}
7322
7323// Extract raw constant bits from constant pools.
7324static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7325 APInt &UndefElts,
7326 SmallVectorImpl<APInt> &EltBits,
7327 bool AllowWholeUndefs = true,
7328 bool AllowPartialUndefs = true) {
7329 assert(EltBits.empty() && "Expected an empty EltBits vector");
7330
7331 Op = peekThroughBitcasts(Op);
7332
7333 EVT VT = Op.getValueType();
7334 unsigned SizeInBits = VT.getSizeInBits();
7335 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7336 unsigned NumElts = SizeInBits / EltSizeInBits;
7337
7338 // Bitcast a source array of element bits to the target size.
7339 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7340 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7341 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7342 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7343        "Constant bit sizes don't match");
7344
7345 // Don't split if we don't allow undef bits.
7346 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7347 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7348 return false;
7349
7350 // If we're already the right size, don't bother bitcasting.
7351 if (NumSrcElts == NumElts) {
7352 UndefElts = UndefSrcElts;
7353 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7354 return true;
7355 }
7356
7357 // Extract all the undef/constant element data and pack into single bitsets.
7358 APInt UndefBits(SizeInBits, 0);
7359 APInt MaskBits(SizeInBits, 0);
7360
7361 for (unsigned i = 0; i != NumSrcElts; ++i) {
7362 unsigned BitOffset = i * SrcEltSizeInBits;
7363 if (UndefSrcElts[i])
7364 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7365 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7366 }
7367
7368 // Split the undef/constant single bitset data into the target elements.
7369 UndefElts = APInt(NumElts, 0);
7370 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7371
7372 for (unsigned i = 0; i != NumElts; ++i) {
7373 unsigned BitOffset = i * EltSizeInBits;
7374 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7375
7376 // Only treat an element as UNDEF if all bits are UNDEF.
7377 if (UndefEltBits.isAllOnes()) {
7378 if (!AllowWholeUndefs)
7379 return false;
7380 UndefElts.setBit(i);
7381 continue;
7382 }
7383
7384 // If only some bits are UNDEF then treat them as zero (or bail if not
7385 // supported).
7386 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7387 return false;
7388
7389 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7390 }
7391 return true;
7392 };
7393
7394 // Collect constant bits and insert into mask/undef bit masks.
7395 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7396 unsigned UndefBitIndex) {
7397 if (!Cst)
7398 return false;
7399 if (isa<UndefValue>(Cst)) {
7400 Undefs.setBit(UndefBitIndex);
7401 return true;
7402 }
7403 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7404 Mask = CInt->getValue();
7405 return true;
7406 }
7407 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7408 Mask = CFP->getValueAPF().bitcastToAPInt();
7409 return true;
7410 }
7411 return false;
7412 };
7413
7414 // Handle UNDEFs.
7415 if (Op.isUndef()) {
7416 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7417 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7418 return CastBitData(UndefSrcElts, SrcEltBits);
7419 }
7420
7421 // Extract scalar constant bits.
7422 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7423 APInt UndefSrcElts = APInt::getZero(1);
7424 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7425 return CastBitData(UndefSrcElts, SrcEltBits);
7426 }
7427 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7428 APInt UndefSrcElts = APInt::getZero(1);
7429 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7430 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7431 return CastBitData(UndefSrcElts, SrcEltBits);
7432 }
7433
7434 // Extract constant bits from build vector.
7435 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7436 BitVector Undefs;
7437 SmallVector<APInt> SrcEltBits;
7438 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7439 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7440 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7441 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7442 if (Undefs[I])
7443 UndefSrcElts.setBit(I);
7444 return CastBitData(UndefSrcElts, SrcEltBits);
7445 }
7446 }
7447
7448 // Extract constant bits from constant pool vector.
7449 if (auto *Cst = getTargetConstantFromNode(Op)) {
7450 Type *CstTy = Cst->getType();
7451 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7452 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7453 return false;
7454
7455 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7456 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7457
7458 APInt UndefSrcElts(NumSrcElts, 0);
7459 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7460 for (unsigned i = 0; i != NumSrcElts; ++i)
7461 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7462 UndefSrcElts, i))
7463 return false;
7464
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467
7468 // Extract constant bits from a broadcasted constant pool scalar.
7469 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7470 EltSizeInBits <= VT.getScalarSizeInBits()) {
7471 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7472 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7473 return false;
7474
7475 SDValue Ptr = MemIntr->getBasePtr();
7476 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7477 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7478 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7479
7480 APInt UndefSrcElts(NumSrcElts, 0);
7481 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7482 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7483 if (UndefSrcElts[0])
7484 UndefSrcElts.setBits(0, NumSrcElts);
7485 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488 }
7489 }
7490
7491 // Extract constant bits from a subvector broadcast.
7492 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7493 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7494 SDValue Ptr = MemIntr->getBasePtr();
7495 // The source constant may be larger than the subvector broadcast, so
7496 // ensure we extract the correct subvector constants.
7497 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7498 Type *CstTy = Cst->getType();
7499 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7500 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7501 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7502 (SizeInBits % SubVecSizeInBits) != 0)
7503 return false;
7504 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7505 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7506 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7507 APInt UndefSubElts(NumSubElts, 0);
7508 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7509 APInt(CstEltSizeInBits, 0));
7510 for (unsigned i = 0; i != NumSubElts; ++i) {
7511 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7512 UndefSubElts, i))
7513 return false;
7514 for (unsigned j = 1; j != NumSubVecs; ++j)
7515 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7516 }
7517 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7518 UndefSubElts);
7519 return CastBitData(UndefSubElts, SubEltBits);
7520 }
7521 }
7522
7523 // Extract a rematerialized scalar constant insertion.
7524 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7525 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7526 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7527 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7528 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7529
7530 APInt UndefSrcElts(NumSrcElts, 0);
7531 SmallVector<APInt, 64> SrcEltBits;
7532 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7533 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7534 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7535 return CastBitData(UndefSrcElts, SrcEltBits);
7536 }
7537
7538 // Insert constant bits from a base and sub vector sources.
7539 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7540 // If we bitcast to larger elements we might lose track of undefs, so to
7541 // be safe don't allow any.
7542 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7543 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7544
7545 APInt UndefSrcElts, UndefSubElts;
7546 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7547 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7548 UndefSubElts, EltSubBits,
7549 AllowWholeUndefs && AllowUndefs,
7550 AllowPartialUndefs && AllowUndefs) &&
7551 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7552 UndefSrcElts, EltSrcBits,
7553 AllowWholeUndefs && AllowUndefs,
7554 AllowPartialUndefs && AllowUndefs)) {
7555 unsigned BaseIdx = Op.getConstantOperandVal(2);
7556 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7557 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7558 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7559 return CastBitData(UndefSrcElts, EltSrcBits);
7560 }
7561 }
7562
7563 // Extract constant bits from a subvector's source.
7564 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7565 // TODO - support extract_subvector through bitcasts.
7566 if (EltSizeInBits != VT.getScalarSizeInBits())
7567 return false;
7568
7569 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7570 UndefElts, EltBits, AllowWholeUndefs,
7571 AllowPartialUndefs)) {
7572 EVT SrcVT = Op.getOperand(0).getValueType();
7573 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7574 unsigned NumSubElts = VT.getVectorNumElements();
7575 unsigned BaseIdx = Op.getConstantOperandVal(1);
7576 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7577 if ((BaseIdx + NumSubElts) != NumSrcElts)
7578 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7579 if (BaseIdx != 0)
7580 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7581 return true;
7582 }
7583 }
7584
7585 // Extract constant bits from shuffle node sources.
7586 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7587 // TODO - support shuffle through bitcasts.
7588 if (EltSizeInBits != VT.getScalarSizeInBits())
7589 return false;
7590
7591 ArrayRef<int> Mask = SVN->getMask();
7592 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7593 llvm::any_of(Mask, [](int M) { return M < 0; }))
7594 return false;
7595
7596 APInt UndefElts0, UndefElts1;
7597 SmallVector<APInt, 32> EltBits0, EltBits1;
7598 if (isAnyInRange(Mask, 0, NumElts) &&
7599 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7600 UndefElts0, EltBits0, AllowWholeUndefs,
7601 AllowPartialUndefs))
7602 return false;
7603 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7604 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7605 UndefElts1, EltBits1, AllowWholeUndefs,
7606 AllowPartialUndefs))
7607 return false;
7608
7609 UndefElts = APInt::getZero(NumElts);
7610 for (int i = 0; i != (int)NumElts; ++i) {
7611 int M = Mask[i];
7612 if (M < 0) {
7613 UndefElts.setBit(i);
7614 EltBits.push_back(APInt::getZero(EltSizeInBits));
7615 } else if (M < (int)NumElts) {
7616 if (UndefElts0[M])
7617 UndefElts.setBit(i);
7618 EltBits.push_back(EltBits0[M]);
7619 } else {
7620 if (UndefElts1[M - NumElts])
7621 UndefElts.setBit(i);
7622 EltBits.push_back(EltBits1[M - NumElts]);
7623 }
7624 }
7625 return true;
7626 }
7627
7628 return false;
7629}
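// Example: reading a v4i32 build vector <0, 1, undef, 3> with
// EltSizeInBits == 64 yields two 64-bit elements; the undef lane covers only
// half of the second element, so with AllowPartialUndefs those bits are
// treated as zero rather than marking the element undef, and without it the
// whole query fails.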
7630
7631namespace llvm {
7632namespace X86 {
7633bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7634 APInt UndefElts;
7635 SmallVector<APInt, 16> EltBits;
7636 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7637 UndefElts, EltBits, true,
7638 AllowPartialUndefs)) {
7639 int SplatIndex = -1;
7640 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7641 if (UndefElts[i])
7642 continue;
7643 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7644 SplatIndex = -1;
7645 break;
7646 }
7647 SplatIndex = i;
7648 }
7649 if (0 <= SplatIndex) {
7650 SplatVal = EltBits[SplatIndex];
7651 return true;
7652 }
7653 }
7654
7655 return false;
7656}
7657} // namespace X86
7658} // namespace llvm
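// Example: a v4i32 constant <3, undef, 3, 3> reports SplatVal == 3, since
// whole-undef lanes are skipped when comparing elements, while <3, 1, 3, 3>
// fails because two defined lanes differ.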
7659
7660static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7661 unsigned MaskEltSizeInBits,
7662 SmallVectorImpl<uint64_t> &RawMask,
7663 APInt &UndefElts) {
7664 // Extract the raw target constant bits.
7665 SmallVector<APInt, 64> EltBits;
7666 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7667 EltBits, /* AllowWholeUndefs */ true,
7668 /* AllowPartialUndefs */ false))
7669 return false;
7670
7671 // Insert the extracted elements into the mask.
7672 for (const APInt &Elt : EltBits)
7673 RawMask.push_back(Elt.getZExtValue());
7674
7675 return true;
7676}
7677
7678/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7679/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7680/// Note: This ignores saturation, so inputs must be checked first.
7681static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7682 bool Unary, unsigned NumStages = 1) {
7683 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7684 unsigned NumElts = VT.getVectorNumElements();
7685 unsigned NumLanes = VT.getSizeInBits() / 128;
7686 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7687 unsigned Offset = Unary ? 0 : NumElts;
7688 unsigned Repetitions = 1u << (NumStages - 1);
7689 unsigned Increment = 1u << NumStages;
7690 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7691
7692 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7693 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7694 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7695 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7696 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7697 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7698 }
7699 }
7700}
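// Example: VT == v16i8 with Unary == false and NumStages == 1 gives
// <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>, i.e. the even
// bytes of LHS followed by the even bytes of RHS, matching the element
// selection of a single PACKUSWB/PACKSSWB with saturation ignored.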
7701
7702// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7703static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7704 APInt &DemandedLHS, APInt &DemandedRHS) {
7705 int NumLanes = VT.getSizeInBits() / 128;
7706 int NumElts = DemandedElts.getBitWidth();
7707 int NumInnerElts = NumElts / 2;
7708 int NumEltsPerLane = NumElts / NumLanes;
7709 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7710
7711 DemandedLHS = APInt::getZero(NumInnerElts);
7712 DemandedRHS = APInt::getZero(NumInnerElts);
7713
7714 // Map DemandedElts to the packed operands.
7715 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7716 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7717 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7718 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7719 if (DemandedElts[OuterIdx])
7720 DemandedLHS.setBit(InnerIdx);
7721 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7722 DemandedRHS.setBit(InnerIdx);
7723 }
7724 }
7725}
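// Example: for a v32i8 PACK result (two 128-bit lanes), demanded output
// byte 20 sits in lane 1 at offset 4 of the LHS half, so it sets bit 12 of
// DemandedLHS; demanded output byte 28 sets bit 12 of DemandedRHS.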
7726
7727// Split the demanded elts of a HADD/HSUB node between its operands.
7728static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7729 APInt &DemandedLHS, APInt &DemandedRHS) {
7730 int NumLanes = VT.getSizeInBits() / 128;
7731 int NumElts = DemandedElts.getBitWidth();
7732 int NumEltsPerLane = NumElts / NumLanes;
7733 int HalfEltsPerLane = NumEltsPerLane / 2;
7734
7735 DemandedLHS = APInt::getZero(NumElts);
7736 DemandedRHS = APInt::getZero(NumElts);
7737
7738 // Map DemandedElts to the horizontal operands.
7739 for (int Idx = 0; Idx != NumElts; ++Idx) {
7740 if (!DemandedElts[Idx])
7741 continue;
7742 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7743 int LocalIdx = Idx % NumEltsPerLane;
7744 if (LocalIdx < HalfEltsPerLane) {
7745 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7746 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7747 } else {
7748 LocalIdx -= HalfEltsPerLane;
7749 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7750 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7751 }
7752 }
7753}
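// Example: for a v8i32 HADD, demanded output element 5 is in lane 1 with
// local index 1 (below HalfEltsPerLane), so it demands LHS elements 6 and 7,
// matching result[5] == LHS[6] + LHS[7].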
7754
7755/// Calculates the shuffle mask corresponding to the target-specific opcode.
7756/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7757/// operands in \p Ops, and returns true.
7758/// Sets \p IsUnary to true if only one source is used. Note that this will set
7759/// IsUnary for shuffles which use a single input multiple times, and in those
7760/// cases it will adjust the mask to only have indices within that single input.
7761/// It is an error to call this with non-empty Mask/Ops vectors.
7762static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7763 SmallVectorImpl<SDValue> &Ops,
7764 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7765 unsigned NumElems = VT.getVectorNumElements();
7766 unsigned MaskEltSize = VT.getScalarSizeInBits();
7767 SmallVector<uint64_t, 32> RawMask;
7768 APInt RawUndefs;
7769 uint64_t ImmN;
7770
7771 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7772 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7773
7774 IsUnary = false;
7775 bool IsFakeUnary = false;
7776 switch (N->getOpcode()) {
7777 case X86ISD::BLENDI:
7778 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7779 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7780 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7781 DecodeBLENDMask(NumElems, ImmN, Mask);
7782 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7783 break;
7784 case X86ISD::SHUFP:
7785 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7786 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7787 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7788 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7789 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7790 break;
7791 case X86ISD::INSERTPS:
7792 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7793 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7794 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7795 DecodeINSERTPSMask(ImmN, Mask);
7796 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7797 break;
7798 case X86ISD::EXTRQI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7801 isa<ConstantSDNode>(N->getOperand(2))) {
7802 int BitLen = N->getConstantOperandVal(1);
7803 int BitIdx = N->getConstantOperandVal(2);
7804 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7805 IsUnary = true;
7806 }
7807 break;
7808 case X86ISD::INSERTQI:
7809 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7810 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7811 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7812 isa<ConstantSDNode>(N->getOperand(3))) {
7813 int BitLen = N->getConstantOperandVal(2);
7814 int BitIdx = N->getConstantOperandVal(3);
7815 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7816 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7817 }
7818 break;
7819 case X86ISD::UNPCKH:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7822 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7823 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7824 break;
7825 case X86ISD::UNPCKL:
7826 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7827 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7828 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7829 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7830 break;
7831 case X86ISD::MOVHLPS:
7832 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7833 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7834 DecodeMOVHLPSMask(NumElems, Mask);
7835 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7836 break;
7837 case X86ISD::MOVLHPS:
7838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7839 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7840 DecodeMOVLHPSMask(NumElems, Mask);
7841 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7842 break;
7843 case X86ISD::VALIGN:
7844 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7845        "Only 32-bit and 64-bit elements are supported!");
7846 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7847 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7848 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7849 DecodeVALIGNMask(NumElems, ImmN, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 Ops.push_back(N->getOperand(1));
7852 Ops.push_back(N->getOperand(0));
7853 break;
7854 case X86ISD::PALIGNR:
7855 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7856 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7857 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7858 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7859 DecodePALIGNRMask(NumElems, ImmN, Mask);
7860 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7861 Ops.push_back(N->getOperand(1));
7862 Ops.push_back(N->getOperand(0));
7863 break;
7864 case X86ISD::VSHLDQ:
7865 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7866 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7867 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7868 DecodePSLLDQMask(NumElems, ImmN, Mask);
7869 IsUnary = true;
7870 break;
7871 case X86ISD::VSRLDQ:
7872 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7873 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7874 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7875 DecodePSRLDQMask(NumElems, ImmN, Mask);
7876 IsUnary = true;
7877 break;
7878 case X86ISD::PSHUFD:
7879 case X86ISD::VPERMILPI:
7880 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7881 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7882 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7883 IsUnary = true;
7884 break;
7885 case X86ISD::PSHUFHW:
7886 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7887 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7888 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7889 IsUnary = true;
7890 break;
7891 case X86ISD::PSHUFLW:
7892 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7893 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7894 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7895 IsUnary = true;
7896 break;
7897 case X86ISD::VZEXT_MOVL:
7898 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7899 DecodeZeroMoveLowMask(NumElems, Mask);
7900 IsUnary = true;
7901 break;
7902 case X86ISD::VBROADCAST:
7903 // We only decode broadcasts of same-sized vectors; peeking through to
7904 // extracted subvectors is likely to cause hasOneUse issues with
7905 // SimplifyDemandedBits etc.
7906 if (N->getOperand(0).getValueType() == VT) {
7907 DecodeVectorBroadcast(NumElems, Mask);
7908 IsUnary = true;
7909 break;
7910 }
7911 return false;
7912 case X86ISD::VPERMILPV: {
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 IsUnary = true;
7915 SDValue MaskNode = N->getOperand(1);
7916 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7917 RawUndefs)) {
7918 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7919 break;
7920 }
7921 return false;
7922 }
7923 case X86ISD::PSHUFB: {
7924 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7925 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7926 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7927 IsUnary = true;
7928 SDValue MaskNode = N->getOperand(1);
7929 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7930 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7931 break;
7932 }
7933 return false;
7934 }
7935 case X86ISD::VPERMI:
7936 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7937 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7938 DecodeVPERMMask(NumElems, ImmN, Mask);
7939 IsUnary = true;
7940 break;
7941 case X86ISD::MOVSS:
7942 case X86ISD::MOVSD:
7943 case X86ISD::MOVSH:
7944 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7945 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7946 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7947 break;
7948 case X86ISD::VPERM2X128:
7949 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7950 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7951 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7952 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7953 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7954 break;
7955 case X86ISD::SHUF128:
7956 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7957 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7960 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7961 break;
7962 case X86ISD::MOVSLDUP:
7963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7964 DecodeMOVSLDUPMask(NumElems, Mask);
7965 IsUnary = true;
7966 break;
7967 case X86ISD::MOVSHDUP:
7968 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7969 DecodeMOVSHDUPMask(NumElems, Mask);
7970 IsUnary = true;
7971 break;
7972 case X86ISD::MOVDDUP:
7973 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7974 DecodeMOVDDUPMask(NumElems, Mask);
7975 IsUnary = true;
7976 break;
7977 case X86ISD::VPERMIL2: {
7978 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7979 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7980 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7981 SDValue MaskNode = N->getOperand(2);
7982 SDValue CtrlNode = N->getOperand(3);
7983 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7984 unsigned CtrlImm = CtrlOp->getZExtValue();
7985 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7986 RawUndefs)) {
7987 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7988 Mask);
7989 break;
7990 }
7991 }
7992 return false;
7993 }
7994 case X86ISD::VPPERM: {
7995 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7996 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7997 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7998 SDValue MaskNode = N->getOperand(2);
7999 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8000 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8001 break;
8002 }
8003 return false;
8004 }
8005 case X86ISD::VPERMV: {
8006 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8007 IsUnary = true;
8008 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8009 Ops.push_back(N->getOperand(1));
8010 SDValue MaskNode = N->getOperand(0);
8011 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8012 RawUndefs)) {
8013 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8014 break;
8015 }
8016 return false;
8017 }
8018 case X86ISD::VPERMV3: {
8019 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8020 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8021 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8022 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8023 Ops.push_back(N->getOperand(0));
8024 Ops.push_back(N->getOperand(2));
8025 SDValue MaskNode = N->getOperand(1);
8026 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8027 RawUndefs)) {
8028 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8029 break;
8030 }
8031 return false;
8032 }
8033 default: llvm_unreachable("unknown target shuffle node");
8034 }
8035
8036 // Empty mask indicates the decode failed.
8037 if (Mask.empty())
8038 return false;
8039
8040 // Check if we're getting a shuffle mask with zero'd elements.
8041 if (!AllowSentinelZero && isAnyZero(Mask))
8042 return false;
8043
8044 // If we have a fake unary shuffle, the shuffle mask is spread across two
8045 // inputs that are actually the same node. Re-map the mask to always point
8046 // into the first input.
8047 if (IsFakeUnary)
8048 for (int &M : Mask)
8049 if (M >= (int)Mask.size())
8050 M -= Mask.size();
8051
8052 // If we didn't already add operands in the opcode-specific code, default to
8053 // adding 1 or 2 operands starting at 0.
8054 if (Ops.empty()) {
8055 Ops.push_back(N->getOperand(0));
8056 if (!IsUnary || IsFakeUnary)
8057 Ops.push_back(N->getOperand(1));
8058 }
8059
8060 return true;
8061}
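// Worked example (illustrative, assuming the standard VPERM2F128/VPERM2I128
// immediate encoding): for X86ISD::VPERM2X128 on v8i32, bits [1:0] and [5:4]
// of the immediate each select one 128-bit half of the concatenated inputs,
// and bits 3/7 force the corresponding destination half to zero. With
// ImmN = 0x31, DecodeVPERM2X128Mask produces Mask = <4,5,6,7, 12,13,14,15>
// (the upper halves of both inputs); if bit 3 were also set, the low half of
// the mask would instead be four SM_SentinelZero entries.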
8062
8063 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
8064static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8065 SmallVectorImpl<SDValue> &Ops,
8066 SmallVectorImpl<int> &Mask) {
8067 bool IsUnary;
8068 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8069}
8070
8071/// Compute whether each element of a shuffle is zeroable.
8072///
8073/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8074/// Either it is an undef element in the shuffle mask, the element of the input
8075/// referenced is undef, or the element of the input referenced is known to be
8076/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8077/// as many lanes with this technique as possible to simplify the remaining
8078/// shuffle.
8079static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8080 SDValue V1, SDValue V2,
8081 APInt &KnownUndef, APInt &KnownZero) {
8082 int Size = Mask.size();
8083 KnownUndef = KnownZero = APInt::getZero(Size);
8084
8085 V1 = peekThroughBitcasts(V1);
8086 V2 = peekThroughBitcasts(V2);
8087
8088 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8089 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8090
8091 int VectorSizeInBits = V1.getValueSizeInBits();
8092 int ScalarSizeInBits = VectorSizeInBits / Size;
8093 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8094
8095 for (int i = 0; i < Size; ++i) {
8096 int M = Mask[i];
8097 // Handle the easy cases.
8098 if (M < 0) {
8099 KnownUndef.setBit(i);
8100 continue;
8101 }
8102 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8103 KnownZero.setBit(i);
8104 continue;
8105 }
8106
8107 // Determine shuffle input and normalize the mask.
8108 SDValue V = M < Size ? V1 : V2;
8109 M %= Size;
8110
8111 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8112 if (V.getOpcode() != ISD::BUILD_VECTOR)
8113 continue;
8114
8115 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8116 // portion of the (larger) source element must be UNDEF/ZERO.
8117 if ((Size % V.getNumOperands()) == 0) {
8118 int Scale = Size / V->getNumOperands();
8119 SDValue Op = V.getOperand(M / Scale);
8120 if (Op.isUndef())
8121 KnownUndef.setBit(i);
8122 if (X86::isZeroNode(Op))
8123 KnownZero.setBit(i);
8124 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8125 APInt Val = Cst->getAPIntValue();
8126 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8127 if (Val == 0)
8128 KnownZero.setBit(i);
8129 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8130 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8131 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8132 if (Val == 0)
8133 KnownZero.setBit(i);
8134 }
8135 continue;
8136 }
8137
8138 // If the BUILD_VECTOR has more elements than the mask, then all of the
8139 // (smaller) source elements must be UNDEF or ZERO.
8140 if ((V.getNumOperands() % Size) == 0) {
8141 int Scale = V->getNumOperands() / Size;
8142 bool AllUndef = true;
8143 bool AllZero = true;
8144 for (int j = 0; j < Scale; ++j) {
8145 SDValue Op = V.getOperand((M * Scale) + j);
8146 AllUndef &= Op.isUndef();
8147 AllZero &= X86::isZeroNode(Op);
8148 }
8149 if (AllUndef)
8150 KnownUndef.setBit(i);
8151 if (AllZero)
8152 KnownZero.setBit(i);
8153 continue;
8154 }
8155 }
8156}
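// Worked example (illustrative): for Mask = <0,4,2,7> with
// V1 = build_vector <i32 1, i32 0, i32 undef, i32 3> and V2 = all-zeros,
// elements 1 and 3 reference the all-zeros V2 (KnownZero bits 1 and 3),
// element 2 references the undef operand of V1 (KnownUndef bit 2), and
// element 0 references the non-zero constant 1 so neither bit is set. The
// result is KnownUndef = 0b0100 and KnownZero = 0b1010.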
8157
8158/// Decode a target shuffle mask and inputs and see if any values are
8159/// known to be undef or zero from their inputs.
8160/// Returns true if the target shuffle mask was decoded.
8161/// FIXME: Merge this with computeZeroableShuffleElements?
8162static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8163 SmallVectorImpl<SDValue> &Ops,
8164 APInt &KnownUndef, APInt &KnownZero) {
8165 bool IsUnary;
8166 if (!isTargetShuffle(N.getOpcode()))
8167 return false;
8168
8169 MVT VT = N.getSimpleValueType();
8170 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8171 return false;
8172
8173 int Size = Mask.size();
8174 SDValue V1 = Ops[0];
8175 SDValue V2 = IsUnary ? V1 : Ops[1];
8176 KnownUndef = KnownZero = APInt::getZero(Size);
8177
8178 V1 = peekThroughBitcasts(V1);
8179 V2 = peekThroughBitcasts(V2);
8180
8181 assert((VT.getSizeInBits() % Size) == 0 &&
8182 "Illegal split of shuffle value type");
8183 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8184
8185 // Extract known constant input data.
8186 APInt UndefSrcElts[2];
8187 SmallVector<APInt, 32> SrcEltBits[2];
8188 bool IsSrcConstant[2] = {
8189 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8190 SrcEltBits[0], true, false),
8191 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8192 SrcEltBits[1], true, false)};
8193
8194 for (int i = 0; i < Size; ++i) {
8195 int M = Mask[i];
8196
8197 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8198 if (M < 0) {
8199 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8200 if (SM_SentinelUndef == M)
8201 KnownUndef.setBit(i);
8202 if (SM_SentinelZero == M)
8203 KnownZero.setBit(i);
8204 continue;
8205 }
8206
8207 // Determine shuffle input and normalize the mask.
8208 unsigned SrcIdx = M / Size;
8209 SDValue V = M < Size ? V1 : V2;
8210 M %= Size;
8211
8212 // We are referencing an UNDEF input.
8213 if (V.isUndef()) {
8214 KnownUndef.setBit(i);
8215 continue;
8216 }
8217
8218 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8219 // TODO: We currently only set UNDEF for integer types - floats use the same
8220 // registers as vectors and many of the scalar folded loads rely on the
8221 // SCALAR_TO_VECTOR pattern.
8222 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8223 (Size % V.getValueType().getVectorNumElements()) == 0) {
8224 int Scale = Size / V.getValueType().getVectorNumElements();
8225 int Idx = M / Scale;
8226 if (Idx != 0 && !VT.isFloatingPoint())
8227 KnownUndef.setBit(i);
8228 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8229 KnownZero.setBit(i);
8230 continue;
8231 }
8232
8233 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8234 // base vectors.
8235 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8236 SDValue Vec = V.getOperand(0);
8237 int NumVecElts = Vec.getValueType().getVectorNumElements();
8238 if (Vec.isUndef() && Size == NumVecElts) {
8239 int Idx = V.getConstantOperandVal(2);
8240 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8241 if (M < Idx || (Idx + NumSubElts) <= M)
8242 KnownUndef.setBit(i);
8243 }
8244 continue;
8245 }
8246
8247 // Attempt to extract from the source's constant bits.
8248 if (IsSrcConstant[SrcIdx]) {
8249 if (UndefSrcElts[SrcIdx][M])
8250 KnownUndef.setBit(i);
8251 else if (SrcEltBits[SrcIdx][M] == 0)
8252 KnownZero.setBit(i);
8253 }
8254 }
8255
8256 assert(VT.getVectorNumElements() == (unsigned)Size &&
8257 "Different mask size from vector size!");
8258 return true;
8259}
8260
8261// Replace target shuffle mask elements with known undef/zero sentinels.
8262static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8263 const APInt &KnownUndef,
8264 const APInt &KnownZero,
8265 bool ResolveKnownZeros = true) {
8266 unsigned NumElts = Mask.size();
8267 assert(KnownUndef.getBitWidth() == NumElts &&
8268 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8269
8270 for (unsigned i = 0; i != NumElts; ++i) {
8271 if (KnownUndef[i])
8272 Mask[i] = SM_SentinelUndef;
8273 else if (ResolveKnownZeros && KnownZero[i])
8274 Mask[i] = SM_SentinelZero;
8275 }
8276}
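// e.g. (illustrative) Mask = <0,1,2,3> with KnownUndef = 0b0010 and
// KnownZero = 0b1000 becomes <0, SM_SentinelUndef, 2, SM_SentinelZero>;
// resolveZeroablesFromTargetShuffle below performs the inverse mapping.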
8277
8278// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8279static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8280 APInt &KnownUndef,
8281 APInt &KnownZero) {
8282 unsigned NumElts = Mask.size();
8283 KnownUndef = KnownZero = APInt::getZero(NumElts);
8284
8285 for (unsigned i = 0; i != NumElts; ++i) {
8286 int M = Mask[i];
8287 if (SM_SentinelUndef == M)
8288 KnownUndef.setBit(i);
8289 if (SM_SentinelZero == M)
8290 KnownZero.setBit(i);
8291 }
8292}
8293
8294// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8295static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8296 SDValue Cond, bool IsBLENDV = false) {
8297 EVT CondVT = Cond.getValueType();
8298 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8299 unsigned NumElts = CondVT.getVectorNumElements();
8300
8301 APInt UndefElts;
8302 SmallVector<APInt, 32> EltBits;
8303 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8304 true, false))
8305 return false;
8306
8307 Mask.resize(NumElts, SM_SentinelUndef);
8308
8309 for (int i = 0; i != (int)NumElts; ++i) {
8310 Mask[i] = i;
8311 // Arbitrarily choose from the 2nd operand if the select condition element
8312 // is undef.
8313 // TODO: Can we do better by matching patterns such as even/odd?
8314 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8315 (IsBLENDV && EltBits[i].isNonNegative()))
8316 Mask[i] += NumElts;
8317 }
8318
8319 return true;
8320}
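// Worked example (illustrative): for a VSELECT whose v4i32 condition is the
// constant <-1, 0, -1, 0>, the mask becomes <0, 5, 2, 7>; lanes with a true
// (non-zero) condition take elements from the first value operand and the
// rest take the corresponding element of the second. For BLENDV only the
// sign bit of each condition element is tested, so a non-negative element
// likewise selects from the second operand.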
8321
8322// Forward declaration (for getFauxShuffleMask recursive check).
8323static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8324 SmallVectorImpl<SDValue> &Inputs,
8325 SmallVectorImpl<int> &Mask,
8326 const SelectionDAG &DAG, unsigned Depth,
8327 bool ResolveKnownElts);
8328
8329// Attempt to decode ops that could be represented as a shuffle mask.
8330// The decoded shuffle mask may contain a different number of elements to the
8331// destination value type.
8332// TODO: Merge into getTargetShuffleInputs()
8333static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8334 SmallVectorImpl<int> &Mask,
8335 SmallVectorImpl<SDValue> &Ops,
8336 const SelectionDAG &DAG, unsigned Depth,
8337 bool ResolveKnownElts) {
8338 Mask.clear();
8339 Ops.clear();
8340
8341 MVT VT = N.getSimpleValueType();
8342 unsigned NumElts = VT.getVectorNumElements();
8343 unsigned NumSizeInBits = VT.getSizeInBits();
8344 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8345 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8346 return false;
8347 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8348 unsigned NumSizeInBytes = NumSizeInBits / 8;
8349 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8350
8351 unsigned Opcode = N.getOpcode();
8352 switch (Opcode) {
8353 case ISD::VECTOR_SHUFFLE: {
8354 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8355 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8356 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8357 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8358 Ops.push_back(N.getOperand(0));
8359 Ops.push_back(N.getOperand(1));
8360 return true;
8361 }
8362 return false;
8363 }
8364 case ISD::AND:
8365 case X86ISD::ANDNP: {
8366 // Attempt to decode as a per-byte mask.
8367 APInt UndefElts;
8368 SmallVector<APInt, 32> EltBits;
8369 SDValue N0 = N.getOperand(0);
8370 SDValue N1 = N.getOperand(1);
8371 bool IsAndN = (X86ISD::ANDNP == Opcode);
8372 uint64_t ZeroMask = IsAndN ? 255 : 0;
8373 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8374 return false;
8375 // We can't assume an undef src element gives an undef dst - the other src
8376 // might be zero.
8377 if (!UndefElts.isZero())
8378 return false;
8379 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8380 const APInt &ByteBits = EltBits[i];
8381 if (ByteBits != 0 && ByteBits != 255)
8382 return false;
8383 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8384 }
8385 Ops.push_back(IsAndN ? N1 : N0);
8386 return true;
8387 }
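// e.g. (illustrative) AND of a v16i8 value with the constant
// <255,0,255,0,...> decodes to Mask = <0, SM_SentinelZero, 2, SM_SentinelZero,
// ...>: bytes masked with 0xFF pass through as an identity shuffle and bytes
// masked with 0x00 become known zero. For ANDNP the roles are inverted,
// since the constant operand examined here is complemented before the AND.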
8388 case ISD::OR: {
8389 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8390 // is a valid shuffle index.
8391 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8392 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8393 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8394 return false;
8395
8396 SmallVector<int, 64> SrcMask0, SrcMask1;
8397 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8398 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8399 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8400 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8401 Depth + 1, true) ||
8402 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8403 Depth + 1, true))
8404 return false;
8405
8406 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8407 SmallVector<int, 64> Mask0, Mask1;
8408 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8409 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8410 for (int i = 0; i != (int)MaskSize; ++i) {
8411 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8412 // loops converting between OR and BLEND shuffles due to
8413 // canWidenShuffleElements merging away undef elements, meaning we
8414 // fail to recognise the OR as the undef element isn't known zero.
8415 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8416 Mask.push_back(SM_SentinelZero);
8417 else if (Mask1[i] == SM_SentinelZero)
8418 Mask.push_back(i);
8419 else if (Mask0[i] == SM_SentinelZero)
8420 Mask.push_back(i + MaskSize);
8421 else
8422 return false;
8423 }
8424 Ops.push_back(N0);
8425 Ops.push_back(N1);
8426 return true;
8427 }
8428 case ISD::INSERT_SUBVECTOR: {
8429 SDValue Src = N.getOperand(0);
8430 SDValue Sub = N.getOperand(1);
8431 EVT SubVT = Sub.getValueType();
8432 unsigned NumSubElts = SubVT.getVectorNumElements();
8433 if (!N->isOnlyUserOf(Sub.getNode()))
8434 return false;
8435 uint64_t InsertIdx = N.getConstantOperandVal(2);
8436 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8437 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8438 Sub.getOperand(0).getValueType() == VT) {
8439 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8440 for (int i = 0; i != (int)NumElts; ++i)
8441 Mask.push_back(i);
8442 for (int i = 0; i != (int)NumSubElts; ++i)
8443 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8444 Ops.push_back(Src);
8445 Ops.push_back(Sub.getOperand(0));
8446 return true;
8447 }
8448 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8449 SmallVector<int, 64> SubMask;
8450 SmallVector<SDValue, 2> SubInputs;
8451 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8452 EVT SubSrcVT = SubSrc.getValueType();
8453 if (!SubSrcVT.isVector())
8454 return false;
8455
8456 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8457 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8458 Depth + 1, ResolveKnownElts))
8459 return false;
8460
8461 // Subvector shuffle inputs must not be larger than the subvector.
8462 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8463 return SubVT.getFixedSizeInBits() <
8464 SubInput.getValueSizeInBits().getFixedValue();
8465 }))
8466 return false;
8467
8468 if (SubMask.size() != NumSubElts) {
8469 assert(((SubMask.size() % NumSubElts) == 0 ||
8470 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8471 if ((NumSubElts % SubMask.size()) == 0) {
8472 int Scale = NumSubElts / SubMask.size();
8473 SmallVector<int, 64> ScaledSubMask;
8474 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8475 SubMask = ScaledSubMask;
8476 } else {
8477 int Scale = SubMask.size() / NumSubElts;
8478 NumSubElts = SubMask.size();
8479 NumElts *= Scale;
8480 InsertIdx *= Scale;
8481 }
8482 }
8483 Ops.push_back(Src);
8484 Ops.append(SubInputs.begin(), SubInputs.end());
8485 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8486 Mask.append(NumElts, SM_SentinelZero);
8487 else
8488 for (int i = 0; i != (int)NumElts; ++i)
8489 Mask.push_back(i);
8490 for (int i = 0; i != (int)NumSubElts; ++i) {
8491 int M = SubMask[i];
8492 if (0 <= M) {
8493 int InputIdx = M / NumSubElts;
8494 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8495 }
8496 Mask[i + InsertIdx] = M;
8497 }
8498 return true;
8499 }
8500 case X86ISD::PINSRB:
8501 case X86ISD::PINSRW:
8502 case ISD::SCALAR_TO_VECTOR:
8503 case ISD::INSERT_VECTOR_ELT: {
8504 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8505 // vector, for matching src/dst vector types.
8506 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8507
8508 unsigned DstIdx = 0;
8509 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8510 // Check we have an in-range constant insertion index.
8511 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8512 N.getConstantOperandAPInt(2).uge(NumElts))
8513 return false;
8514 DstIdx = N.getConstantOperandVal(2);
8515
8516 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8517 if (X86::isZeroNode(Scl)) {
8518 Ops.push_back(N.getOperand(0));
8519 for (unsigned i = 0; i != NumElts; ++i)
8520 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8521 return true;
8522 }
8523 }
8524
8525 // Peek through trunc/aext/zext.
8526 // TODO: aext shouldn't require SM_SentinelZero padding.
8527 // TODO: handle shift of scalars.
8528 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8529 while (Scl.getOpcode() == ISD::TRUNCATE ||
8530 Scl.getOpcode() == ISD::ANY_EXTEND ||
8531 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8532 Scl = Scl.getOperand(0);
8533 MinBitsPerElt =
8534 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8535 }
8536 if ((MinBitsPerElt % 8) != 0)
8537 return false;
8538
8539 // Attempt to find the source vector the scalar was extracted from.
8540 SDValue SrcExtract;
8541 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8542 Scl.getOpcode() == X86ISD::PEXTRW ||
8543 Scl.getOpcode() == X86ISD::PEXTRB) &&
8544 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8545 SrcExtract = Scl;
8546 }
8547 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8548 return false;
8549
8550 SDValue SrcVec = SrcExtract.getOperand(0);
8551 EVT SrcVT = SrcVec.getValueType();
8552 if (!SrcVT.getScalarType().isByteSized())
8553 return false;
8554 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8555 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8556 unsigned DstByte = DstIdx * NumBytesPerElt;
8557 MinBitsPerElt =
8558 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8559
8560 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8561 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8562 Ops.push_back(SrcVec);
8563 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8564 } else {
8565 Ops.push_back(SrcVec);
8566 Ops.push_back(N.getOperand(0));
8567 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8568 Mask.push_back(NumSizeInBytes + i);
8569 }
8570
8571 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8572 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8573 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8574 Mask[DstByte + i] = SrcByte + i;
8575 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8576 Mask[DstByte + i] = SM_SentinelZero;
8577 return true;
8578 }
8579 case X86ISD::PACKSS:
8580 case X86ISD::PACKUS: {
8581 SDValue N0 = N.getOperand(0);
8582 SDValue N1 = N.getOperand(1);
8583 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8584 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8585 "Unexpected input value type");
8586
8587 APInt EltsLHS, EltsRHS;
8588 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8589
8590 // If we know input saturation won't happen (or we don't care about
8591 // particular lanes), we can treat this as a truncation shuffle.
8592 bool Offset0 = false, Offset1 = false;
8593 if (Opcode == X86ISD::PACKSS) {
8594 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8595 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8596 (!(N1.isUndef() || EltsRHS.isZero()) &&
8597 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8598 return false;
8599 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8600 // PACKSS then it was likely being used for sign-extension for a
8601 // truncation, so just peek through and adjust the mask accordingly.
8602 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8603 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8604 Offset0 = true;
8605 N0 = N0.getOperand(0);
8606 }
8607 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8608 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8609 Offset1 = true;
8610 N1 = N1.getOperand(0);
8611 }
8612 } else {
8613 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8614 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8615 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8616 (!(N1.isUndef() || EltsRHS.isZero()) &&
8617 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8618 return false;
8619 }
8620
8621 bool IsUnary = (N0 == N1);
8622
8623 Ops.push_back(N0);
8624 if (!IsUnary)
8625 Ops.push_back(N1);
8626
8627 createPackShuffleMask(VT, Mask, IsUnary);
8628
8629 if (Offset0 || Offset1) {
8630 for (int &M : Mask)
8631 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8632 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8633 ++M;
8634 }
8635 return true;
8636 }
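// Worked example (illustrative): PACKUSWB of two v8i16 inputs whose 16-bit
// lanes are known to have all-zero high bytes decodes (via
// createPackShuffleMask) to the byte-level mask <0,2,4,...,14, 16,18,...,30>,
// i.e. it simply selects the low byte of every 16-bit element, which is a
// truncation. When an input was a VSRAI by the destination element width
// feeding a PACKSS, the Offset0/Offset1 fixup above bumps those indices by
// one so the mask reads the high byte of the pre-shift value instead.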
8637 case ISD::VSELECT:
8638 case X86ISD::BLENDV: {
8639 SDValue Cond = N.getOperand(0);
8640 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8641 Ops.push_back(N.getOperand(1));
8642 Ops.push_back(N.getOperand(2));
8643 return true;
8644 }
8645 return false;
8646 }
8647 case X86ISD::VTRUNC: {
8648 SDValue Src = N.getOperand(0);
8649 EVT SrcVT = Src.getValueType();
8650 // Truncated source must be a simple vector.
8651 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8652 (SrcVT.getScalarSizeInBits() % 8) != 0)
8653 return false;
8654 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8655 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8656 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8657 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8658 for (unsigned i = 0; i != NumSrcElts; ++i)
8659 Mask.push_back(i * Scale);
8660 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8661 Ops.push_back(Src);
8662 return true;
8663 }
8664 case X86ISD::VSHLI:
8665 case X86ISD::VSRLI: {
8666 uint64_t ShiftVal = N.getConstantOperandVal(1);
8667 // Out of range bit shifts are guaranteed to be zero.
8668 if (NumBitsPerElt <= ShiftVal) {
8669 Mask.append(NumElts, SM_SentinelZero);
8670 return true;
8671 }
8672
8673 // We can only decode 'whole byte' bit shifts as shuffles.
8674 if ((ShiftVal % 8) != 0)
8675 break;
8676
8677 uint64_t ByteShift = ShiftVal / 8;
8678 Ops.push_back(N.getOperand(0));
8679
8680 // Clear mask to all zeros and insert the shifted byte indices.
8681 Mask.append(NumSizeInBytes, SM_SentinelZero);
8682
8683 if (X86ISD::VSHLI == Opcode) {
8684 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8685 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8686 Mask[i + j] = i + j - ByteShift;
8687 } else {
8688 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8689 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8690 Mask[i + j - ByteShift] = i + j;
8691 }
8692 return true;
8693 }
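// e.g. (illustrative) a v2i64 VSRLI by 16 bits has ByteShift = 2 and decodes,
// per 8-byte element, to <2,3,4,5,6,7, SM_SentinelZero, SM_SentinelZero>:
// each result byte comes from two bytes higher in the (little-endian) lane
// and the top two bytes become zero. VSHLI mirrors this with the zeros at
// the bottom of each lane.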
8694 case X86ISD::VROTLI:
8695 case X86ISD::VROTRI: {
8696 // We can only decode 'whole byte' bit rotates as shuffles.
8697 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8698 if ((RotateVal % 8) != 0)
8699 return false;
8700 Ops.push_back(N.getOperand(0));
8701 int Offset = RotateVal / 8;
8702 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8703 for (int i = 0; i != (int)NumElts; ++i) {
8704 int BaseIdx = i * NumBytesPerElt;
8705 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8706 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8707 }
8708 }
8709 return true;
8710 }
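// e.g. (illustrative) a v4i32 VROTLI by 8 bits gives Offset = 3 and a
// per-lane byte mask of <3,0,1,2>: rotating a little-endian 32-bit lane left
// by one byte moves its top byte to the bottom and shifts the rest up, which
// is exactly that byte permutation repeated for every element.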
8711 case X86ISD::VBROADCAST: {
8712 SDValue Src = N.getOperand(0);
8713 if (!Src.getSimpleValueType().isVector()) {
8714 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8715 !isNullConstant(Src.getOperand(1)) ||
8716 Src.getOperand(0).getValueType().getScalarType() !=
8717 VT.getScalarType())
8718 return false;
8719 Src = Src.getOperand(0);
8720 }
8721 Ops.push_back(Src);
8722 Mask.append(NumElts, 0);
8723 return true;
8724 }
8725 case ISD::ZERO_EXTEND:
8726 case ISD::ANY_EXTEND:
8727 case ISD::ZERO_EXTEND_VECTOR_INREG:
8728 case ISD::ANY_EXTEND_VECTOR_INREG: {
8729 SDValue Src = N.getOperand(0);
8730 EVT SrcVT = Src.getValueType();
8731
8732 // Extended source must be a simple vector.
8733 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8734 (SrcVT.getScalarSizeInBits() % 8) != 0)
8735 return false;
8736
8737 bool IsAnyExtend =
8738 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8739 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8740 IsAnyExtend, Mask);
8741 Ops.push_back(Src);
8742 return true;
8743 }
8744 }
8745
8746 return false;
8747}
8748
8749/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8750static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8751 SmallVectorImpl<int> &Mask) {
8752 int MaskWidth = Mask.size();
8753 SmallVector<SDValue, 16> UsedInputs;
8754 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8755 int lo = UsedInputs.size() * MaskWidth;
8756 int hi = lo + MaskWidth;
8757
8758 // Strip UNDEF input usage.
8759 if (Inputs[i].isUndef())
8760 for (int &M : Mask)
8761 if ((lo <= M) && (M < hi))
8762 M = SM_SentinelUndef;
8763
8764 // Check for unused inputs.
8765 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8766 for (int &M : Mask)
8767 if (lo <= M)
8768 M -= MaskWidth;
8769 continue;
8770 }
8771
8772 // Check for repeated inputs.
8773 bool IsRepeat = false;
8774 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8775 if (UsedInputs[j] != Inputs[i])
8776 continue;
8777 for (int &M : Mask)
8778 if (lo <= M)
8779 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8780 IsRepeat = true;
8781 break;
8782 }
8783 if (IsRepeat)
8784 continue;
8785
8786 UsedInputs.push_back(Inputs[i]);
8787 }
8788 Inputs = UsedInputs;
8789}
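// Worked example (illustrative): with Inputs = {A, A} and Mask = <0,4,1,5>
// (MaskWidth = 4), the second input is recognised as a repeat of the first,
// so the mask is remapped to <0,0,1,1> and Inputs collapses to {A}. An input
// whose mask range is never referenced is dropped the same way, with all
// higher indices shifted down by MaskWidth.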
8790
8791/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8792/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8793/// Returns true if the target shuffle mask was decoded.
8794static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8795 SmallVectorImpl<SDValue> &Inputs,
8796 SmallVectorImpl<int> &Mask,
8797 APInt &KnownUndef, APInt &KnownZero,
8798 const SelectionDAG &DAG, unsigned Depth,
8799 bool ResolveKnownElts) {
8800 if (Depth >= SelectionDAG::MaxRecursionDepth)
8801 return false; // Limit search depth.
8802
8803 EVT VT = Op.getValueType();
8804 if (!VT.isSimple() || !VT.isVector())
8805 return false;
8806
8807 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8808 if (ResolveKnownElts)
8809 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8810 return true;
8811 }
8812 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8813 ResolveKnownElts)) {
8814 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8815 return true;
8816 }
8817 return false;
8818}
8819
8820static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8821 SmallVectorImpl<SDValue> &Inputs,
8822 SmallVectorImpl<int> &Mask,
8823 const SelectionDAG &DAG, unsigned Depth,
8824 bool ResolveKnownElts) {
8825 APInt KnownUndef, KnownZero;
8826 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8827 KnownZero, DAG, Depth, ResolveKnownElts);
8828}
8829
8830static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8831 SmallVectorImpl<int> &Mask,
8832 const SelectionDAG &DAG, unsigned Depth = 0,
8833 bool ResolveKnownElts = true) {
8834 EVT VT = Op.getValueType();
8835 if (!VT.isSimple() || !VT.isVector())
8836 return false;
8837
8838 unsigned NumElts = Op.getValueType().getVectorNumElements();
8839 APInt DemandedElts = APInt::getAllOnes(NumElts);
8840 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8841 ResolveKnownElts);
8842}
8843
8844// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8845static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8846 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8847 SelectionDAG &DAG) {
8848 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8849 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8850 "Unknown broadcast load type");
8851
8852 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8853 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8854 return SDValue();
8855
8856 SDValue Ptr =
8857 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8858 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8859 SDValue Ops[] = {Mem->getChain(), Ptr};
8860 SDValue BcstLd = DAG.getMemIntrinsicNode(
8861 Opcode, DL, Tys, Ops, MemVT,
8862 DAG.getMachineFunction().getMachineMemOperand(
8863 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8864 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8865 return BcstLd;
8866}
8867
8868/// Returns the scalar element that will make up the i'th
8869/// element of the result of the vector shuffle.
8870static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8871 SelectionDAG &DAG, unsigned Depth) {
8872 if (Depth >= SelectionDAG::MaxRecursionDepth)
8873 return SDValue(); // Limit search depth.
8874
8875 EVT VT = Op.getValueType();
8876 unsigned Opcode = Op.getOpcode();
8877 unsigned NumElems = VT.getVectorNumElements();
8878
8879 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8880 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8881 int Elt = SV->getMaskElt(Index);
8882
8883 if (Elt < 0)
8884 return DAG.getUNDEF(VT.getVectorElementType());
8885
8886 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8887 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8888 }
8889
8890 // Recurse into target specific vector shuffles to find scalars.
8891 if (isTargetShuffle(Opcode)) {
8892 MVT ShufVT = VT.getSimpleVT();
8893 MVT ShufSVT = ShufVT.getVectorElementType();
8894 int NumElems = (int)ShufVT.getVectorNumElements();
8895 SmallVector<int, 16> ShuffleMask;
8896 SmallVector<SDValue, 16> ShuffleOps;
8897 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8898 ShuffleMask))
8899 return SDValue();
8900
8901 int Elt = ShuffleMask[Index];
8902 if (Elt == SM_SentinelZero)
8903 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8904 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8905 if (Elt == SM_SentinelUndef)
8906 return DAG.getUNDEF(ShufSVT);
8907
8908 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8909 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8910 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8911 }
8912
8913 // Recurse into insert_subvector base/sub vector to find scalars.
8914 if (Opcode == ISD::INSERT_SUBVECTOR) {
8915 SDValue Vec = Op.getOperand(0);
8916 SDValue Sub = Op.getOperand(1);
8917 uint64_t SubIdx = Op.getConstantOperandVal(2);
8918 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8919
8920 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8921 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8922 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8923 }
8924
8925 // Recurse into concat_vectors sub vector to find scalars.
8926 if (Opcode == ISD::CONCAT_VECTORS) {
8927 EVT SubVT = Op.getOperand(0).getValueType();
8928 unsigned NumSubElts = SubVT.getVectorNumElements();
8929 uint64_t SubIdx = Index / NumSubElts;
8930 uint64_t SubElt = Index % NumSubElts;
8931 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into extract_subvector src vector to find scalars.
8935 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8936 SDValue Src = Op.getOperand(0);
8937 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8938 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8939 }
8940
8941 // We only peek through bitcasts of the same vector width.
8942 if (Opcode == ISD::BITCAST) {
8943 SDValue Src = Op.getOperand(0);
8944 EVT SrcVT = Src.getValueType();
8945 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8946 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8947 return SDValue();
8948 }
8949
8950 // Actual nodes that may contain scalar elements
8951
8952 // For insert_vector_elt - either return the index matching scalar or recurse
8953 // into the base vector.
8954 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8955 isa<ConstantSDNode>(Op.getOperand(2))) {
8956 if (Op.getConstantOperandAPInt(2) == Index)
8957 return Op.getOperand(1);
8958 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8959 }
8960
8961 if (Opcode == ISD::SCALAR_TO_VECTOR)
8962 return (Index == 0) ? Op.getOperand(0)
8963 : DAG.getUNDEF(VT.getVectorElementType());
8964
8965 if (Opcode == ISD::BUILD_VECTOR)
8966 return Op.getOperand(Index);
8967
8968 return SDValue();
8969}
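// e.g. (illustrative) for Op = vector_shuffle<2,6,3,7>(A, B) and Index = 1,
// the mask element 6 selects the second operand, so the walk recurses into B
// at index 6 % 4 = 2 and keeps peeling shuffles, inserts and bitcasts until
// it reaches a BUILD_VECTOR or SCALAR_TO_VECTOR scalar, or gives up with an
// empty SDValue.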
8970
8971// Use PINSRB/PINSRW/PINSRD to create a build vector.
8972static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8973 unsigned NumNonZero, unsigned NumZero,
8974 SelectionDAG &DAG,
8975 const X86Subtarget &Subtarget) {
8976 MVT VT = Op.getSimpleValueType();
8977 unsigned NumElts = VT.getVectorNumElements();
8978 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8979 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8980 "Illegal vector insertion");
8981
8982 SDLoc dl(Op);
8983 SDValue V;
8984 bool First = true;
8985
8986 for (unsigned i = 0; i < NumElts; ++i) {
8987 bool IsNonZero = NonZeroMask[i];
8988 if (!IsNonZero)
8989 continue;
8990
8991 // If the build vector contains zeros or our first insertion is not the
8992 // first index, then insert into a zero vector to break any register
8993 // dependency; otherwise use SCALAR_TO_VECTOR.
8994 if (First) {
8995 First = false;
8996 if (NumZero || 0 != i)
8997 V = getZeroVector(VT, Subtarget, DAG, dl);
8998 else {
8999 assert(0 == i && "Expected insertion into zero-index");
9000 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9001 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9002 V = DAG.getBitcast(VT, V);
9003 continue;
9004 }
9005 }
9006 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9007 DAG.getIntPtrConstant(i, dl));
9008 }
9009
9010 return V;
9011}
9012
9013/// Custom lower build_vector of v16i8.
9014static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9015 unsigned NumNonZero, unsigned NumZero,
9016 SelectionDAG &DAG,
9017 const X86Subtarget &Subtarget) {
9018 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9019 return SDValue();
9020
9021 // SSE4.1 - use PINSRB to insert each byte directly.
9022 if (Subtarget.hasSSE41())
9023 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9024 Subtarget);
9025
9026 SDLoc dl(Op);
9027 SDValue V;
9028
9029 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9030 for (unsigned i = 0; i < 16; i += 2) {
9031 bool ThisIsNonZero = NonZeroMask[i];
9032 bool NextIsNonZero = NonZeroMask[i + 1];
9033 if (!ThisIsNonZero && !NextIsNonZero)
9034 continue;
9035
9036 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9037 SDValue Elt;
9038 if (ThisIsNonZero) {
9039 if (NumZero || NextIsNonZero)
9040 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9041 else
9042 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9043 }
9044
9045 if (NextIsNonZero) {
9046 SDValue NextElt = Op.getOperand(i + 1);
9047 if (i == 0 && NumZero)
9048 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9049 else
9050 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9051 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9052 DAG.getConstant(8, dl, MVT::i8));
9053 if (ThisIsNonZero)
9054 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9055 else
9056 Elt = NextElt;
9057 }
9058
9059 // If our first insertion is not the first index or zeros are needed, then
9060 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9061 // elements undefined).
9062 if (!V) {
9063 if (i != 0 || NumZero)
9064 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9065 else {
9066 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9067 V = DAG.getBitcast(MVT::v8i16, V);
9068 continue;
9069 }
9070 }
9071 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9072 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9073 DAG.getIntPtrConstant(i / 2, dl));
9074 }
9075
9076 return DAG.getBitcast(MVT::v16i8, V);
9077}
9078
9079/// Custom lower build_vector of v8i16.
9080static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9081 unsigned NumNonZero, unsigned NumZero,
9082 SelectionDAG &DAG,
9083 const X86Subtarget &Subtarget) {
9084 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9085 return SDValue();
9086
9087 // Use PINSRW to insert each byte directly.
9088 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9089 Subtarget);
9090}
9091
9092/// Custom lower build_vector of v4i32 or v4f32.
9093static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9094 const X86Subtarget &Subtarget) {
9095 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9096 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9097 // Because we're creating a less complicated build vector here, we may enable
9098 // further folding of the MOVDDUP via shuffle transforms.
9099 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9100 Op.getOperand(0) == Op.getOperand(2) &&
9101 Op.getOperand(1) == Op.getOperand(3) &&
9102 Op.getOperand(0) != Op.getOperand(1)) {
9103 SDLoc DL(Op);
9104 MVT VT = Op.getSimpleValueType();
9105 MVT EltVT = VT.getVectorElementType();
9106 // Create a new build vector with the first 2 elements followed by undef
9107 // padding, bitcast to v2f64, duplicate, and bitcast back.
9108 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9109 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9110 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9111 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9112 return DAG.getBitcast(VT, Dup);
9113 }
9114
9115 // Find all zeroable elements.
9116 std::bitset<4> Zeroable, Undefs;
9117 for (int i = 0; i < 4; ++i) {
9118 SDValue Elt = Op.getOperand(i);
9119 Undefs[i] = Elt.isUndef();
9120 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9121 }
9122 assert(Zeroable.size() - Zeroable.count() > 1 &&
9123 "We expect at least two non-zero elements!");
9124
9125 // We only know how to deal with build_vector nodes where elements are either
9126 // zeroable or extract_vector_elt with constant index.
9127 SDValue FirstNonZero;
9128 unsigned FirstNonZeroIdx;
9129 for (unsigned i = 0; i < 4; ++i) {
9130 if (Zeroable[i])
9131 continue;
9132 SDValue Elt = Op.getOperand(i);
9133 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9134 !isa<ConstantSDNode>(Elt.getOperand(1)))
9135 return SDValue();
9136 // Make sure that this node is extracting from a 128-bit vector.
9137 MVT VT = Elt.getOperand(0).getSimpleValueType();
9138 if (!VT.is128BitVector())
9139 return SDValue();
9140 if (!FirstNonZero.getNode()) {
9141 FirstNonZero = Elt;
9142 FirstNonZeroIdx = i;
9143 }
9144 }
9145
9146 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9147 SDValue V1 = FirstNonZero.getOperand(0);
9148 MVT VT = V1.getSimpleValueType();
9149
9150 // See if this build_vector can be lowered as a blend with zero.
9151 SDValue Elt;
9152 unsigned EltMaskIdx, EltIdx;
9153 int Mask[4];
9154 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9155 if (Zeroable[EltIdx]) {
9156 // The zero vector will be on the right hand side.
9157 Mask[EltIdx] = EltIdx+4;
9158 continue;
9159 }
9160
9161 Elt = Op->getOperand(EltIdx);
9162 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
9163 EltMaskIdx = Elt.getConstantOperandVal(1);
9164 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9165 break;
9166 Mask[EltIdx] = EltIdx;
9167 }
9168
9169 if (EltIdx == 4) {
9170 // Let the shuffle legalizer deal with blend operations.
9171 SDValue VZeroOrUndef = (Zeroable == Undefs)
9172 ? DAG.getUNDEF(VT)
9173 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9174 if (V1.getSimpleValueType() != VT)
9175 V1 = DAG.getBitcast(VT, V1);
9176 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9177 }
9178
9179 // See if we can lower this build_vector to an INSERTPS.
9180 if (!Subtarget.hasSSE41())
9181 return SDValue();
9182
9183 SDValue V2 = Elt.getOperand(0);
9184 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9185 V1 = SDValue();
9186
9187 bool CanFold = true;
9188 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9189 if (Zeroable[i])
9190 continue;
9191
9192 SDValue Current = Op->getOperand(i);
9193 SDValue SrcVector = Current->getOperand(0);
9194 if (!V1.getNode())
9195 V1 = SrcVector;
9196 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9197 }
9198
9199 if (!CanFold)
9200 return SDValue();
9201
9202 assert(V1.getNode() && "Expected at least two non-zero elements!");
9203 if (V1.getSimpleValueType() != MVT::v4f32)
9204 V1 = DAG.getBitcast(MVT::v4f32, V1);
9205 if (V2.getSimpleValueType() != MVT::v4f32)
9206 V2 = DAG.getBitcast(MVT::v4f32, V2);
9207
9208 // Ok, we can emit an INSERTPS instruction.
9209 unsigned ZMask = Zeroable.to_ulong();
9210
9211 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9212 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9213 SDLoc DL(Op);
9214 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9215 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9216 return DAG.getBitcast(VT, Result);
9217}
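
The InsertPSMask assembled above packs three fields into one 8-bit immediate: the source element index in bits [7:6], the destination lane in bits [5:4], and the zeroed-lane mask in bits [3:0]. A minimal standalone sketch of that packing (hypothetical helper name, not LLVM code):

#include <cassert>
#include <cstdint>

// Bits [7:6] = source element, bits [5:4] = destination lane,
// bits [3:0] = mask of destination lanes forced to zero.
static uint8_t buildInsertPSImm(unsigned SrcElt, unsigned DstLane,
                                unsigned ZeroLanes) {
  assert(SrcElt < 4 && DstLane < 4 && ZeroLanes < 16 && "field out of range");
  return static_cast<uint8_t>((SrcElt << 6) | (DstLane << 4) | ZeroLanes);
}

int main() {
  // Insert element 2 of the source into lane 1, zeroing lane 3: 0b10'01'1000.
  assert(buildInsertPSImm(2, 1, 0b1000) == 0x98);
  return 0;
}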
9218
9219/// Return a vector logical shift node.
9220static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9221 SelectionDAG &DAG, const TargetLowering &TLI,
9222 const SDLoc &dl) {
9223 assert(VT.is128BitVector() && "Unknown type for VShift");
9224 MVT ShVT = MVT::v16i8;
9225 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9226 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9227 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9228 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9229 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9230}
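
getVShift models the whole-register byte shifts PSLLDQ/PSRLDQ: the bit count is converted to a byte count (NumBits / 8) and applied to the v16i8 view of the source. A standalone sketch of the left-shift case on a plain 16-byte array (illustrative only, not the DAG lowering; lane movement shown in the little-endian view):

#include <array>
#include <cstdint>
#include <cstring>

// Shift all 16 bytes toward higher-numbered lanes, filling vacated bytes with
// zero - the effect of X86ISD::VSHLDQ with ShiftVal = NumBits / 8.
static std::array<uint8_t, 16> shiftBytesLeft(const std::array<uint8_t, 16> &V,
                                              unsigned NumBits) {
  unsigned NumBytes = NumBits / 8; // getVShift asserts NumBits % 8 == 0
  std::array<uint8_t, 16> R{};
  if (NumBytes < 16)
    std::memcpy(R.data() + NumBytes, V.data(), 16 - NumBytes);
  return R;
}

int main() {
  std::array<uint8_t, 16> V{};
  V[0] = 0xAB;
  return shiftBytesLeft(V, 32)[4] == 0xAB ? 0 : 1; // byte 0 moves to lane 4
}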
9231
9232static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9233 SelectionDAG &DAG) {
9234
9235 // Check if the scalar load can be widened into a vector load. And if
9236 // the address is "base + cst" see if the cst can be "absorbed" into
9237 // the shuffle mask.
9238 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9239 SDValue Ptr = LD->getBasePtr();
9240 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9241 return SDValue();
9242 EVT PVT = LD->getValueType(0);
9243 if (PVT != MVT::i32 && PVT != MVT::f32)
9244 return SDValue();
9245
9246 int FI = -1;
9247 int64_t Offset = 0;
9248 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9249 FI = FINode->getIndex();
9250 Offset = 0;
9251 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9252 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9253 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9254 Offset = Ptr.getConstantOperandVal(1);
9255 Ptr = Ptr.getOperand(0);
9256 } else {
9257 return SDValue();
9258 }
9259
9260 // FIXME: 256-bit vector instructions don't require strict alignment;
9261 // improve this code to support them better.
9262 Align RequiredAlign(VT.getSizeInBits() / 8);
9263 SDValue Chain = LD->getChain();
9264 // Make sure the stack object alignment is at least 16 or 32.
9265 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9266 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9267 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9268 if (MFI.isFixedObjectIndex(FI)) {
9269 // Can't change the alignment. FIXME: It's possible to compute
9270 // the exact stack offset and reference FI + adjusted offset instead.
9271 // If someone *really* cares about this, that's the way to implement it.
9272 return SDValue();
9273 } else {
9274 MFI.setObjectAlignment(FI, RequiredAlign);
9275 }
9276 }
9277
9278 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9279 // Ptr + (Offset & ~15).
9280 if (Offset < 0)
9281 return SDValue();
9282 if ((Offset % RequiredAlign.value()) & 3)
9283 return SDValue();
9284 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9285 if (StartOffset) {
9286 SDLoc DL(Ptr);
9287 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9288 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9289 }
9290
9291 int EltNo = (Offset - StartOffset) >> 2;
9292 unsigned NumElems = VT.getVectorNumElements();
9293
9294 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9295 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9296 LD->getPointerInfo().getWithOffset(StartOffset));
9297
9298 SmallVector<int, 8> Mask(NumElems, EltNo);
9299
9300 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9301 }
9302
9303 return SDValue();
9304}
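
The offset handling above reduces to integer arithmetic: the constant offset must be non-negative and land on a 4-byte slot within the required alignment, the widened load starts at Offset & ~(RequiredAlign - 1), and the original scalar becomes element (Offset - StartOffset) >> 2 of that load. A minimal sketch of just that arithmetic (hypothetical names, assuming 4-byte scalars as in the i32/f32 case above):

#include <cassert>
#include <cstdint>
#include <optional>

struct SplatLoadPlan {
  int64_t StartOffset; // offset of the widened vector load
  int EltNo;           // element of that load holding the original scalar
};

static std::optional<SplatLoadPlan> planSplatLoad(int64_t Offset,
                                                  uint64_t RequiredAlign) {
  if (Offset < 0 || ((Offset % int64_t(RequiredAlign)) & 3) != 0)
    return std::nullopt; // must be a 4-byte aligned slot inside the vector
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  return SplatLoadPlan{StartOffset, int((Offset - StartOffset) >> 2)};
}

int main() {
  auto P = planSplatLoad(/*Offset=*/20, /*RequiredAlign=*/16);
  assert(P && P->StartOffset == 16 && P->EltNo == 1);
  return 0;
}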
9305
9306 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9307static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9308 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9309 auto *BaseLd = cast<LoadSDNode>(Elt);
9310 if (!BaseLd->isSimple())
9311 return false;
9312 Ld = BaseLd;
9313 ByteOffset = 0;
9314 return true;
9315 }
9316
9317 switch (Elt.getOpcode()) {
9318 case ISD::BITCAST:
9319 case ISD::TRUNCATE:
9320 case ISD::SCALAR_TO_VECTOR:
9321 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9322 case ISD::SRL:
9323 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9324 uint64_t Amt = AmtC->getZExtValue();
9325 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9326 ByteOffset += Amt / 8;
9327 return true;
9328 }
9329 }
9330 break;
9331 case ISD::EXTRACT_VECTOR_ELT:
9332 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9333 SDValue Src = Elt.getOperand(0);
9334 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9335 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9336 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9337 findEltLoadSrc(Src, Ld, ByteOffset)) {
9338 uint64_t Idx = IdxC->getZExtValue();
9339 ByteOffset += Idx * (SrcSizeInBits / 8);
9340 return true;
9341 }
9342 }
9343 break;
9344 }
9345
9346 return false;
9347}
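
Two of the cases above contribute to the accumulated byte offset: a right shift by a whole number of bytes, and an extract of a byte-sized vector element. A small standalone sketch of that bookkeeping (hypothetical helpers, not the SelectionDAG walk itself):

#include <cassert>
#include <cstdint>
#include <optional>

// (srl X, Amt): the value now starts Amt / 8 bytes into the load, provided
// the shift amount is byte aligned.
static std::optional<uint64_t> srlByteOffset(uint64_t AmtBits) {
  if (AmtBits % 8 != 0)
    return std::nullopt;
  return AmtBits / 8;
}

// (extract_vector_elt V, Idx): element Idx starts Idx * (EltBits / 8) bytes
// into the loaded vector, for byte-sized element types.
static std::optional<uint64_t> extractEltByteOffset(uint64_t Idx,
                                                    unsigned EltBits) {
  if (EltBits % 8 != 0)
    return std::nullopt;
  return Idx * (EltBits / 8);
}

int main() {
  assert(*srlByteOffset(32) == 4);           // (srl X, 32) -> +4 bytes
  assert(*extractEltByteOffset(3, 16) == 6); // lane 3 of v8i16 -> +6 bytes
  return 0;
}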
9348
9349/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9350/// elements can be replaced by a single large load which has the same value as
9351/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9352///
9353/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9354static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9355 const SDLoc &DL, SelectionDAG &DAG,
9356 const X86Subtarget &Subtarget,
9357 bool IsAfterLegalize) {
9358 if ((VT.getScalarSizeInBits() % 8) != 0)
9359 return SDValue();
9360
9361 unsigned NumElems = Elts.size();
9362
9363 int LastLoadedElt = -1;
9364 APInt LoadMask = APInt::getZero(NumElems);
9365 APInt ZeroMask = APInt::getZero(NumElems);
9366 APInt UndefMask = APInt::getZero(NumElems);
9367
9368 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9369 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9370
9371 // For each element in the initializer, see if we've found a load, zero or an
9372 // undef.
9373 for (unsigned i = 0; i < NumElems; ++i) {
9374 SDValue Elt = peekThroughBitcasts(Elts[i]);
9375 if (!Elt.getNode())
9376 return SDValue();
9377 if (Elt.isUndef()) {
9378 UndefMask.setBit(i);
9379 continue;
9380 }
9381 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9382 ZeroMask.setBit(i);
9383 continue;
9384 }
9385
9386 // Each loaded element must be the correct fractional portion of the
9387 // requested vector load.
9388 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9389 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9390 return SDValue();
9391
9392 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9393 return SDValue();
9394 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9395 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9396 return SDValue();
9397
9398 LoadMask.setBit(i);
9399 LastLoadedElt = i;
9400 }
9401 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9402            NumElems &&
9403        "Incomplete element masks");
9404
9405 // Handle Special Cases - all undef or undef/zero.
9406 if (UndefMask.popcount() == NumElems)
9407 return DAG.getUNDEF(VT);
9408 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9409 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9410 : DAG.getConstantFP(0.0, DL, VT);
9411
9412 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9413 int FirstLoadedElt = LoadMask.countr_zero();
9414 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9415 EVT EltBaseVT = EltBase.getValueType();
9416 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9417        "Register/Memory size mismatch");
9418 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9419 assert(LDBase && "Did not find base load for merging consecutive loads");
9420 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9421 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9422 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9423 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9424 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9425
9426 // TODO: Support offsetting the base load.
9427 if (ByteOffsets[FirstLoadedElt] != 0)
9428 return SDValue();
9429
9430 // Check to see if the element's load is consecutive to the base load
9431 // or offset from a previous (already checked) load.
9432 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9433 LoadSDNode *Ld = Loads[EltIdx];
9434 int64_t ByteOffset = ByteOffsets[EltIdx];
9435 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9436 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9437 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9438 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9439 }
9440 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9441 EltIdx - FirstLoadedElt);
9442 };
9443
9444 // Consecutive loads can contain UNDEFs but not ZERO elements.
9445 // Consecutive loads with UNDEF and ZERO elements require an
9446 // additional shuffle stage to clear the ZERO elements.
9447 bool IsConsecutiveLoad = true;
9448 bool IsConsecutiveLoadWithZeros = true;
9449 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9450 if (LoadMask[i]) {
9451 if (!CheckConsecutiveLoad(LDBase, i)) {
9452 IsConsecutiveLoad = false;
9453 IsConsecutiveLoadWithZeros = false;
9454 break;
9455 }
9456 } else if (ZeroMask[i]) {
9457 IsConsecutiveLoad = false;
9458 }
9459 }
9460
9461 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9462 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9463 assert(LDBase->isSimple() &&
9464        "Cannot merge volatile or atomic loads.");
9465 SDValue NewLd =
9466 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9467 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9468 MMOFlags);
9469 for (auto *LD : Loads)
9470 if (LD)
9471 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9472 return NewLd;
9473 };
9474
9475 // Check if the base load is entirely dereferenceable.
9476 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9477 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9478
9479 // LOAD - all consecutive load/undefs (must start/end with a load or be
9480 // entirely dereferenceable). If we have found an entire vector of loads and
9481 // undefs, then return a large load of the entire vector width starting at the
9482 // base pointer. If the vector contains zeros, then attempt to shuffle those
9483 // elements.
9484 if (FirstLoadedElt == 0 &&
9485 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9486 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9487 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9488 return SDValue();
9489
9490 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9491 // will lower to regular temporal loads and use the cache.
9492 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9493 VT.is256BitVector() && !Subtarget.hasInt256())
9494 return SDValue();
9495
9496 if (NumElems == 1)
9497 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9498
9499 if (!ZeroMask)
9500 return CreateLoad(VT, LDBase);
9501
9502 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9503 // vector and a zero vector to clear out the zero elements.
9504 if (!IsAfterLegalize && VT.isVector()) {
9505 unsigned NumMaskElts = VT.getVectorNumElements();
9506 if ((NumMaskElts % NumElems) == 0) {
9507 unsigned Scale = NumMaskElts / NumElems;
9508 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9509 for (unsigned i = 0; i < NumElems; ++i) {
9510 if (UndefMask[i])
9511 continue;
9512 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9513 for (unsigned j = 0; j != Scale; ++j)
9514 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9515 }
9516 SDValue V = CreateLoad(VT, LDBase);
9517 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9518 : DAG.getConstantFP(0.0, DL, VT);
9519 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9520 }
9521 }
9522 }
9523
9524 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9525 if (VT.is256BitVector() || VT.is512BitVector()) {
9526 unsigned HalfNumElems = NumElems / 2;
9527 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9528 EVT HalfVT =
9529 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9530 SDValue HalfLD =
9531 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9532 DAG, Subtarget, IsAfterLegalize);
9533 if (HalfLD)
9534 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9535 HalfLD, DAG.getIntPtrConstant(0, DL));
9536 }
9537 }
9538
9539 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9540 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9541 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9542 LoadSizeInBits == 64) &&
9543 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9544 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9545 : MVT::getIntegerVT(LoadSizeInBits);
9546 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9547 // Allow v4f32 on SSE1 only targets.
9548 // FIXME: Add more isel patterns so we can just use VT directly.
9549 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9550 VecVT = MVT::v4f32;
9551 if (TLI.isTypeLegal(VecVT)) {
9552 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9553 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9554 SDValue ResNode = DAG.getMemIntrinsicNode(
9555 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9556 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9557 for (auto *LD : Loads)
9558 if (LD)
9559 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9560 return DAG.getBitcast(VT, ResNode);
9561 }
9562 }
9563
9564 // BROADCAST - match the smallest possible repetition pattern, load that
9565 // scalar/subvector element and then broadcast to the entire vector.
9566 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9567 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9568 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9569 unsigned RepeatSize = SubElems * BaseSizeInBits;
9570 unsigned ScalarSize = std::min(RepeatSize, 64u);
9571 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9572 continue;
9573
9574 // Don't attempt a 1:N subvector broadcast - it should be caught by
9575 // combineConcatVectorOps, else it will cause infinite loops.
9576 if (RepeatSize > ScalarSize && SubElems == 1)
9577 continue;
9578
9579 bool Match = true;
9580 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9581 for (unsigned i = 0; i != NumElems && Match; ++i) {
9582 if (!LoadMask[i])
9583 continue;
9584 SDValue Elt = peekThroughBitcasts(Elts[i]);
9585 if (RepeatedLoads[i % SubElems].isUndef())
9586 RepeatedLoads[i % SubElems] = Elt;
9587 else
9588 Match &= (RepeatedLoads[i % SubElems] == Elt);
9589 }
9590
9591 // We must have loads at both ends of the repetition.
9592 Match &= !RepeatedLoads.front().isUndef();
9593 Match &= !RepeatedLoads.back().isUndef();
9594 if (!Match)
9595 continue;
9596
9597 EVT RepeatVT =
9598 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9599 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9600 : EVT::getFloatingPointVT(ScalarSize);
9601 if (RepeatSize > ScalarSize)
9602 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9603 RepeatSize / ScalarSize);
9604 EVT BroadcastVT =
9605 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9606 VT.getSizeInBits() / ScalarSize);
9607 if (TLI.isTypeLegal(BroadcastVT)) {
9608 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9609 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9610 SDValue Broadcast = RepeatLoad;
9611 if (RepeatSize > ScalarSize) {
9612 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9613 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9614 } else {
9615 if (!Subtarget.hasAVX2() &&
9616 !X86::mayFoldLoadIntoBroadcastFromMem(
9617 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9618 Subtarget,
9619 /*AssumeSingleUse=*/true))
9620 return SDValue();
9621 Broadcast =
9622 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9623 }
9624 return DAG.getBitcast(VT, Broadcast);
9625 }
9626 }
9627 }
9628 }
9629
9630 return SDValue();
9631}
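
The CheckConsecutiveLoad lambda above accepts an element either because it is consecutive with the base load or because it re-reads bytes of a load already validated at an earlier index. A simplified standalone model of the second case (illustrative types, not the LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

struct EltLoad {
  int LoadId = -1;        // which underlying load feeds this element (-1 = none)
  int64_t ByteOffset = 0; // byte offset into that load
};

static bool reusesEarlierLoad(const std::vector<EltLoad> &Elts, int EltIdx,
                              unsigned BaseSizeInBytes) {
  const EltLoad &E = Elts[EltIdx];
  if (E.ByteOffset == 0 || (E.ByteOffset % BaseSizeInBytes) != 0)
    return false;
  int64_t BaseIdx = EltIdx - (E.ByteOffset / BaseSizeInBytes);
  return BaseIdx >= 0 && BaseIdx < (int64_t)Elts.size() &&
         Elts[BaseIdx].LoadId == E.LoadId && Elts[BaseIdx].ByteOffset == 0;
}

int main() {
  // Element 1 reads bytes 4..7 of the same load that element 0 reads at offset 0.
  std::vector<EltLoad> Elts = {{0, 0}, {0, 4}};
  assert(reusesEarlierLoad(Elts, 1, /*BaseSizeInBytes=*/4));
  return 0;
}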
9632
9633 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9634// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9635// are consecutive, non-overlapping, and in the right order.
9636static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9637 SelectionDAG &DAG,
9638 const X86Subtarget &Subtarget,
9639 bool IsAfterLegalize) {
9640 SmallVector<SDValue, 64> Elts;
9641 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9642 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9643 Elts.push_back(Elt);
9644 continue;
9645 }
9646 return SDValue();
9647 }
9648 assert(Elts.size() == VT.getVectorNumElements());
9649 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9650 IsAfterLegalize);
9651}
9652
9653static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9654 unsigned SplatBitSize, LLVMContext &C) {
9655 unsigned ScalarSize = VT.getScalarSizeInBits();
9656 unsigned NumElm = SplatBitSize / ScalarSize;
9657
9658 SmallVector<Constant *, 32> ConstantVec;
9659 for (unsigned i = 0; i < NumElm; i++) {
9660 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9661 Constant *Const;
9662 if (VT.isFloatingPoint()) {
9663 if (ScalarSize == 16) {
9664 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9665 } else if (ScalarSize == 32) {
9666 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9667 } else {
9668 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9669 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9670 }
9671 } else
9672 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9673 ConstantVec.push_back(Const);
9674 }
9675 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9676}
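
getConstantVector simply slices a SplatBitSize-wide repeated constant into SplatBitSize / ScalarSize pieces and rebuilds each piece as a scalar Constant. A trivial standalone sketch of the slicing step, fixed to 64-bit splats of 32-bit scalars for brevity (hypothetical helper, not LLVM's APInt code):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> sliceSplat64(uint64_t Splat, unsigned ScalarBits) {
  assert(ScalarBits == 32 && "sketch only handles 32-bit scalars");
  return {static_cast<uint32_t>(Splat), static_cast<uint32_t>(Splat >> 32)};
}

int main() {
  auto V = sliceSplat64(0x1122334455667788ULL, 32);
  assert(V[0] == 0x55667788u && V[1] == 0x11223344u); // low slice first
  return 0;
}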
9677
9678static bool isFoldableUseOfShuffle(SDNode *N) {
9679 for (auto *U : N->uses()) {
9680 unsigned Opc = U->getOpcode();
9681 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9682 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9683 return false;
9684 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9685 return false;
9686 if (isTargetShuffle(Opc))
9687 return true;
9688 if (Opc == ISD::BITCAST) // Ignore bitcasts
9689 return isFoldableUseOfShuffle(U);
9690 if (N->hasOneUse()) {
9691 // TODO, there may be some general way to know if a SDNode can
9692 // be folded. We now only know whether an MI is foldable.
9693 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9694 return false;
9695 return true;
9696 }
9697 }
9698 return false;
9699}
9700
9701/// Attempt to use the vbroadcast instruction to generate a splat value
9702/// from a splat BUILD_VECTOR which uses:
9703/// a. A single scalar load, or a constant.
9704/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9705///
9706/// The VBROADCAST node is returned when a pattern is found,
9707/// or SDValue() otherwise.
9708static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9709 const X86Subtarget &Subtarget,
9710 SelectionDAG &DAG) {
9711 // VBROADCAST requires AVX.
9712 // TODO: Splats could be generated for non-AVX CPUs using SSE
9713 // instructions, but there's less potential gain for only 128-bit vectors.
9714 if (!Subtarget.hasAVX())
9715 return SDValue();
9716
9717 MVT VT = BVOp->getSimpleValueType(0);
9718 unsigned NumElts = VT.getVectorNumElements();
9719 SDLoc dl(BVOp);
9720
9721 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9722        "Unsupported vector type for broadcast.");
9723
9724 // See if the build vector is a repeating sequence of scalars (inc. splat).
9725 SDValue Ld;
9726 BitVector UndefElements;
9727 SmallVector<SDValue, 16> Sequence;
9728 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9729 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9730 if (Sequence.size() == 1)
9731 Ld = Sequence[0];
9732 }
9733
9734 // Attempt to use VBROADCASTM
9735 // From this pattern:
9736 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9737 // b. t1 = (build_vector t0 t0)
9738 //
9739 // Create (VBROADCASTM v2i1 X)
9740 if (!Sequence.empty() && Subtarget.hasCDI()) {
9741 // If not a splat, are the upper sequence values zeroable?
9742 unsigned SeqLen = Sequence.size();
9743 bool UpperZeroOrUndef =
9744 SeqLen == 1 ||
9745 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9746 return !V || V.isUndef() || isNullConstant(V);
9747 });
9748 SDValue Op0 = Sequence[0];
9749 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9750 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9751 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9752 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9753 ? Op0.getOperand(0)
9754 : Op0.getOperand(0).getOperand(0);
9755 MVT MaskVT = BOperand.getSimpleValueType();
9756 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9757 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9758 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9759 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9760 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9761 unsigned Scale = 512 / VT.getSizeInBits();
9762 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9763 }
9764 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9765 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9766 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9767 return DAG.getBitcast(VT, Bcst);
9768 }
9769 }
9770 }
9771
9772 unsigned NumUndefElts = UndefElements.count();
9773 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9774 APInt SplatValue, Undef;
9775 unsigned SplatBitSize;
9776 bool HasUndef;
9777 // Check if this is a repeated constant pattern suitable for broadcasting.
9778 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9779 SplatBitSize > VT.getScalarSizeInBits() &&
9780 SplatBitSize < VT.getSizeInBits()) {
9781 // Avoid replacing with broadcast when it's a use of a shuffle
9782 // instruction to preserve the present custom lowering of shuffles.
9783 if (isFoldableUseOfShuffle(BVOp))
9784 return SDValue();
9785 // replace BUILD_VECTOR with broadcast of the repeated constants.
9786 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9787 LLVMContext *Ctx = DAG.getContext();
9788 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9789 if (Subtarget.hasAVX()) {
9790 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9791 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9792 // Splatted value can fit in one INTEGER constant in constant pool.
9793 // Load the constant and broadcast it.
9794 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9795 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9796 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9797 SDValue CP = DAG.getConstantPool(C, PVT);
9798 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9799
9800 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9801 SDVTList Tys =
9802 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9803 SDValue Ops[] = {DAG.getEntryNode(), CP};
9804 MachinePointerInfo MPI =
9805 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9806 SDValue Brdcst = DAG.getMemIntrinsicNode(
9807 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9808 MachineMemOperand::MOLoad);
9809 return DAG.getBitcast(VT, Brdcst);
9810 }
9811 if (SplatBitSize > 64) {
9812 // Load the vector of constants and broadcast it.
9813 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9814 *Ctx);
9815 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9816 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9817 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9818 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9819 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9820 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9821 MachinePointerInfo MPI =
9822 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9823 return DAG.getMemIntrinsicNode(
9824 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9825 MachineMemOperand::MOLoad);
9826 }
9827 }
9828 }
9829
9830 // If we are moving a scalar into a vector (Ld must be set and all elements
9831 // but 1 are undef) and that operation is not obviously supported by
9832 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9833 // That's better than general shuffling and may eliminate a load to GPR and
9834 // move from scalar to vector register.
9835 if (!Ld || NumElts - NumUndefElts != 1)
9836 return SDValue();
9837 unsigned ScalarSize = Ld.getValueSizeInBits();
9838 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9839 return SDValue();
9840 }
9841
9842 bool ConstSplatVal =
9843 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9844 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9845
9846 // TODO: Handle broadcasts of non-constant sequences.
9847
9848 // Make sure that all of the users of a non-constant load are from the
9849 // BUILD_VECTOR node.
9850 // FIXME: Is the use count needed for non-constant, non-load case?
9851 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9852 return SDValue();
9853
9854 unsigned ScalarSize = Ld.getValueSizeInBits();
9855 bool IsGE256 = (VT.getSizeInBits() >= 256);
9856
9857 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9858 // instruction to save 8 or more bytes of constant pool data.
9859 // TODO: If multiple splats are generated to load the same constant,
9860 // it may be detrimental to overall size. There needs to be a way to detect
9861 // that condition to know if this is truly a size win.
9862 bool OptForSize = DAG.shouldOptForSize();
9863
9864 // Handle broadcasting a single constant scalar from the constant pool
9865 // into a vector.
9866 // On Sandybridge (no AVX2), it is still better to load a constant vector
9867 // from the constant pool and not to broadcast it from a scalar.
9868 // But override that restriction when optimizing for size.
9869 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9870 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9871 EVT CVT = Ld.getValueType();
9872 assert(!CVT.isVector() && "Must not broadcast a vector type");
9873
9874 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9875 // For size optimization, also splat v2f64 and v2i64, and for size opt
9876 // with AVX2, also splat i8 and i16.
9877 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9878 if (ScalarSize == 32 ||
9879 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9880 CVT == MVT::f16 ||
9881 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9882 const Constant *C = nullptr;
9883 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9884 C = CI->getConstantIntValue();
9885 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9886 C = CF->getConstantFPValue();
9887
9888 assert(C && "Invalid constant type");
9889
9890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9891 SDValue CP =
9892 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9893 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9894
9895 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9896 SDValue Ops[] = {DAG.getEntryNode(), CP};
9897 MachinePointerInfo MPI =
9898 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9899 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9900 MPI, Alignment, MachineMemOperand::MOLoad);
9901 }
9902 }
9903
9904 // Handle AVX2 in-register broadcasts.
9905 if (!IsLoad && Subtarget.hasInt256() &&
9906 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9907 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9908
9909 // The scalar source must be a normal load.
9910 if (!IsLoad)
9911 return SDValue();
9912
9913 // Make sure the non-chain result is only used by this build vector.
9914 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9915 return SDValue();
9916
9917 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9918 (Subtarget.hasVLX() && ScalarSize == 64)) {
9919 auto *LN = cast<LoadSDNode>(Ld);
9920 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9921 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9922 SDValue BCast =
9923 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9924 LN->getMemoryVT(), LN->getMemOperand());
9925 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9926 return BCast;
9927 }
9928
9929 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9930 // match double, since there is no vbroadcastsd xmm instruction.
9931 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9932 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9933 auto *LN = cast<LoadSDNode>(Ld);
9934 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9935 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9936 SDValue BCast =
9937 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9938 LN->getMemoryVT(), LN->getMemOperand());
9939 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9940 return BCast;
9941 }
9942
9943 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9944 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9945
9946 // Unsupported broadcast.
9947 return SDValue();
9948}
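
The broadcast lowering above is driven by getRepeatedSequence: a build_vector is broadcastable when its operands repeat with some period SeqLen that divides the element count. A minimal model of that check on plain integers (undef handling omitted for brevity; not the LLVM API):

#include <cassert>
#include <vector>

static bool isRepeatedSequence(const std::vector<int> &Elts, unsigned SeqLen) {
  if (SeqLen == 0 || Elts.size() % SeqLen != 0)
    return false;
  for (size_t i = SeqLen; i < Elts.size(); ++i)
    if (Elts[i] != Elts[i - SeqLen]) // every element must match the one
      return false;                  // SeqLen positions earlier
  return true;
}

int main() {
  assert(isRepeatedSequence({0, 1, 0, 1, 0, 1, 0, 1}, 2)); // <0,1,...> repeats
  assert(!isRepeatedSequence({0, 1, 2, 1}, 2));            // no period of 2
  return 0;
}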
9949
9950/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9951/// underlying vector and index.
9952///
9953/// Modifies \p ExtractedFromVec to the real vector and returns the real
9954/// index.
9955static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9956 SDValue ExtIdx) {
9957 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9958 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9959 return Idx;
9960
9961 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9962 // lowered this:
9963 // (extract_vector_elt (v8f32 %1), Constant<6>)
9964 // to:
9965 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9966 // (extract_subvector (v8f32 %0), Constant<4>),
9967 // undef)
9968 // Constant<0>)
9969 // In this case the vector is the extract_subvector expression and the index
9970 // is 2, as specified by the shuffle.
9971 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9972 SDValue ShuffleVec = SVOp->getOperand(0);
9973 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9974 assert(ShuffleVecVT.getVectorElementType() ==
9975        ExtractedFromVec.getSimpleValueType().getVectorElementType());
9976
9977 int ShuffleIdx = SVOp->getMaskElt(Idx);
9978 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9979 ExtractedFromVec = ShuffleVec;
9980 return ShuffleIdx;
9981 }
9982 return Idx;
9983}
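
The remapping above replaces "extract element Idx of a shuffle" with "extract element Mask[Idx] of the shuffle's first operand" whenever the mask entry stays within the first operand. A simplified sketch of the index step only (the real code also forwards undef mask entries and otherwise keeps the original index; hypothetical helper, not LLVM code):

#include <cassert>
#include <vector>

// Returns the index to extract from the shuffle's first operand, or the
// original index if the mask entry points outside it.
static int remapExtractIndex(const std::vector<int> &Mask, int Idx) {
  int M = Mask[Idx];
  if (M >= 0 && M < (int)Mask.size())
    return M; // read straight from the shuffle's first input
  return Idx;
}

int main() {
  // shuffle<2,u,u,u>: extracting lane 0 really reads lane 2 of the input.
  assert(remapExtractIndex({2, -1, -1, -1}, 0) == 2);
  return 0;
}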
9984
9985static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9986 MVT VT = Op.getSimpleValueType();
9987
9988 // Skip if insert_vec_elt is not supported.
9989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9990 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9991 return SDValue();
9992
9993 SDLoc DL(Op);
9994 unsigned NumElems = Op.getNumOperands();
9995
9996 SDValue VecIn1;
9997 SDValue VecIn2;
9998 SmallVector<unsigned, 4> InsertIndices;
9999 SmallVector<int, 8> Mask(NumElems, -1);
10000
10001 for (unsigned i = 0; i != NumElems; ++i) {
10002 unsigned Opc = Op.getOperand(i).getOpcode();
10003
10004 if (Opc == ISD::UNDEF)
10005 continue;
10006
10007 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10008 // Quit if more than 1 elements need inserting.
10009 if (InsertIndices.size() > 1)
10010 return SDValue();
10011
10012 InsertIndices.push_back(i);
10013 continue;
10014 }
10015
10016 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10017 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10018
10019 // Quit if non-constant index.
10020 if (!isa<ConstantSDNode>(ExtIdx))
10021 return SDValue();
10022 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10023
10024 // Quit if extracted from vector of different type.
10025 if (ExtractedFromVec.getValueType() != VT)
10026 return SDValue();
10027
10028 if (!VecIn1.getNode())
10029 VecIn1 = ExtractedFromVec;
10030 else if (VecIn1 != ExtractedFromVec) {
10031 if (!VecIn2.getNode())
10032 VecIn2 = ExtractedFromVec;
10033 else if (VecIn2 != ExtractedFromVec)
10034 // Quit if more than 2 vectors to shuffle
10035 return SDValue();
10036 }
10037
10038 if (ExtractedFromVec == VecIn1)
10039 Mask[i] = Idx;
10040 else if (ExtractedFromVec == VecIn2)
10041 Mask[i] = Idx + NumElems;
10042 }
10043
10044 if (!VecIn1.getNode())
10045 return SDValue();
10046
10047 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10048 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10049
10050 for (unsigned Idx : InsertIndices)
10051 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10052 DAG.getIntPtrConstant(Idx, DL));
10053
10054 return NV;
10055}
10056
10057// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10058static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10059 const X86Subtarget &Subtarget) {
10060 MVT VT = Op.getSimpleValueType();
10061 MVT IVT = VT.changeVectorElementTypeToInteger();
10062 SmallVector<SDValue, 16> NewOps;
10063 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10064 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10065 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10066 return DAG.getBitcast(VT, Res);
10067}
10068
10069// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10070static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10071 const X86Subtarget &Subtarget) {
10072
10073 MVT VT = Op.getSimpleValueType();
10074 assert((VT.getVectorElementType() == MVT::i1) &&
10075        "Unexpected type in LowerBUILD_VECTORvXi1!");
10076
10077 SDLoc dl(Op);
10078 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10079 ISD::isBuildVectorAllOnes(Op.getNode()))
10080 return Op;
10081
10082 uint64_t Immediate = 0;
10083 SmallVector<unsigned, 16> NonConstIdx;
10084 bool IsSplat = true;
10085 bool HasConstElts = false;
10086 int SplatIdx = -1;
10087 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10088 SDValue In = Op.getOperand(idx);
10089 if (In.isUndef())
10090 continue;
10091 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10092 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10093 HasConstElts = true;
10094 } else {
10095 NonConstIdx.push_back(idx);
10096 }
10097 if (SplatIdx < 0)
10098 SplatIdx = idx;
10099 else if (In != Op.getOperand(SplatIdx))
10100 IsSplat = false;
10101 }
10102
10103 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10104 if (IsSplat) {
10105 // The build_vector allows the scalar element to be larger than the vector
10106 // element type. We need to mask it to use as a condition unless we know
10107 // the upper bits are zero.
10108 // FIXME: Use computeKnownBits instead of checking specific opcode?
10109 SDValue Cond = Op.getOperand(SplatIdx);
10110 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10111 if (Cond.getOpcode() != ISD::SETCC)
10112 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10113 DAG.getConstant(1, dl, MVT::i8));
10114
10115 // Perform the select in the scalar domain so we can use cmov.
10116 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10117 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10118 DAG.getAllOnesConstant(dl, MVT::i32),
10119 DAG.getConstant(0, dl, MVT::i32));
10120 Select = DAG.getBitcast(MVT::v32i1, Select);
10121 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10122 } else {
10123 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10124 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10125 DAG.getAllOnesConstant(dl, ImmVT),
10126 DAG.getConstant(0, dl, ImmVT));
10127 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10128 Select = DAG.getBitcast(VecVT, Select);
10129 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10130 DAG.getIntPtrConstant(0, dl));
10131 }
10132 }
10133
10134 // insert elements one by one
10135 SDValue DstVec;
10136 if (HasConstElts) {
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10139 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10140 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10141 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10142 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10146 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10147 DstVec = DAG.getBitcast(VecVT, Imm);
10148 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10149 DAG.getIntPtrConstant(0, dl));
10150 }
10151 } else
10152 DstVec = DAG.getUNDEF(VT);
10153
10154 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10155 unsigned InsertIdx = NonConstIdx[i];
10156 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10157 Op.getOperand(InsertIdx),
10158 DAG.getIntPtrConstant(InsertIdx, dl));
10159 }
10160 return DstVec;
10161}
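
The constant-element path above packs every known i1 element into an integer immediate, one bit per lane index, and only the remaining non-constant lanes are inserted individually. A standalone illustration of the packing step (hypothetical helper; -1 models an undef lane):

#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t packMaskImmediate(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] >= 0)                        // skip undef lanes
      Immediate |= uint64_t(Elts[Idx] & 1) << Idx;
  return Immediate;
}

int main() {
  // <1, 0, undef, 1> packs to 0b1001.
  assert(packMaskImmediate({1, 0, -1, 1}) == 0x9);
  return 0;
}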
10162
10163 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10164 switch (Opcode) {
10165 case X86ISD::PACKSS:
10166 case X86ISD::PACKUS:
10167 case X86ISD::FHADD:
10168 case X86ISD::FHSUB:
10169 case X86ISD::HADD:
10170 case X86ISD::HSUB:
10171 return true;
10172 }
10173 return false;
10174}
10175
10176/// This is a helper function of LowerToHorizontalOp().
10177/// This function checks that the build_vector \p N in input implements a
10178/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10179/// may not match the layout of an x86 256-bit horizontal instruction.
10180/// In other words, if this returns true, then some extraction/insertion will
10181/// be required to produce a valid horizontal instruction.
10182///
10183/// Parameter \p Opcode defines the kind of horizontal operation to match.
10184/// For example, if \p Opcode is equal to ISD::ADD, then this function
10185/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10186/// is equal to ISD::SUB, then this function checks if this is a horizontal
10187/// arithmetic sub.
10188///
10189/// This function only analyzes elements of \p N whose indices are
10190/// in range [BaseIdx, LastIdx).
10191///
10192/// TODO: This function was originally used to match both real and fake partial
10193/// horizontal operations, but the index-matching logic is incorrect for that.
10194/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10195/// code because it is only used for partial h-op matching now?
10196static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10197 SelectionDAG &DAG,
10198 unsigned BaseIdx, unsigned LastIdx,
10199 SDValue &V0, SDValue &V1) {
10200 EVT VT = N->getValueType(0);
10201 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10202 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10203 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10204        "Invalid Vector in input!");
10205
10206 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10207 bool CanFold = true;
10208 unsigned ExpectedVExtractIdx = BaseIdx;
10209 unsigned NumElts = LastIdx - BaseIdx;
10210 V0 = DAG.getUNDEF(VT);
10211 V1 = DAG.getUNDEF(VT);
10212
10213 // Check if N implements a horizontal binop.
10214 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10215 SDValue Op = N->getOperand(i + BaseIdx);
10216
10217 // Skip UNDEFs.
10218 if (Op->isUndef()) {
10219 // Update the expected vector extract index.
10220 if (i * 2 == NumElts)
10221 ExpectedVExtractIdx = BaseIdx;
10222 ExpectedVExtractIdx += 2;
10223 continue;
10224 }
10225
10226 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10227
10228 if (!CanFold)
10229 break;
10230
10231 SDValue Op0 = Op.getOperand(0);
10232 SDValue Op1 = Op.getOperand(1);
10233
10234 // Try to match the following pattern:
10235 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10236 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10237 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10238 Op0.getOperand(0) == Op1.getOperand(0) &&
10239 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10240 isa<ConstantSDNode>(Op1.getOperand(1)));
10241 if (!CanFold)
10242 break;
10243
10244 unsigned I0 = Op0.getConstantOperandVal(1);
10245 unsigned I1 = Op1.getConstantOperandVal(1);
10246
10247 if (i * 2 < NumElts) {
10248 if (V0.isUndef()) {
10249 V0 = Op0.getOperand(0);
10250 if (V0.getValueType() != VT)
10251 return false;
10252 }
10253 } else {
10254 if (V1.isUndef()) {
10255 V1 = Op0.getOperand(0);
10256 if (V1.getValueType() != VT)
10257 return false;
10258 }
10259 if (i * 2 == NumElts)
10260 ExpectedVExtractIdx = BaseIdx;
10261 }
10262
10263 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10264 if (I0 == ExpectedVExtractIdx)
10265 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10266 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10267 // Try to match the following dag sequence:
10268 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10269 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10270 } else
10271 CanFold = false;
10272
10273 ExpectedVExtractIdx += 2;
10274 }
10275
10276 return CanFold;
10277}
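
The core pattern tested in the loop above is that result element i is built from source lanes (2*i, 2*i + 1), with commutable opcodes allowed to present the pair swapped. A minimal model of that per-element check (hypothetical helper, indices only):

#include <cassert>

// I0/I1 are the two extract indices feeding one binop; Expected is the
// extract index the next horizontal pair must start at.
static bool matchesHorizontalPair(unsigned I0, unsigned I1, unsigned Expected,
                                  bool IsCommutable) {
  if (I0 == Expected)
    return I1 == I0 + 1;           // (A[i], A[i+1])
  if (IsCommutable && I1 == Expected)
    return I0 == I1 + 1;           // (A[i+1], A[i]) for ADD/FADD
  return false;
}

int main() {
  assert(matchesHorizontalPair(0, 1, 0, /*IsCommutable=*/false));
  assert(matchesHorizontalPair(3, 2, 2, /*IsCommutable=*/true));
  assert(!matchesHorizontalPair(1, 0, 0, /*IsCommutable=*/false));
  return 0;
}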
10278
10279/// Emit a sequence of two 128-bit horizontal add/sub followed by
10280/// a concat_vector.
10281///
10282/// This is a helper function of LowerToHorizontalOp().
10283/// This function expects two 256-bit vectors called V0 and V1.
10284/// At first, each vector is split into two separate 128-bit vectors.
10285/// Then, the resulting 128-bit vectors are used to implement two
10286/// horizontal binary operations.
10287///
10288/// The kind of horizontal binary operation is defined by \p X86Opcode.
10289///
10290 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to
10291 /// the two new horizontal binops.
10292/// When Mode is set, the first horizontal binop dag node would take as input
10293/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10294/// horizontal binop dag node would take as input the lower 128-bit of V1
10295/// and the upper 128-bit of V1.
10296/// Example:
10297/// HADD V0_LO, V0_HI
10298/// HADD V1_LO, V1_HI
10299///
10300/// Otherwise, the first horizontal binop dag node takes as input the lower
10301/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10302/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10303/// Example:
10304/// HADD V0_LO, V1_LO
10305/// HADD V0_HI, V1_HI
10306///
10307/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10308/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10309/// the upper 128-bits of the result.
10310static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10311 const SDLoc &DL, SelectionDAG &DAG,
10312 unsigned X86Opcode, bool Mode,
10313 bool isUndefLO, bool isUndefHI) {
10314 MVT VT = V0.getSimpleValueType();
10315 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10316        "Invalid nodes in input!");
10317
10318 unsigned NumElts = VT.getVectorNumElements();
10319 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10320 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10321 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10322 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10323 MVT NewVT = V0_LO.getSimpleValueType();
10324
10325 SDValue LO = DAG.getUNDEF(NewVT);
10326 SDValue HI = DAG.getUNDEF(NewVT);
10327
10328 if (Mode) {
10329 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10330 if (!isUndefLO && !V0->isUndef())
10331 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10332 if (!isUndefHI && !V1->isUndef())
10333 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10334 } else {
10335 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10336 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10337 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10338
10339 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10340 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10341 }
10342
10343 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10344}
10345
10346/// Returns true iff \p BV builds a vector with the result equivalent to
10347/// the result of ADDSUB/SUBADD operation.
10348/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10349/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10350/// \p Opnd0 and \p Opnd1.
10351static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10352 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10353 SDValue &Opnd0, SDValue &Opnd1,
10354 unsigned &NumExtracts,
10355 bool &IsSubAdd) {
10356
10357 MVT VT = BV->getSimpleValueType(0);
10358 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10359 return false;
10360
10361 unsigned NumElts = VT.getVectorNumElements();
10362 SDValue InVec0 = DAG.getUNDEF(VT);
10363 SDValue InVec1 = DAG.getUNDEF(VT);
10364
10365 NumExtracts = 0;
10366
10367 // Odd-numbered elements in the input build vector are obtained from
10368 // adding/subtracting two integer/float elements.
10369 // Even-numbered elements in the input build vector are obtained from
10370 // subtracting/adding two integer/float elements.
10371 unsigned Opc[2] = {0, 0};
10372 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10373 SDValue Op = BV->getOperand(i);
10374
10375 // Skip 'undef' values.
10376 unsigned Opcode = Op.getOpcode();
10377 if (Opcode == ISD::UNDEF)
10378 continue;
10379
10380 // Early exit if we found an unexpected opcode.
10381 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10382 return false;
10383
10384 SDValue Op0 = Op.getOperand(0);
10385 SDValue Op1 = Op.getOperand(1);
10386
10387 // Try to match the following pattern:
10388 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10389 // Early exit if we cannot match that sequence.
10390 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10391 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10392 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10393 Op0.getOperand(1) != Op1.getOperand(1))
10394 return false;
10395
10396 unsigned I0 = Op0.getConstantOperandVal(1);
10397 if (I0 != i)
10398 return false;
10399
10400 // We found a valid add/sub node; make sure it's the same opcode as previous
10401 // elements of this parity.
10402 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10403 return false;
10404 Opc[i % 2] = Opcode;
10405
10406 // Update InVec0 and InVec1.
10407 if (InVec0.isUndef()) {
10408 InVec0 = Op0.getOperand(0);
10409 if (InVec0.getSimpleValueType() != VT)
10410 return false;
10411 }
10412 if (InVec1.isUndef()) {
10413 InVec1 = Op1.getOperand(0);
10414 if (InVec1.getSimpleValueType() != VT)
10415 return false;
10416 }
10417
10418 // Make sure that the operands of each add/sub node always
10419 // come from the same pair of vectors.
10420 if (InVec0 != Op0.getOperand(0)) {
10421 if (Opcode == ISD::FSUB)
10422 return false;
10423
10424 // FADD is commutable. Try to commute the operands
10425 // and then test again.
10426 std::swap(Op0, Op1);
10427 if (InVec0 != Op0.getOperand(0))
10428 return false;
10429 }
10430
10431 if (InVec1 != Op1.getOperand(0))
10432 return false;
10433
10434 // Increment the number of extractions done.
10435 ++NumExtracts;
10436 }
10437
10438 // Ensure we have found an opcode for both parities and that they are
10439 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10440 // inputs are undef.
10441 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10442 InVec0.isUndef() || InVec1.isUndef())
10443 return false;
10444
10445 IsSubAdd = Opc[0] == ISD::FADD;
10446
10447 Opnd0 = InVec0;
10448 Opnd1 = InVec1;
10449 return true;
10450}
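
To make the parity bookkeeping above concrete, here is a minimal standalone sketch (plain C++, independent of the SelectionDAG API; the names LaneOp and matchAddSubParity are invented for illustration): even lanes must all share one of FADD/FSUB, odd lanes the other, and an FADD in the even lanes means the pattern is SUBADD rather than ADDSUB.

#include <array>
#include <cstdio>
#include <optional>
#include <vector>

enum class LaneOp { Undef, FAdd, FSub };

// Returns true and sets IsSubAdd if the per-lane opcodes match the
// ADDSUB/SUBADD shape: all even lanes share one opcode, all odd lanes the
// other, and undef lanes are skipped (mirroring the matcher above).
static bool matchAddSubParity(const std::vector<LaneOp> &Lanes, bool &IsSubAdd) {
  std::array<std::optional<LaneOp>, 2> Opc; // one slot per parity
  for (size_t i = 0; i != Lanes.size(); ++i) {
    if (Lanes[i] == LaneOp::Undef)
      continue;
    if (Opc[i % 2] && *Opc[i % 2] != Lanes[i])
      return false; // opcode mismatch within this parity
    Opc[i % 2] = Lanes[i];
  }
  if (!Opc[0] || !Opc[1] || *Opc[0] == *Opc[1])
    return false; // need both parities, and they must differ
  IsSubAdd = (*Opc[0] == LaneOp::FAdd); // FADD in even lanes means SUBADD
  return true;
}

int main() {
  bool IsSubAdd = false;
  std::vector<LaneOp> V = {LaneOp::FSub, LaneOp::FAdd, LaneOp::FSub, LaneOp::FAdd};
  std::printf("matched=%d IsSubAdd=%d\n", matchAddSubParity(V, IsSubAdd), IsSubAdd);
  return 0;
}
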
10451
10452/// Returns true if it is possible to fold MUL and an idiom that has already been
10453/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10454/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10455/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10456///
10457/// Prior to calling this function it should be known that there is some
10458/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10459/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10460/// before replacement of such SDNode with ADDSUB operation. Thus the number
10461/// of \p Opnd0 uses is expected to be equal to 2.
10462/// For example, this function may be called for the following IR:
10463/// %AB = fmul fast <2 x double> %A, %B
10464/// %Sub = fsub fast <2 x double> %AB, %C
10465/// %Add = fadd fast <2 x double> %AB, %C
10466/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10467/// <2 x i32> <i32 0, i32 3>
10468/// There is a def for %Addsub here, which potentially can be replaced by
10469/// X86ISD::ADDSUB operation:
10470/// %Addsub = X86ISD::ADDSUB %AB, %C
10471/// and such ADDSUB can further be replaced with FMADDSUB:
10472/// %Addsub = FMADDSUB %A, %B, %C.
10473///
10474/// The main reason why this method is called before the replacement of the
10475/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
10476/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10477/// FMADDSUB is.
10478static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10479 SelectionDAG &DAG,
10480 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10481 unsigned ExpectedUses) {
10482 if (Opnd0.getOpcode() != ISD::FMUL ||
10483 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10484 return false;
10485
10486 // FIXME: These checks must match the similar ones in
10487 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10488 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10489 // or MUL + ADDSUB to FMADDSUB.
10490 const TargetOptions &Options = DAG.getTarget().Options;
10491 bool AllowFusion =
10492 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10493 if (!AllowFusion)
10494 return false;
10495
10496 Opnd2 = Opnd1;
10497 Opnd1 = Opnd0.getOperand(1);
10498 Opnd0 = Opnd0.getOperand(0);
10499
10500 return true;
10501}
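
A hedged sketch of the operand rewrite this helper performs once the multiply feeding the idiom is accepted: ADDSUB(FMUL(A, B), C) is re-expressed as FMADDSUB(A, B, C). The toy Expr type and toFMAddSubOperands below are invented for illustration and stand in for SDValue operands.

#include <cassert>
#include <memory>
#include <string>

// Toy expression node standing in for an SDValue; illustration only.
struct Expr {
  std::string Op; // "fmul" or "leaf"
  std::shared_ptr<Expr> L, R;
};

// Mirrors the rewrite above: given Opnd0 == fmul(A, B) and Opnd1 == C,
// rewrite the triple to (A, B, C) so the caller can emit FMADDSUB/FMSUBADD.
static bool toFMAddSubOperands(std::shared_ptr<Expr> &Opnd0,
                               std::shared_ptr<Expr> &Opnd1,
                               std::shared_ptr<Expr> &Opnd2) {
  if (!Opnd0 || Opnd0->Op != "fmul")
    return false;
  Opnd2 = Opnd1;    // C becomes the addend
  Opnd1 = Opnd0->R; // B
  Opnd0 = Opnd0->L; // A
  return true;
}

int main() {
  auto A = std::make_shared<Expr>(Expr{"leaf", nullptr, nullptr});
  auto B = std::make_shared<Expr>(Expr{"leaf", nullptr, nullptr});
  auto C = std::make_shared<Expr>(Expr{"leaf", nullptr, nullptr});
  std::shared_ptr<Expr> O0 = std::make_shared<Expr>(Expr{"fmul", A, B});
  std::shared_ptr<Expr> O1 = C, O2;
  bool OK = toFMAddSubOperands(O0, O1, O2);
  assert(OK && O0 == A && O1 == B && O2 == C);
  (void)OK;
  return 0;
}
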
10502
10503/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10504/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10505/// X86ISD::FMSUBADD node.
10506static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10507 const X86Subtarget &Subtarget,
10508 SelectionDAG &DAG) {
10509 SDValue Opnd0, Opnd1;
10510 unsigned NumExtracts;
10511 bool IsSubAdd;
10512 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10513 IsSubAdd))
10514 return SDValue();
10515
10516 MVT VT = BV->getSimpleValueType(0);
10517 SDLoc DL(BV);
10518
10519 // Try to generate X86ISD::FMADDSUB node here.
10520 SDValue Opnd2;
10521 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10522 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10523 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10524 }
10525
10526 // We only support ADDSUB.
10527 if (IsSubAdd)
10528 return SDValue();
10529
10530 // There are no known X86 targets with 512-bit ADDSUB instructions!
10531 // Convert to blend(fsub,fadd).
10532 if (VT.is512BitVector()) {
10533 SmallVector<int> Mask;
10534 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10535 Mask.push_back(I);
10536 Mask.push_back(I + E + 1);
10537 }
10538 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10539 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10540 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10541 }
10542
10543 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10544}
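
As a worked example of the 512-bit blend(fsub,fadd) fallback above (a standalone sketch, not LLVM code; addSubBlendMask is an invented name): for v8f64 the shuffle mask takes even lanes from the FSUB result and odd lanes from the FADD result.

#include <cstdio>
#include <vector>

// Reproduces the mask construction above: even output lanes come from the
// first input (the FSUB node), odd output lanes from the second (the FADD).
static std::vector<int> addSubBlendMask(int NumElts) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; I += 2) {
    Mask.push_back(I);               // lane I of the FSUB result
    Mask.push_back(I + NumElts + 1); // lane I+1 of the FADD result
  }
  return Mask;
}

int main() {
  // For v8f64 this prints: 0 9 2 11 4 13 6 15
  for (int M : addSubBlendMask(8))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
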
10545
10546static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10547 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10548 // Initialize outputs to known values.
10549 MVT VT = BV->getSimpleValueType(0);
10550 HOpcode = ISD::DELETED_NODE;
10551 V0 = DAG.getUNDEF(VT);
10552 V1 = DAG.getUNDEF(VT);
10553
10554 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10555 // half of the result is calculated independently from the 128-bit halves of
10556 // the inputs, so that makes the index-checking logic below more complicated.
10557 unsigned NumElts = VT.getVectorNumElements();
10558 unsigned GenericOpcode = ISD::DELETED_NODE;
10559 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10560 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10561 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10562 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10563 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10564 // Ignore undef elements.
10565 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10566 if (Op.isUndef())
10567 continue;
10568
10569 // If there's an opcode mismatch, we're done.
10570 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10571 return false;
10572
10573 // Initialize horizontal opcode.
10574 if (HOpcode == ISD::DELETED_NODE) {
10575 GenericOpcode = Op.getOpcode();
10576 switch (GenericOpcode) {
10577 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10578 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10579 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10580 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10581 default: return false;
10582 }
10583 }
10584
10585 SDValue Op0 = Op.getOperand(0);
10586 SDValue Op1 = Op.getOperand(1);
10587 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10588 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10589 Op0.getOperand(0) != Op1.getOperand(0) ||
10590 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10591 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10592 return false;
10593
10594 // The source vector is chosen based on which 64-bit half of the
10595 // destination vector is being calculated.
10596 if (j < NumEltsIn64Bits) {
10597 if (V0.isUndef())
10598 V0 = Op0.getOperand(0);
10599 } else {
10600 if (V1.isUndef())
10601 V1 = Op0.getOperand(0);
10602 }
10603
10604 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10605 if (SourceVec != Op0.getOperand(0))
10606 return false;
10607
10608 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10609 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10610 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10611 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10612 (j % NumEltsIn64Bits) * 2;
10613 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10614 continue;
10615
10616 // If this is not a commutative op, this does not match.
10617 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10618 return false;
10619
10620 // Addition is commutative, so try swapping the extract indexes.
10621 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10622 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10623 continue;
10624
10625 // Extract indexes do not match horizontal requirement.
10626 return false;
10627 }
10628 }
10629 // We matched. Opcode and operands are returned by reference as arguments.
10630 return true;
10631}
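
The ExpectedIndex check above is easiest to see with numbers. The following standalone sketch (invented helper, not part of the file) prints, for a 256-bit v8f32, which pair of adjacent extract indices each output element must use and whether it reads from V0 or V1.

#include <cstdio>

// Prints, for every output element (i, j), the pair of extract indices the
// matcher above requires: ExpectedIndex and ExpectedIndex + 1.
int main() {
  const unsigned NumElts = 8;            // e.g. v8f32
  const unsigned Num128BitChunks = 2;    // 256-bit vector
  const unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; // 4
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;       // 2
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = (j < NumEltsIn64Bits) ? "V0" : "V1";
      std::printf("out[%u] = %s[%u] op %s[%u]\n",
                  i * NumEltsIn128Bits + j, Src, ExpectedIndex, Src,
                  ExpectedIndex + 1);
    }
  return 0;
}
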
10632
10633static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10634 SelectionDAG &DAG, unsigned HOpcode,
10635 SDValue V0, SDValue V1) {
10636 // If either input vector is not the same size as the build vector,
10637 // extract/insert the low bits to the correct size.
10638 // This is free (examples: zmm --> xmm, xmm --> ymm).
10639 MVT VT = BV->getSimpleValueType(0);
10640 unsigned Width = VT.getSizeInBits();
10641 if (V0.getValueSizeInBits() > Width)
10642 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10643 else if (V0.getValueSizeInBits() < Width)
10644 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10645
10646 if (V1.getValueSizeInBits() > Width)
10647 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10648 else if (V1.getValueSizeInBits() < Width)
10649 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10650
10651 unsigned NumElts = VT.getVectorNumElements();
10652 APInt DemandedElts = APInt::getAllOnes(NumElts);
10653 for (unsigned i = 0; i != NumElts; ++i)
10654 if (BV->getOperand(i).isUndef())
10655 DemandedElts.clearBit(i);
10656
10657 // If we don't need the upper xmm, then perform as a xmm hop.
10658 unsigned HalfNumElts = NumElts / 2;
10659 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10660 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10661 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10662 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10663 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10664 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10665 }
10666
10667 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10668}
10669
10670/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10671static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10672 const X86Subtarget &Subtarget,
10673 SelectionDAG &DAG) {
10674 // We need at least 2 non-undef elements to make this worthwhile by default.
10675 unsigned NumNonUndefs =
10676 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10677 if (NumNonUndefs < 2)
10678 return SDValue();
10679
10680 // There are 4 sets of horizontal math operations distinguished by type:
10681 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10682 // subtarget feature. Try to match those "native" patterns first.
10683 MVT VT = BV->getSimpleValueType(0);
10684 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10685 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10686 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10687 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10688 unsigned HOpcode;
10689 SDValue V0, V1;
10690 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10691 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10692 }
10693
10694 // Try harder to match 256-bit ops by using extract/concat.
10695 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10696 return SDValue();
10697
10698  // Count the number of UNDEF operands in the input build_vector.
10699 unsigned NumElts = VT.getVectorNumElements();
10700 unsigned Half = NumElts / 2;
10701 unsigned NumUndefsLO = 0;
10702 unsigned NumUndefsHI = 0;
10703 for (unsigned i = 0, e = Half; i != e; ++i)
10704 if (BV->getOperand(i)->isUndef())
10705 NumUndefsLO++;
10706
10707 for (unsigned i = Half, e = NumElts; i != e; ++i)
10708 if (BV->getOperand(i)->isUndef())
10709 NumUndefsHI++;
10710
10711 SDLoc DL(BV);
10712 SDValue InVec0, InVec1;
10713 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10714 SDValue InVec2, InVec3;
10715 unsigned X86Opcode;
10716 bool CanFold = true;
10717
10718 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10719 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10720 InVec3) &&
10721 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10722 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10723 X86Opcode = X86ISD::HADD;
10724 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10725 InVec1) &&
10726 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10727 InVec3) &&
10728 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10729 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10730 X86Opcode = X86ISD::HSUB;
10731 else
10732 CanFold = false;
10733
10734 if (CanFold) {
10735 // Do not try to expand this build_vector into a pair of horizontal
10736 // add/sub if we can emit a pair of scalar add/sub.
10737 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10738 return SDValue();
10739
10740 // Convert this build_vector into a pair of horizontal binops followed by
10741 // a concat vector. We must adjust the outputs from the partial horizontal
10742 // matching calls above to account for undefined vector halves.
10743 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10744 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10745      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10746 bool isUndefLO = NumUndefsLO == Half;
10747 bool isUndefHI = NumUndefsHI == Half;
10748 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10749 isUndefHI);
10750 }
10751 }
10752
10753 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10754 VT == MVT::v16i16) {
10755 unsigned X86Opcode;
10756 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10757 X86Opcode = X86ISD::HADD;
10758 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10759 InVec1))
10760 X86Opcode = X86ISD::HSUB;
10761 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10762 InVec1))
10763 X86Opcode = X86ISD::FHADD;
10764 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10765 InVec1))
10766 X86Opcode = X86ISD::FHSUB;
10767 else
10768 return SDValue();
10769
10770 // Don't try to expand this build_vector into a pair of horizontal add/sub
10771 // if we can simply emit a pair of scalar add/sub.
10772 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10773 return SDValue();
10774
10775 // Convert this build_vector into two horizontal add/sub followed by
10776 // a concat vector.
10777 bool isUndefLO = NumUndefsLO == Half;
10778 bool isUndefHI = NumUndefsHI == Half;
10779 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10780 isUndefLO, isUndefHI);
10781 }
10782
10783 return SDValue();
10784}
10785
10786static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10787 SelectionDAG &DAG);
10788
10789/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10790/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10791/// just apply the bit operation to the vectors.
10792/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10793/// from this, but enough scalar bit operations are created from the later
10794/// legalization + scalarization stages to need basic support.
10795static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10796 const X86Subtarget &Subtarget,
10797 SelectionDAG &DAG) {
10798 SDLoc DL(Op);
10799 MVT VT = Op->getSimpleValueType(0);
10800 unsigned NumElems = VT.getVectorNumElements();
10801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10802
10803 // Check that all elements have the same opcode.
10804 // TODO: Should we allow UNDEFS and if so how many?
10805 unsigned Opcode = Op->getOperand(0).getOpcode();
10806 for (unsigned i = 1; i < NumElems; ++i)
10807 if (Opcode != Op->getOperand(i).getOpcode())
10808 return SDValue();
10809
10810 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10811 bool IsShift = false;
10812 switch (Opcode) {
10813 default:
10814 return SDValue();
10815 case ISD::SHL:
10816 case ISD::SRL:
10817 case ISD::SRA:
10818 IsShift = true;
10819 break;
10820 case ISD::AND:
10821 case ISD::XOR:
10822 case ISD::OR:
10823 // Don't do this if the buildvector is a splat - we'd replace one
10824 // constant with an entire vector.
10825 if (Op->getSplatValue())
10826 return SDValue();
10827 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10828 return SDValue();
10829 break;
10830 }
10831
10832 SmallVector<SDValue, 4> LHSElts, RHSElts;
10833 for (SDValue Elt : Op->ops()) {
10834 SDValue LHS = Elt.getOperand(0);
10835 SDValue RHS = Elt.getOperand(1);
10836
10837 // We expect the canonicalized RHS operand to be the constant.
10838 if (!isa<ConstantSDNode>(RHS))
10839 return SDValue();
10840
10841 // Extend shift amounts.
10842 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10843 if (!IsShift)
10844 return SDValue();
10845 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10846 }
10847
10848 LHSElts.push_back(LHS);
10849 RHSElts.push_back(RHS);
10850 }
10851
10852 // Limit to shifts by uniform immediates.
10853 // TODO: Only accept vXi8/vXi64 special cases?
10854 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10855 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10856 return SDValue();
10857
10858 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10859 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10860 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10861
10862 if (!IsShift)
10863 return Res;
10864
10865 // Immediately lower the shift to ensure the constant build vector doesn't
10866 // get converted to a constant pool before the shift is lowered.
10867 return LowerShift(Res, Subtarget, DAG);
10868}
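
A minimal scalar model of the rewrite above, assuming the bit operation is AND (buildVectorAnd is an invented name; std::vector stands in for the build_vector operands): the per-lane (x_i & c_i) values are recomputed as a single vector AND of two build_vectors.

#include <cstdio>
#include <vector>

// Element-wise model of lowering build_vector(x0&c0, x1&c1, ...) into
// (build_vector x0..) & (build_vector c0..): one vector AND replaces N
// scalar ANDs.
static std::vector<unsigned> buildVectorAnd(const std::vector<unsigned> &LHS,
                                            const std::vector<unsigned> &RHS) {
  std::vector<unsigned> Res(LHS.size());
  for (size_t i = 0; i != LHS.size(); ++i)
    Res[i] = LHS[i] & RHS[i];
  return Res;
}

int main() {
  std::vector<unsigned> X = {0xF0, 0x0F, 0xFF, 0x00};
  std::vector<unsigned> C = {0x10, 0x0C, 0x81, 0x7F};
  for (unsigned V : buildVectorAnd(X, C))
    std::printf("0x%02X ", V); // prints: 0x10 0x0C 0x81 0x00
  std::printf("\n");
  return 0;
}
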
10869
10870/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10871/// functionality to do this, so it's all zeros, all ones, or some derivation
10872/// that is cheap to calculate.
10873static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10874 const X86Subtarget &Subtarget) {
10875 SDLoc DL(Op);
10876 MVT VT = Op.getSimpleValueType();
10877
10878 // Vectors containing all zeros can be matched by pxor and xorps.
10879 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10880 return Op;
10881
10882 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10883 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10884 // vpcmpeqd on 256-bit vectors.
10885 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10886 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10887 return Op;
10888
10889 return getOnesVector(VT, DAG, DL);
10890 }
10891
10892 return SDValue();
10893}
10894
10895/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10896/// from a vector of source values and a vector of extraction indices.
10897/// The vectors might be manipulated to match the type of the permute op.
10898static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10899 SDLoc &DL, SelectionDAG &DAG,
10900 const X86Subtarget &Subtarget) {
10901 MVT ShuffleVT = VT;
10902 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10903 unsigned NumElts = VT.getVectorNumElements();
10904 unsigned SizeInBits = VT.getSizeInBits();
10905
10906 // Adjust IndicesVec to match VT size.
10907  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10908         "Illegal variable permute mask size");
10909 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10910 // Narrow/widen the indices vector to the correct size.
10911 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10912 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10913 NumElts * VT.getScalarSizeInBits());
10914 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10915 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10916 SDLoc(IndicesVec), SizeInBits);
10917 // Zero-extend the index elements within the vector.
10918 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10919 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10920 IndicesVT, IndicesVec);
10921 }
10922 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10923
10924  // Handle a SrcVec whose size doesn't match the VT size.
10925 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10926 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10927 // Handle larger SrcVec by treating it as a larger permute.
10928 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10929 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10930 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10931 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10932 Subtarget, DAG, SDLoc(IndicesVec));
10933 SDValue NewSrcVec =
10934 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10935 if (NewSrcVec)
10936 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10937 return SDValue();
10938 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10939 // Widen smaller SrcVec to match VT.
10940 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10941 } else
10942 return SDValue();
10943 }
10944
10945 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10946    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10947 EVT SrcVT = Idx.getValueType();
10948 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10949 uint64_t IndexScale = 0;
10950 uint64_t IndexOffset = 0;
10951
10952 // If we're scaling a smaller permute op, then we need to repeat the
10953 // indices, scaling and offsetting them as well.
10954 // e.g. v4i32 -> v16i8 (Scale = 4)
10955 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10956 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10957 for (uint64_t i = 0; i != Scale; ++i) {
10958 IndexScale |= Scale << (i * NumDstBits);
10959 IndexOffset |= i << (i * NumDstBits);
10960 }
10961
10962 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10963 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10964 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10965 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10966 return Idx;
10967 };
10968
10969 unsigned Opcode = 0;
10970 switch (VT.SimpleTy) {
10971 default:
10972 break;
10973 case MVT::v16i8:
10974 if (Subtarget.hasSSSE3())
10975 Opcode = X86ISD::PSHUFB;
10976 break;
10977 case MVT::v8i16:
10978 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10979 Opcode = X86ISD::VPERMV;
10980 else if (Subtarget.hasSSSE3()) {
10981 Opcode = X86ISD::PSHUFB;
10982 ShuffleVT = MVT::v16i8;
10983 }
10984 break;
10985 case MVT::v4f32:
10986 case MVT::v4i32:
10987 if (Subtarget.hasAVX()) {
10988 Opcode = X86ISD::VPERMILPV;
10989 ShuffleVT = MVT::v4f32;
10990 } else if (Subtarget.hasSSSE3()) {
10991 Opcode = X86ISD::PSHUFB;
10992 ShuffleVT = MVT::v16i8;
10993 }
10994 break;
10995 case MVT::v2f64:
10996 case MVT::v2i64:
10997 if (Subtarget.hasAVX()) {
10998 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10999 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11000 Opcode = X86ISD::VPERMILPV;
11001 ShuffleVT = MVT::v2f64;
11002 } else if (Subtarget.hasSSE41()) {
11003 // SSE41 can compare v2i64 - select between indices 0 and 1.
11004 return DAG.getSelectCC(
11005 DL, IndicesVec,
11006 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11007 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11008 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11009 ISD::CondCode::SETEQ);
11010 }
11011 break;
11012 case MVT::v32i8:
11013 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11014 Opcode = X86ISD::VPERMV;
11015 else if (Subtarget.hasXOP()) {
11016 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11017 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11018 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11019 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11020 return DAG.getNode(
11021 ISD::CONCAT_VECTORS, DL, VT,
11022 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11023 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11024 } else if (Subtarget.hasAVX()) {
11025 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11026 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11027 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11028 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11029 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11030 ArrayRef<SDValue> Ops) {
11031 // Permute Lo and Hi and then select based on index range.
11032        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
11033        // care about bit[7] as it's just an index vector.
11034 SDValue Idx = Ops[2];
11035 EVT VT = Idx.getValueType();
11036 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11037 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11038 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11039 ISD::CondCode::SETGT);
11040 };
11041 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11042 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11043 PSHUFBBuilder);
11044 }
11045 break;
11046 case MVT::v16i16:
11047 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11048 Opcode = X86ISD::VPERMV;
11049 else if (Subtarget.hasAVX()) {
11050 // Scale to v32i8 and perform as v32i8.
11051 IndicesVec = ScaleIndices(IndicesVec, 2);
11052 return DAG.getBitcast(
11053 VT, createVariablePermute(
11054 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11055 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11056 }
11057 break;
11058 case MVT::v8f32:
11059 case MVT::v8i32:
11060 if (Subtarget.hasAVX2())
11061 Opcode = X86ISD::VPERMV;
11062 else if (Subtarget.hasAVX()) {
11063 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11064 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11065 {0, 1, 2, 3, 0, 1, 2, 3});
11066 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11067 {4, 5, 6, 7, 4, 5, 6, 7});
11068 if (Subtarget.hasXOP())
11069 return DAG.getBitcast(
11070 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11071 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11072 // Permute Lo and Hi and then select based on index range.
11073 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11074 SDValue Res = DAG.getSelectCC(
11075 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11076 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11077 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11078 ISD::CondCode::SETGT);
11079 return DAG.getBitcast(VT, Res);
11080 }
11081 break;
11082 case MVT::v4i64:
11083 case MVT::v4f64:
11084 if (Subtarget.hasAVX512()) {
11085 if (!Subtarget.hasVLX()) {
11086 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11087 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11088 SDLoc(SrcVec));
11089 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11090 DAG, SDLoc(IndicesVec));
11091 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11092 DAG, Subtarget);
11093 return extract256BitVector(Res, 0, DAG, DL);
11094 }
11095 Opcode = X86ISD::VPERMV;
11096 } else if (Subtarget.hasAVX()) {
11097 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11098 SDValue LoLo =
11099 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11100 SDValue HiHi =
11101 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11102 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11103 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11104 if (Subtarget.hasXOP())
11105 return DAG.getBitcast(
11106 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11107 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11108 // Permute Lo and Hi and then select based on index range.
11109 // This works as VPERMILPD only uses index bit[1] to permute elements.
11110 SDValue Res = DAG.getSelectCC(
11111 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11112 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11113 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11114 ISD::CondCode::SETGT);
11115 return DAG.getBitcast(VT, Res);
11116 }
11117 break;
11118 case MVT::v64i8:
11119 if (Subtarget.hasVBMI())
11120 Opcode = X86ISD::VPERMV;
11121 break;
11122 case MVT::v32i16:
11123 if (Subtarget.hasBWI())
11124 Opcode = X86ISD::VPERMV;
11125 break;
11126 case MVT::v16f32:
11127 case MVT::v16i32:
11128 case MVT::v8f64:
11129 case MVT::v8i64:
11130 if (Subtarget.hasAVX512())
11131 Opcode = X86ISD::VPERMV;
11132 break;
11133 }
11134 if (!Opcode)
11135 return SDValue();
11136
11137  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11138         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11139         "Illegal variable permute shuffle type");
11140
11141 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11142 if (Scale > 1)
11143 IndicesVec = ScaleIndices(IndicesVec, Scale);
11144
11145 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11146 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11147
11148 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11149 SDValue Res = Opcode == X86ISD::VPERMV
11150 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11151 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11152 return DAG.getBitcast(VT, Res);
11153}
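
A worked standalone example of the IndexScale/IndexOffset arithmetic in the ScaleIndices lambda above, for the v4i32 -> v16i8 case (Scale = 4, 8-bit sub-lanes); the program below is illustration only and just reproduces the splat constants and the per-lane multiply/add.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Scale = 4;       // v4i32 permute performed as v16i8
  const unsigned NumDstBits = 8;  // each 32-bit index becomes four byte indices
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits);  // 0x04040404
    IndexOffset |= i << (i * NumDstBits);     // 0x03020100
  }
  std::printf("IndexScale  = 0x%08llx\n", (unsigned long long)IndexScale);
  std::printf("IndexOffset = 0x%08llx\n", (unsigned long long)IndexOffset);

  // Applying Idx * IndexScale + IndexOffset to a 32-bit lane holding index 2
  // yields the byte indices 8, 9, 10, 11 packed into that lane.
  uint32_t Idx = 2;
  uint32_t Packed = Idx * (uint32_t)IndexScale + (uint32_t)IndexOffset;
  std::printf("packed byte indices = 0x%08x\n", Packed); // 0x0b0a0908
  return 0;
}
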
11154
11155// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11156// reasoned to be a permutation of a vector by indices in a non-constant vector.
11157// (build_vector (extract_elt V, (extract_elt I, 0)),
11158// (extract_elt V, (extract_elt I, 1)),
11159// ...
11160// ->
11161// (vpermv I, V)
11162//
11163// TODO: Handle undefs
11164// TODO: Utilize pshufb and zero mask blending to support more efficient
11165// construction of vectors with constant-0 elements.
11166static SDValue
11167LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11168 const X86Subtarget &Subtarget) {
11169 SDValue SrcVec, IndicesVec;
11170 // Check for a match of the permute source vector and permute index elements.
11171 // This is done by checking that the i-th build_vector operand is of the form:
11172 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11173 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11174 SDValue Op = V.getOperand(Idx);
11175 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11176 return SDValue();
11177
11178 // If this is the first extract encountered in V, set the source vector,
11179 // otherwise verify the extract is from the previously defined source
11180 // vector.
11181 if (!SrcVec)
11182 SrcVec = Op.getOperand(0);
11183 else if (SrcVec != Op.getOperand(0))
11184 return SDValue();
11185 SDValue ExtractedIndex = Op->getOperand(1);
11186 // Peek through extends.
11187 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11188 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11189 ExtractedIndex = ExtractedIndex.getOperand(0);
11190 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11191 return SDValue();
11192
11193 // If this is the first extract from the index vector candidate, set the
11194 // indices vector, otherwise verify the extract is from the previously
11195 // defined indices vector.
11196 if (!IndicesVec)
11197 IndicesVec = ExtractedIndex.getOperand(0);
11198 else if (IndicesVec != ExtractedIndex.getOperand(0))
11199 return SDValue();
11200
11201 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11202 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11203 return SDValue();
11204 }
11205
11206 SDLoc DL(V);
11207 MVT VT = V.getSimpleValueType();
11208 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11209}
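
Semantically, the build_vector matched above computes Result[i] = SrcVec[IndicesVec[i]]. A standalone scalar model (variablePermute is an invented name; no attempt is made to model the per-ISA index wrapping):

#include <cassert>
#include <cstdio>
#include <vector>

// Scalar model of a variable permute: each output lane selects an element of
// Src using the corresponding lane of Indices, which is exactly the
// (build_vector (extract_elt Src, (extract_elt Indices, i)), ...) pattern.
static std::vector<int> variablePermute(const std::vector<int> &Src,
                                        const std::vector<unsigned> &Indices) {
  std::vector<int> Res(Indices.size());
  for (size_t i = 0; i != Indices.size(); ++i) {
    assert(Indices[i] < Src.size() && "index out of range");
    Res[i] = Src[Indices[i]];
  }
  return Res;
}

int main() {
  std::vector<int> Src = {10, 11, 12, 13};
  std::vector<unsigned> Idx = {3, 0, 2, 2};
  for (int V : variablePermute(Src, Idx))
    std::printf("%d ", V); // prints: 13 10 12 12
  std::printf("\n");
  return 0;
}
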
11210
11211SDValue
11212X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11213 SDLoc dl(Op);
11214
11215 MVT VT = Op.getSimpleValueType();
11216 MVT EltVT = VT.getVectorElementType();
11217 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11218 unsigned NumElems = Op.getNumOperands();
11219
11220 // Generate vectors for predicate vectors.
11221 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11222 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11223
11224 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11225 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11226
11227 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11228 return VectorConstant;
11229
11230 unsigned EVTBits = EltVT.getSizeInBits();
11231 APInt UndefMask = APInt::getZero(NumElems);
11232 APInt FrozenUndefMask = APInt::getZero(NumElems);
11233 APInt ZeroMask = APInt::getZero(NumElems);
11234 APInt NonZeroMask = APInt::getZero(NumElems);
11235 bool IsAllConstants = true;
11236 SmallSet<SDValue, 8> Values;
11237 unsigned NumConstants = NumElems;
11238 for (unsigned i = 0; i < NumElems; ++i) {
11239 SDValue Elt = Op.getOperand(i);
11240 if (Elt.isUndef()) {
11241 UndefMask.setBit(i);
11242 continue;
11243 }
11244 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11245 FrozenUndefMask.setBit(i);
11246 continue;
11247 }
11248 Values.insert(Elt);
11249 if (!isIntOrFPConstant(Elt)) {
11250 IsAllConstants = false;
11251 NumConstants--;
11252 }
11253 if (X86::isZeroNode(Elt)) {
11254 ZeroMask.setBit(i);
11255 } else {
11256 NonZeroMask.setBit(i);
11257 }
11258 }
11259
11260 // All undef vector. Return an UNDEF.
11261 if (UndefMask.isAllOnes())
11262 return DAG.getUNDEF(VT);
11263
11264 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11265 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11266 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11267 // and blend the FREEZE-UNDEF operands back in.
11268 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11269 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11270 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11271 SmallVector<int, 16> BlendMask(NumElems, -1);
11272 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11273 for (unsigned i = 0; i < NumElems; ++i) {
11274 if (UndefMask[i]) {
11275 BlendMask[i] = -1;
11276 continue;
11277 }
11278 BlendMask[i] = i;
11279 if (!FrozenUndefMask[i])
11280 Elts[i] = Op.getOperand(i);
11281 else
11282 BlendMask[i] += NumElems;
11283 }
11284 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11285 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11286 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11287 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11288 }
11289
11290 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11291
11292 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11293 // lowering to a smaller build vector and padding with undef/zero.
11294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11295 !isFoldableUseOfShuffle(BV)) {
11296 unsigned UpperElems = NumElems / 2;
11297 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11299 if (NumUpperUndefsOrZeros >= UpperElems) {
11300 if (VT.is512BitVector() &&
11301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11302 UpperElems = NumElems - (NumElems / 4);
11303 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11304 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11305 SDValue NewBV =
11306 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11307 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11308 }
11309 }
11310
11311 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11312 return AddSub;
11313 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11314 return HorizontalOp;
11315 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11316 return Broadcast;
11317 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11318 return BitOp;
11319
11320 unsigned NumZero = ZeroMask.popcount();
11321 unsigned NumNonZero = NonZeroMask.popcount();
11322
11323 // If we are inserting one variable into a vector of non-zero constants, try
11324 // to avoid loading each constant element as a scalar. Load the constants as a
11325 // vector and then insert the variable scalar element. If insertion is not
11326 // supported, fall back to a shuffle to get the scalar blended with the
11327 // constants. Insertion into a zero vector is handled as a special-case
11328 // somewhere below here.
11329 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11330 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11331 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11332 // Create an all-constant vector. The variable element in the old
11333 // build vector is replaced by undef in the constant vector. Save the
11334 // variable scalar element and its index for use in the insertelement.
11335 LLVMContext &Context = *DAG.getContext();
11336 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11337 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11338 SDValue VarElt;
11339 SDValue InsIndex;
11340 for (unsigned i = 0; i != NumElems; ++i) {
11341 SDValue Elt = Op.getOperand(i);
11342 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11343 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11344 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11345 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11346 else if (!Elt.isUndef()) {
11347        assert(!VarElt.getNode() && !InsIndex.getNode() &&
11348               "Expected one variable element in this vector");
11349 VarElt = Elt;
11350 InsIndex = DAG.getVectorIdxConstant(i, dl);
11351 }
11352 }
11353 Constant *CV = ConstantVector::get(ConstVecOps);
11354 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11355
11356 // The constants we just created may not be legal (eg, floating point). We
11357 // must lower the vector right here because we can not guarantee that we'll
11358 // legalize it before loading it. This is also why we could not just create
11359 // a new build vector here. If the build vector contains illegal constants,
11360 // it could get split back up into a series of insert elements.
11361 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11362 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11363 MachineFunction &MF = DAG.getMachineFunction();
11364 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11365 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11366 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11367 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11368 if (InsertC < NumEltsInLow128Bits)
11369 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11370
11371 // There's no good way to insert into the high elements of a >128-bit
11372 // vector, so use shuffles to avoid an extract/insert sequence.
11373    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11374    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11375 SmallVector<int, 8> ShuffleMask;
11376 unsigned NumElts = VT.getVectorNumElements();
11377 for (unsigned i = 0; i != NumElts; ++i)
11378 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11379 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11380 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11381 }
11382
11383 // Special case for single non-zero, non-undef, element.
11384 if (NumNonZero == 1) {
11385 unsigned Idx = NonZeroMask.countr_zero();
11386 SDValue Item = Op.getOperand(Idx);
11387
11388 // If we have a constant or non-constant insertion into the low element of
11389 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11390 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11391 // depending on what the source datatype is.
11392 if (Idx == 0) {
11393 if (NumZero == 0)
11394 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11395
11396 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11397 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11398 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11399        assert((VT.is128BitVector() || VT.is256BitVector() ||
11400                VT.is512BitVector()) &&
11401               "Expected an SSE value type!");
11402 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11403 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11404 // zero vector.
11405 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11406 }
11407
11408 // We can't directly insert an i8 or i16 into a vector, so zero extend
11409 // it to i32 first.
11410 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11411 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11412 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11413 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11414 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11415 return DAG.getBitcast(VT, Item);
11416 }
11417 }
11418
11419 // Is it a vector logical left shift?
11420 if (NumElems == 2 && Idx == 1 &&
11421 X86::isZeroNode(Op.getOperand(0)) &&
11422 !X86::isZeroNode(Op.getOperand(1))) {
11423 unsigned NumBits = VT.getSizeInBits();
11424 return getVShift(true, VT,
11425 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11426 VT, Op.getOperand(1)),
11427 NumBits/2, DAG, *this, dl);
11428 }
11429
11430 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11431 return SDValue();
11432
11433 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11434 // is a non-constant being inserted into an element other than the low one,
11435 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11436 // movd/movss) to move this into the low element, then shuffle it into
11437 // place.
11438 if (EVTBits == 32) {
11439 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11440 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11441 }
11442 }
11443
11444 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11445 if (Values.size() == 1) {
11446 if (EVTBits == 32) {
11447 // Instead of a shuffle like this:
11448 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11449 // Check if it's possible to issue this instead.
11450 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
11451 unsigned Idx = NonZeroMask.countr_zero();
11452 SDValue Item = Op.getOperand(Idx);
11453 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11454 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11455 }
11456 return SDValue();
11457 }
11458
11459 // A vector full of immediates; various special cases are already
11460 // handled, so this is best done with a single constant-pool load.
11461 if (IsAllConstants)
11462 return SDValue();
11463
11464 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11465 return V;
11466
11467 // See if we can use a vector load to get all of the elements.
11468 {
11469 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11470 if (SDValue LD =
11471 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11472 return LD;
11473 }
11474
11475 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11476 // build_vector and broadcast it.
11477 // TODO: We could probably generalize this more.
11478 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11479 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11480 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11481 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11482 // Make sure all the even/odd operands match.
11483 for (unsigned i = 2; i != NumElems; ++i)
11484 if (Ops[i % 2] != Op.getOperand(i))
11485 return false;
11486 return true;
11487 };
11488 if (CanSplat(Op, NumElems, Ops)) {
11489 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11490 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11491 // Create a new build vector and cast to v2i64/v2f64.
11492 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11493 DAG.getBuildVector(NarrowVT, dl, Ops));
11494 // Broadcast from v2i64/v2f64 and cast to final VT.
11495 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11496 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11497 NewBV));
11498 }
11499 }
11500
11501 // For AVX-length vectors, build the individual 128-bit pieces and use
11502 // shuffles to put them in place.
11503 if (VT.getSizeInBits() > 128) {
11504 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11505
11506 // Build both the lower and upper subvector.
11507 SDValue Lower =
11508 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11509 SDValue Upper = DAG.getBuildVector(
11510 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11511
11512 // Recreate the wider vector with the lower and upper part.
11513 return concatSubVectors(Lower, Upper, DAG, dl);
11514 }
11515
11516 // Let legalizer expand 2-wide build_vectors.
11517 if (EVTBits == 64) {
11518 if (NumNonZero == 1) {
11519 // One half is zero or undef.
11520 unsigned Idx = NonZeroMask.countr_zero();
11521 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11522 Op.getOperand(Idx));
11523 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11524 }
11525 return SDValue();
11526 }
11527
11528 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11529 if (EVTBits == 8 && NumElems == 16)
11530 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11531 DAG, Subtarget))
11532 return V;
11533
11534 if (EltVT == MVT::i16 && NumElems == 8)
11535 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11536 DAG, Subtarget))
11537 return V;
11538
11539 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11540 if (EVTBits == 32 && NumElems == 4)
11541 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11542 return V;
11543
11544 // If element VT is == 32 bits, turn it into a number of shuffles.
11545 if (NumElems == 4 && NumZero > 0) {
11546 SmallVector<SDValue, 8> Ops(NumElems);
11547 for (unsigned i = 0; i < 4; ++i) {
11548 bool isZero = !NonZeroMask[i];
11549 if (isZero)
11550 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11551 else
11552 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11553 }
11554
11555 for (unsigned i = 0; i < 2; ++i) {
11556 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11557      default: llvm_unreachable("Unexpected NonZero count");
11558 case 0:
11559 Ops[i] = Ops[i*2]; // Must be a zero vector.
11560 break;
11561 case 1:
11562 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11563 break;
11564 case 2:
11565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11566 break;
11567 case 3:
11568 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11569 break;
11570 }
11571 }
11572
11573 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11574 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11575 int MaskVec[] = {
11576 Reverse1 ? 1 : 0,
11577 Reverse1 ? 0 : 1,
11578 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11579 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11580 };
11581 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11582 }
11583
11584  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11585
11586 // Check for a build vector from mostly shuffle plus few inserting.
11587 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11588 return Sh;
11589
11590 // For SSE 4.1, use insertps to put the high elements into the low element.
11591 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11592 SDValue Result;
11593 if (!Op.getOperand(0).isUndef())
11594 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11595 else
11596 Result = DAG.getUNDEF(VT);
11597
11598 for (unsigned i = 1; i < NumElems; ++i) {
11599 if (Op.getOperand(i).isUndef()) continue;
11600 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11601 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11602 }
11603 return Result;
11604 }
11605
11606 // Otherwise, expand into a number of unpckl*, start by extending each of
11607 // our (non-undef) elements to the full vector width with the element in the
11608 // bottom slot of the vector (which generates no code for SSE).
11609 SmallVector<SDValue, 8> Ops(NumElems);
11610 for (unsigned i = 0; i < NumElems; ++i) {
11611 if (!Op.getOperand(i).isUndef())
11612 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11613 else
11614 Ops[i] = DAG.getUNDEF(VT);
11615 }
11616
11617 // Next, we iteratively mix elements, e.g. for v4f32:
11618 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11619 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11620 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11621 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11622 // Generate scaled UNPCKL shuffle mask.
11623 SmallVector<int, 16> Mask;
11624 for(unsigned i = 0; i != Scale; ++i)
11625 Mask.push_back(i);
11626 for (unsigned i = 0; i != Scale; ++i)
11627 Mask.push_back(NumElems+i);
11628 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11629
11630 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11631 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11632 }
11633 return Ops[0];
11634}
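
A standalone sketch of the scaled UNPCKL mask generation in the final expansion above (scaledUnpacklMask is an invented name). For v4f32, Scale = 1 yields <0, 4, u, u> and Scale = 2 yields <0, 1, 4, 5>, matching the unpcklps/unpcklpd steps described in the comment.

#include <cstdio>
#include <vector>

// Mirrors the mask-building loop above: take Scale elements from the first
// operand, then Scale elements from the second, and pad with -1 (undef).
static std::vector<int> scaledUnpacklMask(unsigned NumElems, unsigned Scale) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(i);
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(NumElems + i);
  Mask.resize(NumElems, -1); // remaining lanes are undef
  return Mask;
}

int main() {
  for (unsigned Scale = 1; Scale < 4; Scale *= 2) {
    std::printf("Scale %u:", Scale);
    for (int M : scaledUnpacklMask(4, Scale))
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}
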
11635
11636// 256-bit AVX can use the vinsertf128 instruction
11637// to create 256-bit vectors from two other 128-bit ones.
11638// TODO: Detect subvector broadcast here instead of DAG combine?
11639static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11640 const X86Subtarget &Subtarget) {
11641 SDLoc dl(Op);
11642 MVT ResVT = Op.getSimpleValueType();
11643
11644  assert((ResVT.is256BitVector() ||
11645          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11646
11647 unsigned NumOperands = Op.getNumOperands();
11648 unsigned NumFreezeUndef = 0;
11649 unsigned NumZero = 0;
11650 unsigned NumNonZero = 0;
11651 unsigned NonZeros = 0;
11652 for (unsigned i = 0; i != NumOperands; ++i) {
11653 SDValue SubVec = Op.getOperand(i);
11654 if (SubVec.isUndef())
11655 continue;
11656 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11657 ++NumFreezeUndef;
11658 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11659 ++NumZero;
11660 else {
11661      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11662 NonZeros |= 1 << i;
11663 ++NumNonZero;
11664 }
11665 }
11666
11667 // If we have more than 2 non-zeros, build each half separately.
11668 if (NumNonZero > 2) {
11669 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11670 ArrayRef<SDUse> Ops = Op->ops();
11671 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11672 Ops.slice(0, NumOperands/2));
11673 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11674 Ops.slice(NumOperands/2));
11675 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11676 }
11677
11678 // Otherwise, build it up through insert_subvectors.
11679 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11680 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11681 : DAG.getUNDEF(ResVT));
11682
11683 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11684 unsigned NumSubElems = SubVT.getVectorNumElements();
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 if ((NonZeros & (1 << i)) == 0)
11687 continue;
11688
11689 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11690 Op.getOperand(i),
11691 DAG.getIntPtrConstant(i * NumSubElems, dl));
11692 }
11693
11694 return Vec;
11695}
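// Example of the insert_subvector path above (sketch): lowering
//   concat_vectors(v8i32 X, v8i32 zeroinitializer) -> v16i32
// gives NumNonZero == 1 and NumZero == 1, so Vec starts out as a zero v16i32
// and only X is inserted (at element index 0), i.e. a single INSERT_SUBVECTOR
// rather than one per operand.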
11696
11697// Returns true if the given node is a type promotion (by concatenating i1
11698// zeros) of the result of a node that already zeros all upper bits of
11699// k-register.
11700// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11701static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11702 const X86Subtarget &Subtarget,
11703 SelectionDAG & DAG) {
11704 SDLoc dl(Op);
11705 MVT ResVT = Op.getSimpleValueType();
11706 unsigned NumOperands = Op.getNumOperands();
11707
11708 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11709 "Unexpected number of operands in CONCAT_VECTORS");
11710
11711 uint64_t Zeros = 0;
11712 uint64_t NonZeros = 0;
11713 for (unsigned i = 0; i != NumOperands; ++i) {
11714 SDValue SubVec = Op.getOperand(i);
11715 if (SubVec.isUndef())
11716 continue;
11717 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11718 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11719 Zeros |= (uint64_t)1 << i;
11720 else
11721 NonZeros |= (uint64_t)1 << i;
11722 }
11723
11724 unsigned NumElems = ResVT.getVectorNumElements();
11725
11726 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11727 // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering via
11728 // insert_subvector would give us two kshifts.
11729 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11730 Log2_64(NonZeros) != NumOperands - 1) {
11731 MVT ShiftVT = ResVT;
11732 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11733 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11734 unsigned Idx = Log2_64(NonZeros);
11735 SDValue SubVec = Op.getOperand(Idx);
11736 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11737 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11738 DAG.getUNDEF(ShiftVT), SubVec,
11739 DAG.getIntPtrConstant(0, dl));
11740 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11741 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11742 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11743 DAG.getIntPtrConstant(0, dl));
11744 }
11745
11746 // If there are zero or one non-zeros we can handle this very simply.
11747 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11748 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11749 if (!NonZeros)
11750 return Vec;
11751 unsigned Idx = Log2_64(NonZeros);
11752 SDValue SubVec = Op.getOperand(Idx);
11753 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11754 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11755 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11756 }
11757
11758 if (NumOperands > 2) {
11759 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11760 ArrayRef<SDUse> Ops = Op->ops();
11761 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11762 Ops.slice(0, NumOperands/2));
11763 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11764 Ops.slice(NumOperands/2));
11765 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11766 }
11767
11768 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11769
11770 if (ResVT.getVectorNumElements() >= 16)
11771 return Op; // The operation is legal with KUNPCK
11772
11773 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11774 DAG.getUNDEF(ResVT), Op.getOperand(0),
11775 DAG.getIntPtrConstant(0, dl));
11776 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11777 DAG.getIntPtrConstant(NumElems/2, dl));
11778}
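// Example of the KSHIFTL special case above (sketch): for
//   concat_vectors(v2i1 zero, v2i1 zero, v2i1 X, v2i1 undef) -> v8i1
// Zeros == 0b0011 and NonZeros == 0b0100, so X is inserted into a legal
// k-register type, shifted left by Idx * SubVecNumElts == 4 with KSHIFTL, and
// the result is extracted back to v8i1, i.e. one kshift instead of two.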
11779
11780static SDValue LowerCONCAT_VECTORS(SDValue Op,
11781 const X86Subtarget &Subtarget,
11782 SelectionDAG &DAG) {
11783 MVT VT = Op.getSimpleValueType();
11784 if (VT.getVectorElementType() == MVT::i1)
11785 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11786
11787 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11788 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11789 Op.getNumOperands() == 4)));
11790
11791 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11792 // from two other 128-bit ones.
11793
11794 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11795 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11796}
11797
11798//===----------------------------------------------------------------------===//
11799// Vector shuffle lowering
11800//
11801// This is an experimental code path for lowering vector shuffles on x86. It is
11802// designed to handle arbitrary vector shuffles and blends, gracefully
11803// degrading performance as necessary. It works hard to recognize idiomatic
11804// shuffles and lower them to optimal instruction patterns without leaving
11805// a framework that allows reasonably efficient handling of all vector shuffle
11806// patterns.
11807//===----------------------------------------------------------------------===//
11808
11809/// Tiny helper function to identify a no-op mask.
11810///
11811/// This is a somewhat boring predicate function. It checks whether the mask
11812/// array input, which is assumed to be a single-input shuffle mask of the kind
11813/// used by the X86 shuffle instructions (not a fully general
11814/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11815/// in-place shuffle are 'no-op's.
11816static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11817 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11818 assert(Mask[i] >= -1 && "Out of bound mask element!");
11819 if (Mask[i] >= 0 && Mask[i] != i)
11820 return false;
11821 }
11822 return true;
11823}
11824
11825/// Test whether there are elements crossing LaneSizeInBits lanes in this
11826/// shuffle mask.
11827///
11828/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11829/// and we routinely test for these.
11830static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11831 unsigned ScalarSizeInBits,
11832 ArrayRef<int> Mask) {
11833 assert(LaneSizeInBits && ScalarSizeInBits &&
11834 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11835 "Illegal shuffle lane size");
11836 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11837 int Size = Mask.size();
11838 for (int i = 0; i < Size; ++i)
11839 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11840 return true;
11841 return false;
11842}
11843
11844/// Test whether there are elements crossing 128-bit lanes in this
11845/// shuffle mask.
11846static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11847 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11848}
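// Illustration (sketch): for v8f32 the 128-bit lanes cover elements 0-3 and
// 4-7, so the mask <4, 5, 6, 7, 0, 1, 2, 3> is lane-crossing while
// <1, 0, 3, 2, 5, 4, 7, 6> is not, since every element stays within its lane.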
11849
11850/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11851/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11852/// better support 'repeated mask + lane permute' style shuffles.
11853static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11854 unsigned ScalarSizeInBits,
11855 ArrayRef<int> Mask) {
11856 assert(LaneSizeInBits && ScalarSizeInBits &&
11857 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11858 "Illegal shuffle lane size");
11859 int NumElts = Mask.size();
11860 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11861 int NumLanes = NumElts / NumEltsPerLane;
11862 if (NumLanes > 1) {
11863 for (int i = 0; i != NumLanes; ++i) {
11864 int SrcLane = -1;
11865 for (int j = 0; j != NumEltsPerLane; ++j) {
11866 int M = Mask[(i * NumEltsPerLane) + j];
11867 if (M < 0)
11868 continue;
11869 int Lane = (M % NumElts) / NumEltsPerLane;
11870 if (SrcLane >= 0 && SrcLane != Lane)
11871 return true;
11872 SrcLane = Lane;
11873 }
11874 }
11875 }
11876 return false;
11877}
11878
11879/// Test whether a shuffle mask is equivalent within each sub-lane.
11880///
11881/// This checks a shuffle mask to see if it is performing the same
11882/// lane-relative shuffle in each sub-lane. This trivially implies
11883/// that it is also not lane-crossing. It may however involve a blend from the
11884/// same lane of a second vector.
11885///
11886/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11887/// non-trivial to compute in the face of undef lanes. The representation is
11888/// suitable for use with existing 128-bit shuffles as entries from the second
11889/// vector have been remapped to [LaneSize, 2*LaneSize).
11890static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11891 ArrayRef<int> Mask,
11892 SmallVectorImpl<int> &RepeatedMask) {
11893 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11894 RepeatedMask.assign(LaneSize, -1);
11895 int Size = Mask.size();
11896 for (int i = 0; i < Size; ++i) {
11897 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11898 if (Mask[i] < 0)
11899 continue;
11900 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11901 // This entry crosses lanes, so there is no way to model this shuffle.
11902 return false;
11903
11904 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11905 // Adjust second vector indices to start at LaneSize instead of Size.
11906 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11907 : Mask[i] % LaneSize + LaneSize;
11908 if (RepeatedMask[i % LaneSize] < 0)
11909 // This is the first non-undef entry in this slot of a 128-bit lane.
11910 RepeatedMask[i % LaneSize] = LocalM;
11911 else if (RepeatedMask[i % LaneSize] != LocalM)
11912 // Found a mismatch with the repeated mask.
11913 return false;
11914 }
11915 return true;
11916}
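// Worked example (sketch): for a two-input v8f32 shuffle the mask
//   <0, 8, 2, 10, 4, 12, 6, 14>
// repeats the same lane-relative pattern in both 128-bit lanes, so this
// returns true with RepeatedMask == <0, 4, 2, 6>, the second-input elements
// having been remapped into [LaneSize, 2*LaneSize) as documented above.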
11917
11918/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11919static bool
11920is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11921 SmallVectorImpl<int> &RepeatedMask) {
11922 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11923}
11924
11925static bool
11926is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11927 SmallVector<int, 32> RepeatedMask;
11928 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11929}
11930
11931/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11932static bool
11933is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11934 SmallVectorImpl<int> &RepeatedMask) {
11935 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11936}
11937
11938/// Test whether a target shuffle mask is equivalent within each sub-lane.
11939/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11940static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11941 unsigned EltSizeInBits,
11942 ArrayRef<int> Mask,
11943 SmallVectorImpl<int> &RepeatedMask) {
11944 int LaneSize = LaneSizeInBits / EltSizeInBits;
11945 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11946 int Size = Mask.size();
11947 for (int i = 0; i < Size; ++i) {
11948 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11949 if (Mask[i] == SM_SentinelUndef)
11950 continue;
11951 if (Mask[i] == SM_SentinelZero) {
11952 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11953 return false;
11954 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11955 continue;
11956 }
11957 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11958 // This entry crosses lanes, so there is no way to model this shuffle.
11959 return false;
11960
11961 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11962 // later vector indices to start at multiples of LaneSize instead of Size.
11963 int LaneM = Mask[i] / Size;
11964 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11965 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11966 // This is the first non-undef entry in this slot of a 128-bit lane.
11967 RepeatedMask[i % LaneSize] = LocalM;
11968 else if (RepeatedMask[i % LaneSize] != LocalM)
11969 // Found a mismatch with the repeated mask.
11970 return false;
11971 }
11972 return true;
11973}
11974
11975/// Test whether a target shuffle mask is equivalent within each sub-lane.
11976/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11977static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11978 ArrayRef<int> Mask,
11979 SmallVectorImpl<int> &RepeatedMask) {
11980 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11981 Mask, RepeatedMask);
11982}
11983
11984/// Checks whether the vector elements referenced by two shuffle masks are
11985/// equivalent.
11986static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11987 int Idx, int ExpectedIdx) {
11988 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11989 ExpectedIdx < MaskSize && "Out of range element index");
11990 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11991 return false;
11992
11993 switch (Op.getOpcode()) {
11994 case ISD::BUILD_VECTOR:
11995 // If the values are build vectors, we can look through them to find
11996 // equivalent inputs that make the shuffles equivalent.
11997 // TODO: Handle MaskSize != Op.getNumOperands()?
11998 if (MaskSize == (int)Op.getNumOperands() &&
11999 MaskSize == (int)ExpectedOp.getNumOperands())
12000 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12001 break;
12002 case X86ISD::VBROADCAST:
12003 case X86ISD::VBROADCAST_LOAD:
12004 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12005 return (Op == ExpectedOp &&
12006 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12007 case X86ISD::HADD:
12008 case X86ISD::HSUB:
12009 case X86ISD::FHADD:
12010 case X86ISD::FHSUB:
12011 case X86ISD::PACKSS:
12012 case X86ISD::PACKUS:
12013 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12014 // TODO: Handle MaskSize != NumElts?
12015 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12016 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12017 MVT VT = Op.getSimpleValueType();
12018 int NumElts = VT.getVectorNumElements();
12019 if (MaskSize == NumElts) {
12020 int NumLanes = VT.getSizeInBits() / 128;
12021 int NumEltsPerLane = NumElts / NumLanes;
12022 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12023 bool SameLane =
12024 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12025 bool SameElt =
12026 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12027 return SameLane && SameElt;
12028 }
12029 }
12030 break;
12031 }
12032
12033 return false;
12034}
12035
12036/// Checks whether a shuffle mask is equivalent to an explicit list of
12037/// arguments.
12038///
12039/// This is a fast way to test a shuffle mask against a fixed pattern:
12040///
12041/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
12042///
12043/// It returns true if the mask is exactly as wide as the argument list, and
12044/// each element of the mask is either -1 (signifying undef) or the value given
12045/// in the argument.
12046static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12047 SDValue V1 = SDValue(),
12048 SDValue V2 = SDValue()) {
12049 int Size = Mask.size();
12050 if (Size != (int)ExpectedMask.size())
12051 return false;
12052
12053 for (int i = 0; i < Size; ++i) {
12054 assert(Mask[i] >= -1 && "Out of bound mask element!");
12055 int MaskIdx = Mask[i];
12056 int ExpectedIdx = ExpectedMask[i];
12057 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12058 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12059 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12060 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12061 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12062 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12063 return false;
12064 }
12065 }
12066 return true;
12067}
12068
12069/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12070///
12071/// The masks must be exactly the same width.
12072///
12073/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12074/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12075///
12076/// SM_SentinelZero is accepted as a valid negative index but must match in
12077/// both, or via a known bits test.
12078static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12079 ArrayRef<int> ExpectedMask,
12080 const SelectionDAG &DAG,
12081 SDValue V1 = SDValue(),
12082 SDValue V2 = SDValue()) {
12083 int Size = Mask.size();
12084 if (Size != (int)ExpectedMask.size())
12085 return false;
12086 assert(llvm::all_of(ExpectedMask,
12087 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12088 "Illegal target shuffle mask");
12089
12090 // Check for out-of-range target shuffle mask indices.
12091 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12092 return false;
12093
12094 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12095 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12096 V1 = SDValue();
12097 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12098 V2 = SDValue();
12099
12100 APInt ZeroV1 = APInt::getZero(Size);
12101 APInt ZeroV2 = APInt::getZero(Size);
12102
12103 for (int i = 0; i < Size; ++i) {
12104 int MaskIdx = Mask[i];
12105 int ExpectedIdx = ExpectedMask[i];
12106 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12107 continue;
12108 if (MaskIdx == SM_SentinelZero) {
12109 // If we need this expected index to be a zero element, then update the
12110 // relevant zero mask and perform the known bits at the end to minimize
12111 // repeated computes.
12112 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12113 if (ExpectedV &&
12114 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12115 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12116 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12117 ZeroMask.setBit(BitIdx);
12118 continue;
12119 }
12120 }
12121 if (MaskIdx >= 0) {
12122 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12123 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12124 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12125 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12126 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12127 continue;
12128 }
12129 return false;
12130 }
12131 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12132 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12133}
12134
12135// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12136// instructions.
12137static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12138 const SelectionDAG &DAG) {
12139 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12140 return false;
12141
12142 SmallVector<int, 8> Unpcklwd;
12143 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12144 /* Unary = */ false);
12145 SmallVector<int, 8> Unpckhwd;
12146 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12147 /* Unary = */ false);
12148 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12149 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12150 return IsUnpackwdMask;
12151}
12152
12153static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12154 const SelectionDAG &DAG) {
12155 // Create 128-bit vector type based on mask size.
12156 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12157 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12158
12159 // We can't assume a canonical shuffle mask, so try the commuted version too.
12160 SmallVector<int, 4> CommutedMask(Mask);
12161 ShuffleVectorSDNode::commuteMask(CommutedMask);
12162
12163 // Match any of unary/binary or low/high.
12164 for (unsigned i = 0; i != 4; ++i) {
12165 SmallVector<int, 16> UnpackMask;
12166 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12167 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12168 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12169 return true;
12170 }
12171 return false;
12172}
12173
12174/// Return true if a shuffle mask chooses elements identically in its top and
12175/// bottom halves. For example, any splat mask has the same top and bottom
12176/// halves. If an element is undefined in only one half of the mask, the halves
12177/// are not considered identical.
12178static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12179 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12180 unsigned HalfSize = Mask.size() / 2;
12181 for (unsigned i = 0; i != HalfSize; ++i) {
12182 if (Mask[i] != Mask[i + HalfSize])
12183 return false;
12184 }
12185 return true;
12186}
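// Illustration (sketch): <0, 1, 0, 1> has identical halves, whereas
// <0, -1, 0, 1> does not, because the undef appears in only one half.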
12187
12188/// Get a 4-lane 8-bit shuffle immediate for a mask.
12189///
12190/// This helper function produces an 8-bit shuffle immediate corresponding to
12191/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12192/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12193/// example.
12194///
12195/// NB: We rely heavily on "undef" masks preserving the input lane.
12196static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12197 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12198 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12199 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12200 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12201 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12202
12203 // If the mask only uses one non-undef element, then fully 'splat' it to
12204 // improve later broadcast matching.
12205 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12206 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12207
12208 int FirstElt = Mask[FirstIndex];
12209 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12210 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12211
12212 unsigned Imm = 0;
12213 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12214 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12215 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12216 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12217 return Imm;
12218}
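// Worked example (sketch): for Mask == {1, 0, 3, 2} no element is undef and
// the elements are not all identical, so the immediate computed above is
// (1 << 0) | (0 << 2) | (3 << 4) | (2 << 6) == 0xB1, the familiar
// "swap adjacent elements" PSHUFD/SHUFPS encoding; a splat such as
// {2, 2, 2, 2} takes the early return instead and yields 0xAA. A minimal
// standalone restatement of the plain encoding path (hypothetical helper,
// deliberately omitting the splat special case above):
static unsigned encodeV4ShuffleImmSketch(const int M[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(M[i] < 0 ? i : M[i]) << (2 * i); // undef keeps its own lane
  return Imm; // e.g. {1, 0, 3, 2} -> 0xB1
}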
12219
12220static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12221 SelectionDAG &DAG) {
12222 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12223}
12224
12225 // The shuffle result looks like:
12226 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
12227 // Each element of Zeroable corresponds to a particular element of Mask,
12228 // as described in the computeZeroableShuffleElements function.
12229 //
12230 // The function looks for a sub-mask whose non-zero elements are in
12231 // increasing order. If such a sub-mask exists, the function returns true.
12232static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12233 ArrayRef<int> Mask, const EVT &VectorType,
12234 bool &IsZeroSideLeft) {
12235 int NextElement = -1;
12236 // Check if the Mask's nonzero elements are in increasing order.
12237 for (int i = 0, e = Mask.size(); i < e; i++) {
12238 // Checks if the mask's zeros elements are built from only zeros.
12239 assert(Mask[i] >= -1 && "Out of bound mask element!");
12240 if (Mask[i] < 0)
12241 return false;
12242 if (Zeroable[i])
12243 continue;
12244 // Find the lowest non zero element
12245 if (NextElement < 0) {
12246 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12247 IsZeroSideLeft = NextElement != 0;
12248 }
12249 // Exit if the mask's non zero elements are not in increasing order.
12250 if (NextElement != Mask[i])
12251 return false;
12252 NextElement++;
12253 }
12254 return true;
12255}
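// Example (sketch, assuming non-undef mask values in the zeroable slots): for
// v4f32 sources, a shuffle whose first two output elements are known zero and
// whose last two mask entries are <4, 5> (the low elements of the second
// source, in order) is accepted with IsZeroSideLeft == true, and
// lowerShuffleToEXPAND below then emits a VEXPAND of that source against a
// zero vector.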
12256
12257/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12258static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12259 ArrayRef<int> Mask, SDValue V1,
12260 SDValue V2, const APInt &Zeroable,
12261 const X86Subtarget &Subtarget,
12262 SelectionDAG &DAG) {
12263 int Size = Mask.size();
12264 int LaneSize = 128 / VT.getScalarSizeInBits();
12265 const int NumBytes = VT.getSizeInBits() / 8;
12266 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12267
12268 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12269 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12270 (Subtarget.hasBWI() && VT.is512BitVector()));
12271
12272 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12273 // Sign bit set in i8 mask means zero element.
12274 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12275
12276 SDValue V;
12277 for (int i = 0; i < NumBytes; ++i) {
12278 int M = Mask[i / NumEltBytes];
12279 if (M < 0) {
12280 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12281 continue;
12282 }
12283 if (Zeroable[i / NumEltBytes]) {
12284 PSHUFBMask[i] = ZeroMask;
12285 continue;
12286 }
12287
12288 // We can only use a single input of V1 or V2.
12289 SDValue SrcV = (M >= Size ? V2 : V1);
12290 if (V && V != SrcV)
12291 return SDValue();
12292 V = SrcV;
12293 M %= Size;
12294
12295 // PSHUFB can't cross lanes, ensure this doesn't happen.
12296 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12297 return SDValue();
12298
12299 M = M % LaneSize;
12300 M = M * NumEltBytes + (i % NumEltBytes);
12301 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12302 }
12303 assert(V && "Failed to find a source input")(static_cast <bool> (V && "Failed to find a source input"
) ? void (0) : __assert_fail ("V && \"Failed to find a source input\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 12303, __extension__
__PRETTY_FUNCTION__))
;
12304
12305 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12306 return DAG.getBitcast(
12307 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12308 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12309}
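// Example of the byte-mask construction above (sketch): for a v8i16 shuffle,
// NumEltBytes == 2, so a mask element that selects word W within a 128-bit
// lane produces the byte indices 2*W and 2*W + 1, while a zeroable word
// produces two 0x80 bytes (sign bit set, which zeroes the destination byte).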
12310
12311static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12312 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12313 const SDLoc &dl);
12314
12315 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
12316static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12317 const APInt &Zeroable,
12318 ArrayRef<int> Mask, SDValue &V1,
12319 SDValue &V2, SelectionDAG &DAG,
12320 const X86Subtarget &Subtarget) {
12321 bool IsLeftZeroSide = true;
12322 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12323 IsLeftZeroSide))
12324 return SDValue();
12325 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12326 MVT IntegerType =
12327 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12328 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12329 unsigned NumElts = VT.getVectorNumElements();
12330 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12331 "Unexpected number of vector elements");
12332 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12333 Subtarget, DAG, DL);
12334 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12335 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12336 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12337}
12338
12339static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12340 unsigned &UnpackOpcode, bool IsUnary,
12341 ArrayRef<int> TargetMask, const SDLoc &DL,
12342 SelectionDAG &DAG,
12343 const X86Subtarget &Subtarget) {
12344 int NumElts = VT.getVectorNumElements();
12345
12346 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12347 for (int i = 0; i != NumElts; i += 2) {
12348 int M1 = TargetMask[i + 0];
12349 int M2 = TargetMask[i + 1];
12350 Undef1 &= (SM_SentinelUndef == M1);
12351 Undef2 &= (SM_SentinelUndef == M2);
12352 Zero1 &= isUndefOrZero(M1);
12353 Zero2 &= isUndefOrZero(M2);
12354 }
12355 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12356 "Zeroable shuffle detected");
12357
12358 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12359 SmallVector<int, 64> Unpckl, Unpckh;
12360 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12361 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12362 (IsUnary ? V1 : V2))) {
12363 UnpackOpcode = X86ISD::UNPCKL;
12364 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12365 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12366 return true;
12367 }
12368
12369 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12370 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12371 (IsUnary ? V1 : V2))) {
12372 UnpackOpcode = X86ISD::UNPCKH;
12373 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12374 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12375 return true;
12376 }
12377
12378 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12379 if (IsUnary && (Zero1 || Zero2)) {
12380 // Don't bother if we can blend instead.
12381 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12382 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12383 return false;
12384
12385 bool MatchLo = true, MatchHi = true;
12386 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12387 int M = TargetMask[i];
12388
12389 // Ignore if the input is known to be zero or the index is undef.
12390 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12391 (M == SM_SentinelUndef))
12392 continue;
12393
12394 MatchLo &= (M == Unpckl[i]);
12395 MatchHi &= (M == Unpckh[i]);
12396 }
12397
12398 if (MatchLo || MatchHi) {
12399 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12400 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12401 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12402 return true;
12403 }
12404 }
12405
12406 // If a binary shuffle, commute and try again.
12407 if (!IsUnary) {
12408 ShuffleVectorSDNode::commuteMask(Unpckl);
12409 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12410 UnpackOpcode = X86ISD::UNPCKL;
12411 std::swap(V1, V2);
12412 return true;
12413 }
12414
12415 ShuffleVectorSDNode::commuteMask(Unpckh);
12416 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12417 UnpackOpcode = X86ISD::UNPCKH;
12418 std::swap(V1, V2);
12419 return true;
12420 }
12421 }
12422
12423 return false;
12424}
12425
12426// X86 has dedicated unpack instructions that can handle specific blend
12427// operations: UNPCKH and UNPCKL.
12428static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12429 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12430 SelectionDAG &DAG) {
12431 SmallVector<int, 8> Unpckl;
12432 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12433 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12434 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12435
12436 SmallVector<int, 8> Unpckh;
12437 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12438 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12439 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12440
12441 // Commute and try again.
12442 ShuffleVectorSDNode::commuteMask(Unpckl);
12443 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12444 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12445
12446 ShuffleVectorSDNode::commuteMask(Unpckh);
12447 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12448 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12449
12450 return SDValue();
12451}
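// For reference (sketch): on v4i32 the masks built above are
//   Unpckl == <0, 4, 1, 5> and Unpckh == <2, 6, 3, 7>,
// and the commuted variants (<4, 0, 5, 1> etc.) are matched by swapping the
// V1/V2 operands of the emitted UNPCKL/UNPCKH node.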
12452
12453/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12454/// followed by unpack 256-bit.
12455static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12456 ArrayRef<int> Mask, SDValue V1,
12457 SDValue V2, SelectionDAG &DAG) {
12458 SmallVector<int, 32> Unpckl, Unpckh;
12459 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12460 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12461
12462 unsigned UnpackOpcode;
12463 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12464 UnpackOpcode = X86ISD::UNPCKL;
12465 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12466 UnpackOpcode = X86ISD::UNPCKH;
12467 else
12468 return SDValue();
12469
12470 // This is a "natural" unpack operation (rather than the 128-bit sectored
12471 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12472 // input in order to use the x86 instruction.
12473 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12474 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12475 V1 = DAG.getBitcast(VT, V1);
12476 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12477}
12478
12479// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12480// source into the lower elements and zeroing the upper elements.
12481static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12482 ArrayRef<int> Mask, const APInt &Zeroable,
12483 const X86Subtarget &Subtarget) {
12484 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12485 return false;
12486
12487 unsigned NumElts = Mask.size();
12488 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12489 unsigned MaxScale = 64 / EltSizeInBits;
12490
12491 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12492 unsigned SrcEltBits = EltSizeInBits * Scale;
12493 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12494 continue;
12495 unsigned NumSrcElts = NumElts / Scale;
12496 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12497 continue;
12498 unsigned UpperElts = NumElts - NumSrcElts;
12499 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12500 continue;
12501 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12502 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12503 DstVT = MVT::getIntegerVT(EltSizeInBits);
12504 if ((NumSrcElts * EltSizeInBits) >= 128) {
12505 // ISD::TRUNCATE
12506 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12507 } else {
12508 // X86ISD::VTRUNC
12509 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12510 }
12511 return true;
12512 }
12513
12514 return false;
12515}
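// Example (sketch, assuming an AVX512VL+BW target so the 16-bit source case
// is allowed): a v16i8 mask whose low eight elements are
// <0, 2, 4, 6, 8, 10, 12, 14> and whose upper eight elements are zeroable
// matches at Scale == 2, giving SrcVT == v8i16 and, since 8 x 8 bits < 128,
// DstVT == v16i8 for the X86ISD::VTRUNC form.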
12516
12517// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12518// element padding to the final DstVT.
12519static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12520 const X86Subtarget &Subtarget,
12521 SelectionDAG &DAG, bool ZeroUppers) {
12522 MVT SrcVT = Src.getSimpleValueType();
12523 MVT DstSVT = DstVT.getScalarType();
12524 unsigned NumDstElts = DstVT.getVectorNumElements();
12525 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12526 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12527
12528 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12529 return SDValue();
12530
12531 // Perform a direct ISD::TRUNCATE if possible.
12532 if (NumSrcElts == NumDstElts)
12533 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12534
12535 if (NumSrcElts > NumDstElts) {
12536 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12537 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12538 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12539 }
12540
12541 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12542 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12543 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12544 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12545 DstVT.getSizeInBits());
12546 }
12547
12548 // Non-VLX targets must truncate from a 512-bit type, so we need to
12549 // widen, truncate and then possibly extract the original subvector.
12550 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12551 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12552 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12553 }
12554
12555 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12556 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12557 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12558 if (DstVT != TruncVT)
12559 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12560 DstVT.getSizeInBits());
12561 return Trunc;
12562}
12563
12564// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12565//
12566// An example is the following:
12567//
12568// t0: ch = EntryToken
12569// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12570// t25: v4i32 = truncate t2
12571// t41: v8i16 = bitcast t25
12572// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12573// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12574// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12575// t18: v2i64 = bitcast t51
12576//
12577 // One can just use a single vpmovdw instruction; without avx512vl we need to
12578 // use the zmm variant and extract the lower subvector, padding with zeroes.
12579// TODO: Merge with lowerShuffleAsVTRUNC.
12580static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12581 SDValue V2, ArrayRef<int> Mask,
12582 const APInt &Zeroable,
12583 const X86Subtarget &Subtarget,
12584 SelectionDAG &DAG) {
12585 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12586 if (!Subtarget.hasAVX512())
12587 return SDValue();
12588
12589 unsigned NumElts = VT.getVectorNumElements();
12590 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12591 unsigned MaxScale = 64 / EltSizeInBits;
12592 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12593 unsigned SrcEltBits = EltSizeInBits * Scale;
12594 unsigned NumSrcElts = NumElts / Scale;
12595 unsigned UpperElts = NumElts - NumSrcElts;
12596 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12597 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12598 continue;
12599
12600 // Attempt to find a matching source truncation, but as a fall back VLX
12601 // cases can use the VPMOV directly.
12602 SDValue Src = peekThroughBitcasts(V1);
12603 if (Src.getOpcode() == ISD::TRUNCATE &&
12604 Src.getScalarValueSizeInBits() == SrcEltBits) {
12605 Src = Src.getOperand(0);
12606 } else if (Subtarget.hasVLX()) {
12607 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12608 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12609 Src = DAG.getBitcast(SrcVT, Src);
12610 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12611 if (Scale == 2 &&
12612 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12613 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12614 return SDValue();
12615 } else
12616 return SDValue();
12617
12618 // VPMOVWB is only available with avx512bw.
12619 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12620 return SDValue();
12621
12622 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12623 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12624 }
12625
12626 return SDValue();
12627}
12628
12629// Attempt to match binary shuffle patterns as a truncate.
12630static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12631 SDValue V2, ArrayRef<int> Mask,
12632 const APInt &Zeroable,
12633 const X86Subtarget &Subtarget,
12634 SelectionDAG &DAG) {
12635 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12636 "Unexpected VTRUNC type");
12637 if (!Subtarget.hasAVX512())
12638 return SDValue();
12639
12640 unsigned NumElts = VT.getVectorNumElements();
12641 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12642 unsigned MaxScale = 64 / EltSizeInBits;
12643 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12644 // TODO: Support non-BWI VPMOVWB truncations?
12645 unsigned SrcEltBits = EltSizeInBits * Scale;
12646 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12647 continue;
12648
12649 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12650 // Bail if the V2 elements are undef.
12651 unsigned NumHalfSrcElts = NumElts / Scale;
12652 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12653 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12654 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12655 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12656 continue;
12657
12658 // The elements beyond the truncation must be undef/zero.
12659 unsigned UpperElts = NumElts - NumSrcElts;
12660 if (UpperElts > 0 &&
12661 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12662 continue;
12663 bool UndefUppers =
12664 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12665
12666 // For offset truncations, ensure that the concat is cheap.
12667 if (Offset) {
12668 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12669 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12670 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12671 return Lo.getOperand(0) == Hi.getOperand(0);
12672 if (ISD::isNormalLoad(Lo.getNode()) &&
12673 ISD::isNormalLoad(Hi.getNode())) {
12674 auto *LDLo = cast<LoadSDNode>(Lo);
12675 auto *LDHi = cast<LoadSDNode>(Hi);
12676 return DAG.areNonVolatileConsecutiveLoads(
12677 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12678 }
12679 return false;
12680 };
12681 if (!IsCheapConcat(V1, V2))
12682 continue;
12683 }
12684
12685 // As we're using both sources, we need to concat them together
12686 // and truncate from the double-sized source.
12687 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12688 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12689
12690 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12691 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12692 Src = DAG.getBitcast(SrcVT, Src);
12693
12694 // Shift the offset'd elements into place for the truncation.
12695 // TODO: Use getTargetVShiftByConstNode.
12696 if (Offset)
12697 Src = DAG.getNode(
12698 X86ISD::VSRLI, DL, SrcVT, Src,
12699 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12700
12701 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12702 }
12703 }
12704
12705 return SDValue();
12706}
12707
12708/// Check whether a compaction lowering can be done by dropping even/odd
12709/// elements and compute how many times even/odd elements must be dropped.
12710///
12711/// This handles shuffles which take every Nth element where N is a power of
12712/// two. Example shuffle masks:
12713///
12714/// (even)
12715/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12716/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12717/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12718/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12719/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12720/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12721///
12722/// (odd)
12723/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12724/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12725///
12726/// Any of these lanes can of course be undef.
12727///
12728/// This routine only supports N <= 3.
12729/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12730/// for larger N.
12731///
12732/// \returns N above, or the number of times even/odd elements must be dropped
12733/// if there is such a number. Otherwise returns zero.
12734static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12735 bool IsSingleInput) {
12736 // The modulus for the shuffle vector entries is based on whether this is
12737 // a single input or not.
12738 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12739 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12740 "We should only be called with masks with a power-of-2 size!");
12741
12742 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12743 int Offset = MatchEven ? 0 : 1;
12744
12745 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12746 // and 2^3 simultaneously. This is because we may have ambiguity with
12747 // partially undef inputs.
12748 bool ViableForN[3] = {true, true, true};
12749
12750 for (int i = 0, e = Mask.size(); i < e; ++i) {
12751 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12752 // want.
12753 if (Mask[i] < 0)
12754 continue;
12755
12756 bool IsAnyViable = false;
12757 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12758 if (ViableForN[j]) {
12759 uint64_t N = j + 1;
12760
12761 // The shuffle mask must be equal to (i * 2^N) % M.
12762 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12763 IsAnyViable = true;
12764 else
12765 ViableForN[j] = false;
12766 }
12767 // Early exit if we exhaust the possible powers of two.
12768 if (!IsAnyViable)
12769 break;
12770 }
12771
12772 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12773 if (ViableForN[j])
12774 return j + 1;
12775
12776 // Return 0 as there is no viable power of two.
12777 return 0;
12778}
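To make the stride test above concrete, here is a standalone illustrative sketch (not part of X86ISelLowering.cpp; the helper and driver names are invented for the example) that applies the same (Mask[i] - Offset) == ((i << N) & ModMask) check to a sample even-compaction mask and reports the smallest viable N:

// Illustrative sketch (not from X86ISelLowering.cpp): stride test used by
// canLowerByDroppingElements, on plain std:: containers.
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the smallest N in [1,3] for which every defined mask element equals
// (i * 2^N) % M (plus Offset for the odd variant), or 0 if none fits.
static int smallestViableDropCount(const std::vector<int> &Mask, bool MatchEven,
                                   bool IsSingleInput) {
  uint64_t ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  uint64_t ModMask = ShuffleModulus - 1; // power-of-2 modulus assumed
  int Offset = MatchEven ? 0 : 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (size_t i = 0; i < Mask.size(); ++i) {
      if (Mask[i] < 0)
        continue; // undef lanes match anything
      if ((uint64_t)(Mask[i] - Offset) != ((i << N) & ModMask)) {
        Viable = false;
        break;
      }
    }
    if (Viable)
      return N;
  }
  return 0;
}

int main() {
  // Single-input even compaction taking every 2nd element: N should be 1.
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           0, 2, 4, 6, 8, 10, 12, 14};
  std::printf("N = %d\n", smallestViableDropCount(Mask, /*MatchEven=*/true,
                                                  /*IsSingleInput=*/true));
}

For this mask the sketch prints N = 1, matching the first even example in the comment above.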
12779
12780// X86 has dedicated pack instructions that can handle specific truncation
12781// operations: PACKSS and PACKUS.
12782// Checks for compaction shuffle masks if MaxStages > 1.
12783// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12784static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12785 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12786 const SelectionDAG &DAG,
12787 const X86Subtarget &Subtarget,
12788 unsigned MaxStages = 1) {
12789 unsigned NumElts = VT.getVectorNumElements();
12790 unsigned BitSize = VT.getScalarSizeInBits();
12791 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12792 "Illegal maximum compaction");
12793
12794 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12795 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12796 unsigned NumPackedBits = NumSrcBits - BitSize;
12797 N1 = peekThroughBitcasts(N1);
12798 N2 = peekThroughBitcasts(N2);
12799 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12800 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12801 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12802 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12803 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12804 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12805 return false;
12806 if (Subtarget.hasSSE41() || BitSize == 8) {
12807 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12808 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12809 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12810 V1 = N1;
12811 V2 = N2;
12812 SrcVT = PackVT;
12813 PackOpcode = X86ISD::PACKUS;
12814 return true;
12815 }
12816 }
12817 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12818 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12819 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12820 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12821 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12822 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12823 V1 = N1;
12824 V2 = N2;
12825 SrcVT = PackVT;
12826 PackOpcode = X86ISD::PACKSS;
12827 return true;
12828 }
12829 return false;
12830 };
12831
12832 // Attempt to match against wider and wider compaction patterns.
12833 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12834 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12835 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12836
12837 // Try binary shuffle.
12838 SmallVector<int, 32> BinaryMask;
12839 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12840 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12841 if (MatchPACK(V1, V2, PackVT))
12842 return true;
12843
12844 // Try unary shuffle.
12845 SmallVector<int, 32> UnaryMask;
12846 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12847 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12848 if (MatchPACK(V1, V1, PackVT))
12849 return true;
12850 }
12851
12852 return false;
12853}
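The legality conditions checked by MatchPACK can be pictured with a scalar model of the pack saturation. The sketch below is standalone and illustrative only (the two lane helpers are invented names): it shows why PACKUS needs the upper NumSrcBits - BitSize bits known zero, while PACKSS only needs enough sign bits.

// Illustrative sketch (not from X86ISelLowering.cpp): scalar model of the
// PACKUS/PACKSS per-lane saturation that motivates MatchPACK's checks.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// PACKUSWB-style lane: unsigned-saturate a 16-bit lane down to 8 bits.
static uint8_t packusLane(int16_t V) {
  return (uint8_t)std::clamp<int>(V, 0, 255);
}

// PACKSSWB-style lane: signed-saturate a 16-bit lane down to 8 bits.
static int8_t packssLane(int16_t V) {
  return (int8_t)std::clamp<int>(V, -128, 127);
}

int main() {
  // If the upper 8 bits are already zero, PACKUS is a plain truncation...
  std::printf("packus(0x007f) = 0x%02x\n", packusLane(0x007f)); // 0x7f
  // ...otherwise it saturates, which is why MatchPACK demands the upper
  // NumSrcBits - BitSize bits be known zero (MaskedValueIsZero).
  std::printf("packus(0x017f) = 0x%02x\n", packusLane(0x017f)); // 0xff
  // PACKSS only needs enough sign bits: -5 fits in i8, so it truncates.
  std::printf("packss(-5)     = %d\n", packssLane(-5));         // -5
  std::printf("packss(300)    = %d\n", packssLane(300));        // 127
}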
12854
12855static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12856 SDValue V1, SDValue V2, SelectionDAG &DAG,
12857 const X86Subtarget &Subtarget) {
12858 MVT PackVT;
12859 unsigned PackOpcode;
12860 unsigned SizeBits = VT.getSizeInBits();
12861 unsigned EltBits = VT.getScalarSizeInBits();
12862 unsigned MaxStages = Log2_32(64 / EltBits);
12863 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12864 Subtarget, MaxStages))
12865 return SDValue();
12866
12867 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12868 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12869
12870 // Don't lower multi-stage packs on AVX512, truncation is better.
12871 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12872 return SDValue();
12873
12874 // Pack to the largest type possible:
12875 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12876 unsigned MaxPackBits = 16;
12877 if (CurrentEltBits > 16 &&
12878 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12879 MaxPackBits = 32;
12880
12881 // Repeatedly pack down to the target size.
12882 SDValue Res;
12883 for (unsigned i = 0; i != NumStages; ++i) {
12884 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12885 unsigned NumSrcElts = SizeBits / SrcEltBits;
12886 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12887 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12888 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12889 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12890 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12891 DAG.getBitcast(SrcVT, V2));
12892 V1 = V2 = Res;
12893 CurrentEltBits /= 2;
12894 }
12895 assert(Res && Res.getValueType() == VT &&
12896 "Failed to lower compaction shuffle");
12897 return Res;
12898}
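The stage count above is just the log2 ratio of the matched source element width to the destination width. A minimal standalone sketch of that arithmetic follows (illustrative only; invented names, no LLVM types, and the MaxPackBits clamping is not modelled):

// Illustrative sketch (not from X86ISelLowering.cpp): how many PACK stages the
// loop above runs for a given source/destination element width.
#include <cstdio>

static unsigned log2u(unsigned V) {
  unsigned R = 0;
  while (V > 1) { V >>= 1; ++R; }
  return R;
}

int main() {
  unsigned EltBits = 8;          // desired vXi8 result
  unsigned PackEltBits = 32;     // matchShuffleWithPACK matched a vXi32 source
  unsigned NumStages = log2u(PackEltBits / EltBits);
  std::printf("stages = %u\n", NumStages); // 2: i32 -> i16 -> i8
  for (unsigned CurBits = PackEltBits; CurBits > EltBits; CurBits /= 2)
    std::printf("  pack i%u -> i%u\n", CurBits, CurBits / 2);
}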
12899
12900/// Try to emit a bitmask instruction for a shuffle.
12901///
12902/// This handles cases where we can model a blend exactly as a bitmask due to
12903/// one of the inputs being zeroable.
12904static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12905 SDValue V2, ArrayRef<int> Mask,
12906 const APInt &Zeroable,
12907 const X86Subtarget &Subtarget,
12908 SelectionDAG &DAG) {
12909 MVT MaskVT = VT;
12910 MVT EltVT = VT.getVectorElementType();
12911 SDValue Zero, AllOnes;
12912 // Use f64 if i64 isn't legal.
12913 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12914 EltVT = MVT::f64;
12915 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12916 }
12917
12918 MVT LogicVT = VT;
12919 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12920 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12921 APFloat AllOnesValue =
12922 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12923 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12924 LogicVT =
12925 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12926 } else {
12927 Zero = DAG.getConstant(0, DL, EltVT);
12928 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12929 }
12930
12931 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12932 SDValue V;
12933 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12934 if (Zeroable[i])
12935 continue;
12936 if (Mask[i] % Size != i)
12937 return SDValue(); // Not a blend.
12938 if (!V)
12939 V = Mask[i] < Size ? V1 : V2;
12940 else if (V != (Mask[i] < Size ? V1 : V2))
12941 return SDValue(); // Can only let one input through the mask.
12942
12943 VMaskOps[i] = AllOnes;
12944 }
12945 if (!V)
12946 return SDValue(); // No non-zeroable elements!
12947
12948 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12949 VMask = DAG.getBitcast(LogicVT, VMask);
12950 V = DAG.getBitcast(LogicVT, V);
12951 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12952 return DAG.getBitcast(VT, And);
12953}
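A scalar model of this bitmask lowering is shown below as a standalone sketch (illustrative only, invented names): zeroable lanes get a 0 mask element, every other lane gets all-ones, and a single AND produces the blend.

// Illustrative sketch (not from X86ISelLowering.cpp): scalar model of
// lowerShuffleAsBitMask - zero the zeroable lanes with one AND.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumElts = 4;
  uint32_t V[NumElts] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  bool Zeroable[NumElts] = {false, true, false, true}; // lanes 1 and 3 -> zero
  uint32_t Mask[NumElts], Res[NumElts];
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Zeroable[i] ? 0u : 0xFFFFFFFFu; // build-vector of 0 / all-ones
  for (int i = 0; i != NumElts; ++i)
    Res[i] = V[i] & Mask[i];                  // the single AND node
  for (int i = 0; i != NumElts; ++i)
    std::printf("Res[%d] = 0x%08x\n", i, Res[i]);
}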
12954
12955/// Try to emit a blend instruction for a shuffle using bit math.
12956///
12957/// This is used as a fallback approach when first class blend instructions are
12958/// unavailable. Currently it is only suitable for integer vectors, but could
12959/// be generalized for floating point vectors if desirable.
12960static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12961 SDValue V2, ArrayRef<int> Mask,
12962 SelectionDAG &DAG) {
12963 assert(VT.isInteger() && "Only supports integer vector types!");
12964 MVT EltVT = VT.getVectorElementType();
12965 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12966 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 SmallVector<SDValue, 16> MaskOps;
12968 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12969 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12970 return SDValue(); // Shuffled input!
12971 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12972 }
12973
12974 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12975 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12976 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12977 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12978}
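The same idea in scalar form, as a standalone sketch (illustrative only, invented names): each element selects V1 or V2 via AND/ANDN/OR, mirroring the three nodes built above.

// Illustrative sketch (not from X86ISelLowering.cpp): scalar model of
// lowerShuffleAsBitBlend - per-element select via AND / ANDN / OR.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumElts = 4;
  uint32_t V1[NumElts] = {1, 2, 3, 4};
  uint32_t V2[NumElts] = {10, 20, 30, 40};
  // Mask element i selects V1 when Mask[i] == i, V2 when Mask[i] == i + NumElts.
  int ShuffleMask[NumElts] = {0, 5, 2, 7};
  for (int i = 0; i != NumElts; ++i) {
    uint32_t M = ShuffleMask[i] < NumElts ? 0xFFFFFFFFu : 0u; // all-ones -> V1
    uint32_t Res = (V1[i] & M) | (~M & V2[i]);                // AND, ANDNP, OR
    std::printf("Res[%d] = %u\n", i, Res);                    // 1, 20, 3, 40
  }
}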
12979
12980static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12981 SDValue PreservedSrc,
12982 const X86Subtarget &Subtarget,
12983 SelectionDAG &DAG);
12984
12985static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
12986 MutableArrayRef<int> Mask,
12987 const APInt &Zeroable, bool &ForceV1Zero,
12988 bool &ForceV2Zero, uint64_t &BlendMask) {
12989 bool V1IsZeroOrUndef =
12990 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12991 bool V2IsZeroOrUndef =
12992 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12993
12994 BlendMask = 0;
12995 ForceV1Zero = false, ForceV2Zero = false;
12996 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12997
12998 int NumElts = Mask.size();
12999 int NumLanes = VT.getSizeInBits() / 128;
13000 int NumEltsPerLane = NumElts / NumLanes;
13001 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13002
13003 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13004 // then ensure the blend mask part for that lane just references that input.
13005 bool ForceWholeLaneMasks =
13006 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13007
13008 // Attempt to generate the binary blend mask. If an input is zero then
13009 // we can use any lane.
13010 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13011 // Keep track of the inputs used per lane.
13012 bool LaneV1InUse = false;
13013 bool LaneV2InUse = false;
13014 uint64_t LaneBlendMask = 0;
13015 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13016 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13017 int M = Mask[Elt];
13018 if (M == SM_SentinelUndef)
13019 continue;
13020 if (M == Elt || (0 <= M && M < NumElts &&
13021 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13022 Mask[Elt] = Elt;
13023 LaneV1InUse = true;
13024 continue;
13025 }
13026 if (M == (Elt + NumElts) ||
13027 (NumElts <= M &&
13028 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13029 LaneBlendMask |= 1ull << LaneElt;
13030 Mask[Elt] = Elt + NumElts;
13031 LaneV2InUse = true;
13032 continue;
13033 }
13034 if (Zeroable[Elt]) {
13035 if (V1IsZeroOrUndef) {
13036 ForceV1Zero = true;
13037 Mask[Elt] = Elt;
13038 LaneV1InUse = true;
13039 continue;
13040 }
13041 if (V2IsZeroOrUndef) {
13042 ForceV2Zero = true;
13043 LaneBlendMask |= 1ull << LaneElt;
13044 Mask[Elt] = Elt + NumElts;
13045 LaneV2InUse = true;
13046 continue;
13047 }
13048 }
13049 return false;
13050 }
13051
13052 // If we only used V2 then splat the lane blend mask to avoid any demanded
13053 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13054 // blend mask bit).
13055 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13056 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13057
13058 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13059 }
13060 return true;
13061}
13062
13063static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13064 int Scale) {
13065 uint64_t ScaledMask = 0;
13066 for (int i = 0; i != Size; ++i)
13067 if (BlendMask & (1ull << i))
13068 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13069 return ScaledMask;
13070}
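A small standalone sketch of this scaling (illustrative only; the driver is invented): a v4 blend immediate 0b1010 rewritten for half-sized elements becomes 0b11001100.

// Illustrative sketch (not from X86ISelLowering.cpp): what
// scaleVectorShuffleBlendMask does when each element is widened by Scale.
#include <cstdint>
#include <cstdio>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // A v4 blend taking elements 1 and 3 from V2 (imm 0b1010), rewritten for a
  // type with twice as many, half-sized elements: each bit becomes two bits.
  std::printf("0x%llx\n",
              (unsigned long long)scaleBlendMask(0b1010, /*Size=*/4, /*Scale=*/2));
  // Prints 0xcc (0b11001100).
}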
13071
13072/// Try to emit a blend instruction for a shuffle.
13073///
13074/// This doesn't do any checks for the availability of instructions for blending
13075/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13076/// be matched in the backend with the type given. What it does check for is
13077/// that the shuffle mask is a blend, or convertible into a blend with zero.
13078static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13079 SDValue V2, ArrayRef<int> Original,
13080 const APInt &Zeroable,
13081 const X86Subtarget &Subtarget,
13082 SelectionDAG &DAG) {
13083 uint64_t BlendMask = 0;
13084 bool ForceV1Zero = false, ForceV2Zero = false;
13085 SmallVector<int, 64> Mask(Original);
13086 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13087 BlendMask))
13088 return SDValue();
13089
13090 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13091 if (ForceV1Zero)
13092 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13093 if (ForceV2Zero)
13094 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13095
13096 unsigned NumElts = VT.getVectorNumElements();
13097
13098 switch (VT.SimpleTy) {
13099 case MVT::v4i64:
13100 case MVT::v8i32:
13101 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13102 [[fallthrough]];
13103 case MVT::v4f64:
13104 case MVT::v8f32:
13105 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13106 [[fallthrough]];
13107 case MVT::v2f64:
13108 case MVT::v2i64:
13109 case MVT::v4f32:
13110 case MVT::v4i32:
13111 case MVT::v8i16:
13112 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13113 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13114 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13115 case MVT::v16i16: {
13116 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13117 SmallVector<int, 8> RepeatedMask;
13118 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13119 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13120 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13121 BlendMask = 0;
13122 for (int i = 0; i < 8; ++i)
13123 if (RepeatedMask[i] >= 8)
13124 BlendMask |= 1ull << i;
13125 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13126 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13127 }
13128 // Use PBLENDW for lower/upper lanes and then blend lanes.
13129 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13130 // merge to VSELECT where useful.
13131 uint64_t LoMask = BlendMask & 0xFF;
13132 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13133 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13134 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13135 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13136 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13137 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13138 return DAG.getVectorShuffle(
13139 MVT::v16i16, DL, Lo, Hi,
13140 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13141 }
13142 [[fallthrough]];
13143 }
13144 case MVT::v32i8:
13145 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13146 [[fallthrough]];
13147 case MVT::v16i8: {
13148 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13149
13150 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13151 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13152 Subtarget, DAG))
13153 return Masked;
13154
13155 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13156 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13157 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13158 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13159 }
13160
13161 // If we have VPTERNLOG, we can use that as a bit blend.
13162 if (Subtarget.hasVLX())
13163 if (SDValue BitBlend =
13164 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13165 return BitBlend;
13166
13167 // Scale the blend by the number of bytes per element.
13168 int Scale = VT.getScalarSizeInBits() / 8;
13169
13170 // This form of blend is always done on bytes. Compute the byte vector
13171 // type.
13172 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13173
13174 // x86 allows load folding with blendvb from the 2nd source operand. But
13175 // we are still using LLVM select here (see comment below), so that's V1.
13176 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13177 // allow that load-folding possibility.
13178 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13179 ShuffleVectorSDNode::commuteMask(Mask);
13180 std::swap(V1, V2);
13181 }
13182
13183 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13184 // mix of LLVM's code generator and the x86 backend. We tell the code
13185 // generator that boolean values in the elements of an x86 vector register
13186 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13187 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13188 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13189 // of the element (the remaining are ignored) and 0 in that high bit would
13190 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13191 // the LLVM model for boolean values in vector elements gets the relevant
13192 // bit set, it is set backwards and over constrained relative to x86's
13193 // actual model.
13194 SmallVector<SDValue, 32> VSELECTMask;
13195 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13196 for (int j = 0; j < Scale; ++j)
13197 VSELECTMask.push_back(
13198 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13199 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13200 MVT::i8));
13201
13202 V1 = DAG.getBitcast(BlendVT, V1);
13203 V2 = DAG.getBitcast(BlendVT, V2);
13204 return DAG.getBitcast(
13205 VT,
13206 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13207 V1, V2));
13208 }
13209 case MVT::v16f32:
13210 case MVT::v8f64:
13211 case MVT::v8i64:
13212 case MVT::v16i32:
13213 case MVT::v32i16:
13214 case MVT::v64i8: {
13215 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13216 bool OptForSize = DAG.shouldOptForSize();
13217 if (!OptForSize) {
13218 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13219 Subtarget, DAG))
13220 return Masked;
13221 }
13222
13223 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13224 // masked move.
13225 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13226 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13227 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13228 }
13229 default:
13230 llvm_unreachable("Not a supported integer vector type!");
13231 }
13232}
13233
13234/// Try to lower as a blend of elements from two inputs followed by
13235/// a single-input permutation.
13236///
13237/// This matches the pattern where we can blend elements from two inputs and
13238/// then reduce the shuffle to a single-input permutation.
13239static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13240 SDValue V1, SDValue V2,
13241 ArrayRef<int> Mask,
13242 SelectionDAG &DAG,
13243 bool ImmBlends = false) {
13244 // We build up the blend mask while checking whether a blend is a viable way
13245 // to reduce the shuffle.
13246 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13247 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13248
13249 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13250 if (Mask[i] < 0)
13251 continue;
13252
13253 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13254
13255 if (BlendMask[Mask[i] % Size] < 0)
13256 BlendMask[Mask[i] % Size] = Mask[i];
13257 else if (BlendMask[Mask[i] % Size] != Mask[i])
13258 return SDValue(); // Can't blend in the needed input!
13259
13260 PermuteMask[i] = Mask[i] % Size;
13261 }
13262
13263 // If only immediate blends, then bail if the blend mask can't be widened to
13264 // i16.
13265 unsigned EltSize = VT.getScalarSizeInBits();
13266 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13267 return SDValue();
13268
13269 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13270 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13271}
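A concrete v4 example of the BlendMask/PermuteMask decomposition above, as a standalone sketch (illustrative only, invented names):

// Illustrative sketch (not from X86ISelLowering.cpp): split a 2-input mask
// into a per-element blend followed by a single-input permute.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {2, 7, 0, 5};          // mixes V1 (0..3) and V2 (4..7)
  std::vector<int> BlendMask(Size, -1), PermuteMask(Size, -1);
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (BlendMask[M % Size] >= 0 && BlendMask[M % Size] != M) {
      std::puts("not decomposable");             // both inputs want this slot
      return 0;
    }
    BlendMask[M % Size] = M;
    PermuteMask[i] = M % Size;
  }
  // BlendMask   = {0, 5, 2, 7}  -> a pure per-element blend of V1/V2.
  // PermuteMask = {2, 3, 0, 1}  -> a single-input permute of the blend result.
  for (int i = 0; i != Size; ++i)
    std::printf("Blend[%d]=%d Permute[%d]=%d\n", i, BlendMask[i], i, PermuteMask[i]);
}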
13272
13273/// Try to lower as an unpack of elements from two inputs followed by
13274/// a single-input permutation.
13275///
13276/// This matches the pattern where we can unpack elements from two inputs and
13277/// then reduce the shuffle to a single-input (wider) permutation.
13278static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13279 SDValue V1, SDValue V2,
13280 ArrayRef<int> Mask,
13281 SelectionDAG &DAG) {
13282 int NumElts = Mask.size();
13283 int NumLanes = VT.getSizeInBits() / 128;
13284 int NumLaneElts = NumElts / NumLanes;
13285 int NumHalfLaneElts = NumLaneElts / 2;
13286
13287 bool MatchLo = true, MatchHi = true;
13288 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13289
13290 // Determine UNPCKL/UNPCKH type and operand order.
13291 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
30
Assuming 'Lane' is not equal to 'NumElts'
31
Loop condition is true. Entering loop body
43
Assuming 'Lane' is equal to 'NumElts'
13292 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
32
Assuming 'Elt' is not equal to 'NumLaneElts'
33
Loop condition is true. Entering loop body
41
Assuming 'Elt' is equal to 'NumLaneElts'
42
Loop condition is false. Execution continues on line 13291
13293 int M = Mask[Lane + Elt];
13294 if (M < 0)
34
Assuming 'M' is >= 0
35
Taking false branch
13295 continue;
13296
13297 SDValue &Op = Ops[Elt & 1];
13298 if (M < NumElts && (Op.isUndef() || Op == V1))
36
Assuming 'M' is >= 'NumElts'
13299 Op = V1;
13300 else if (NumElts <= M && (Op.isUndef() || Op == V2))
37
Assuming 'NumElts' is <= 'M'
13301 Op = V2;
13302 else
13303 return SDValue();
13304
13305 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13306 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
38
Assuming the condition is false
13307 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
13308 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
39
Assuming the condition is false
13309 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
13310 if (!MatchLo && !MatchHi)
40
Assuming 'MatchLo' is true
13311 return SDValue();
13312 }
13313 }
13314 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
44
Loop condition is false. Execution continues on line 13314
45
Assuming the condition is true
46
'?' condition is true
13315
13316 // Now check that each pair of elts come from the same unpack pair
13317 // and set the permute mask based on each pair.
13318 // TODO - Investigate cases where we permute individual elements.
13319 SmallVector<int, 32> PermuteMask(NumElts, -1);
13320 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
47
Loop condition is true. Entering loop body
13321 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
48
Loop condition is true. Entering loop body
13322 int M0 = Mask[Lane + Elt + 0];
13323 int M1 = Mask[Lane + Elt + 1];
13324 if (0 <= M0 && 0 <= M1 &&
49
Assuming 'M0' is < 0
13325 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
13326 return SDValue();
13327 if (0 <= M0)
49.1
'M0' is < 0
50
Taking false branch
13328 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
13329 if (0 <= M1)
51
Assuming 'M1' is >= 0
52
Taking true branch
13330 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
53
The result of the '%' expression is undefined
13331 }
13332 }
13333
13334 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13335 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13336 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13337}
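On the reported warning at line 13330: in C++ the result of '%' is undefined when the right-hand operand is zero, and along the path the analyzer assumes nothing it can see constrains NumHalfLaneElts (NumLaneElts / 2) to be non-zero, which is presumably why it flags M1 % NumHalfLaneElts. The standalone sketch below is illustrative only, not a proposed LLVM patch, and its names are invented; it shows the same index computation with an explicit guard.

// Illustrative sketch (not from X86ISelLowering.cpp): the permute-index math
// from line 13330, guarded against a zero divisor.
#include <cassert>
#include <cstdio>

static int permuteIndex(int Lane, int M, int NumHalfLaneElts) {
  assert(NumHalfLaneElts > 0 && "divisor must be non-zero for operator%");
  return Lane + (2 * (M % NumHalfLaneElts)) + 1;
}

int main() {
  // v8i16 with one 128-bit lane: NumLaneElts = 8, NumHalfLaneElts = 4.
  std::printf("%d\n", permuteIndex(/*Lane=*/0, /*M=*/6, /*NumHalfLaneElts=*/4));
  // Prints 5, i.e. element 6 lands in odd slot 2*(6%4)+1.
}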
13338
13339/// Try to lower a shuffle as a permute of the inputs followed by an
13340/// UNPCK instruction.
13341///
13342/// This specifically targets cases where we end up with alternating between
13343/// the two inputs, and so can permute them into something that feeds a single
13344/// UNPCK instruction. Note that this routine only targets integer vectors
13345/// because for floating point vectors we have a generalized SHUFPS lowering
13346/// strategy that handles everything that doesn't *exactly* match an unpack,
13347/// making this clever lowering unnecessary.
13348static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13349 SDValue V1, SDValue V2,
13350 ArrayRef<int> Mask,
13351 const X86Subtarget &Subtarget,
13352 SelectionDAG &DAG) {
13353 int Size = Mask.size();
13354 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13355
13356 // This routine only supports 128-bit integer dual input vectors.
13357 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13358 return SDValue();
13359
13360 int NumLoInputs =
13361 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13362 int NumHiInputs =
13363 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13364
13365 bool UnpackLo = NumLoInputs >= NumHiInputs;
13366
13367 auto TryUnpack = [&](int ScalarSize, int Scale) {
13368 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13369 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13370
13371 for (int i = 0; i < Size; ++i) {
13372 if (Mask[i] < 0)
13373 continue;
13374
13375 // Each element of the unpack contains Scale elements from this mask.
13376 int UnpackIdx = i / Scale;
13377
13378 // We only handle the case where V1 feeds the first slots of the unpack.
13379 // We rely on canonicalization to ensure this is the case.
13380 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13381 return SDValue();
13382
13383 // Setup the mask for this input. The indexing is tricky as we have to
13384 // handle the unpack stride.
13385 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13386 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13387 Mask[i] % Size;
13388 }
13389
13390 // If we will have to shuffle both inputs to use the unpack, check whether
13391 // we can just unpack first and shuffle the result. If so, skip this unpack.
13392 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13393 !isNoopShuffleMask(V2Mask))
13394 return SDValue();
13395
13396 // Shuffle the inputs into place.
13397 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13398 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13399
13400 // Cast the inputs to the type we will use to unpack them.
13401 MVT UnpackVT =
13402 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13403 V1 = DAG.getBitcast(UnpackVT, V1);
13404 V2 = DAG.getBitcast(UnpackVT, V2);
13405
13406 // Unpack the inputs and cast the result back to the desired type.
13407 return DAG.getBitcast(
13408 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13409 UnpackVT, V1, V2));
13410 };
13411
13412 // We try each unpack from the largest to the smallest to try and find one
13413 // that fits this mask.
13414 int OrigScalarSize = VT.getScalarSizeInBits();
13415 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13416 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13417 return Unpack;
13418
13419 // If we're shuffling with a zero vector then we're better off not doing
13420 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13421 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13422 ISD::isBuildVectorAllZeros(V2.getNode()))
13423 return SDValue();
13424
13425 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13426 // initial unpack.
13427 if (NumLoInputs == 0 || NumHiInputs == 0) {
13428 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13429 "We have to have *some* inputs!");
13430 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13431
13432 // FIXME: We could consider the total complexity of the permute of each
13433 // possible unpacking. Or at the least we should consider how many
13434 // half-crossings are created.
13435 // FIXME: We could consider commuting the unpacks.
13436
13437 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13438 for (int i = 0; i < Size; ++i) {
13439 if (Mask[i] < 0)
13440 continue;
13441
13442 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13443
13444 PermMask[i] =
13445 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13446 }
13447 return DAG.getVectorShuffle(
13448 VT, DL,
13449 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13450 V1, V2),
13451 DAG.getUNDEF(VT), PermMask);
13452 }
13453
13454 return SDValue();
13455}
13456
13457/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13458/// permuting the elements of the result in place.
13459static SDValue lowerShuffleAsByteRotateAndPermute(
13460 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13461 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13462 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13463 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13464 (VT.is512BitVector() && !Subtarget.hasBWI()))
13465 return SDValue();
13466
13467 // We don't currently support lane crossing permutes.
13468 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13469 return SDValue();
13470
13471 int Scale = VT.getScalarSizeInBits() / 8;
13472 int NumLanes = VT.getSizeInBits() / 128;
13473 int NumElts = VT.getVectorNumElements();
13474 int NumEltsPerLane = NumElts / NumLanes;
13475
13476 // Determine range of mask elts.
13477 bool Blend1 = true;
13478 bool Blend2 = true;
13479 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13480 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13481 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13482 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13483 int M = Mask[Lane + Elt];
13484 if (M < 0)
13485 continue;
13486 if (M < NumElts) {
13487 Blend1 &= (M == (Lane + Elt));
13488 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13489 M = M % NumEltsPerLane;
13490 Range1.first = std::min(Range1.first, M);
13491 Range1.second = std::max(Range1.second, M);
13492 } else {
13493 M -= NumElts;
13494 Blend2 &= (M == (Lane + Elt));
13495 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13496 M = M % NumEltsPerLane;
13497 Range2.first = std::min(Range2.first, M);
13498 Range2.second = std::max(Range2.second, M);
13499 }
13500 }
13501 }
13502
13503 // Bail if we don't need both elements.
13504 // TODO - it might be worth doing this for unary shuffles if the permute
13505 // can be widened.
13506 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13507 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13508 return SDValue();
13509
13510 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13511 return SDValue();
13512
13513 // Rotate the 2 ops so we can access both ranges, then permute the result.
13514 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13515 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13516 SDValue Rotate = DAG.getBitcast(
13517 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13518 DAG.getBitcast(ByteVT, Lo),
13519 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13520 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13521 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13522 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13523 int M = Mask[Lane + Elt];
13524 if (M < 0)
13525 continue;
13526 if (M < NumElts)
13527 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13528 else
13529 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13530 }
13531 }
13532 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13533 };
13534
13535 // Check if the ranges are small enough to rotate from either direction.
13536 if (Range2.second < Range1.first)
13537 return RotateAndPermute(V1, V2, Range1.first, 0);
13538 if (Range1.second < Range2.first)
13539 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13540 return SDValue();
13541}
13542
13543static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13544 return isUndefOrEqual(Mask, 0);
13545}
13546
13547static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13548 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13549}
13550
13551/// Generic routine to decompose a shuffle and blend into independent
13552/// blends and permutes.
13553///
13554/// This matches the extremely common pattern for handling combined
13555/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13556/// operations. It will try to pick the best arrangement of shuffles and
13557/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13558static SDValue lowerShuffleAsDecomposedShuffleMerge(
13559 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13560 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13561 int NumElts = Mask.size();
13562 int NumLanes = VT.getSizeInBits() / 128;
13563 int NumEltsPerLane = NumElts / NumLanes;
13564
13565 // Shuffle the input elements into the desired positions in V1 and V2 and
13566 // unpack/blend them together.
13567 bool IsAlternating = true;
13568 SmallVector<int, 32> V1Mask(NumElts, -1);
13569 SmallVector<int, 32> V2Mask(NumElts, -1);
13570 SmallVector<int, 32> FinalMask(NumElts, -1);
13571 for (int i = 0; i < NumElts; ++i) {
23
Assuming 'i' is >= 'NumElts'
24
Loop condition is false. Execution continues on line 13588
13572 int M = Mask[i];
13573 if (M >= 0 && M < NumElts) {
13574 V1Mask[i] = M;
13575 FinalMask[i] = i;
13576 IsAlternating &= (i & 1) == 0;
13577 } else if (M >= NumElts) {
13578 V2Mask[i] = M - NumElts;
13579 FinalMask[i] = i + NumElts;
13580 IsAlternating &= (i & 1) == 1;
13581 }
13582 }
13583
13584 // If we effectively only demand the 0'th element of \p Input, and not only
13585 // as 0'th element, then broadcast said input,
13586 // and change \p InputMask to be a no-op (identity) mask.
13587 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13588 &DAG](SDValue &Input,
13589 MutableArrayRef<int> InputMask) {
13590 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13591 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13592 !X86::mayFoldLoad(Input, Subtarget)))
13593 return;
13594 if (isNoopShuffleMask(InputMask))
13595 return;
13596 assert(isBroadcastShuffleMask(InputMask) &&
13597 "Expected to demand only the 0'th element.");
13598 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13599 for (auto I : enumerate(InputMask)) {
13600 int &InputMaskElt = I.value();
13601 if (InputMaskElt >= 0)
13602 InputMaskElt = I.index();
13603 }
13604 };
13605
13606 // Currently, we may need to produce one shuffle per input, and blend results.
13607 // It is possible that the shuffle for one of the inputs is already a no-op.
13608 // See if we can simplify non-no-op shuffles into broadcasts,
13609 // which we consider to be strictly better than an arbitrary shuffle.
13610 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13611 isNoopOrBroadcastShuffleMask(V2Mask)) {
13612 canonicalizeBroadcastableInput(V1, V1Mask);
13613 canonicalizeBroadcastableInput(V2, V2Mask);
13614 }
13615
13616 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13617 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13618 // the shuffle may be able to fold with a load or other benefit. However, when
13619 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13620 // pre-shuffle first is a better strategy.
13621 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
25
Assuming the condition is true
26
Assuming the condition is true
27
Taking true branch
13622 // Only prefer immediate blends to unpack/rotate.
13623 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
28
Taking false branch
13624 DAG, true))
13625 return BlendPerm;
13626 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
29
Calling 'lowerShuffleAsUNPCKAndPermute'
13627 DAG))
13628 return UnpackPerm;
13629 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13630 DL, VT, V1, V2, Mask, Subtarget, DAG))
13631 return RotatePerm;
13632 // Unpack/rotate failed - try again with variable blends.
13633 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13634 DAG))
13635 return BlendPerm;
13636 if (VT.getScalarSizeInBits() >= 32)
13637 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13638 DL, VT, V1, V2, Mask, Subtarget, DAG))
13639 return PermUnpack;
13640 }
13641
13642 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13643 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13644 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13645 // than half the elements coming from each source.
13646 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13647 V1Mask.assign(NumElts, -1);
13648 V2Mask.assign(NumElts, -1);
13649 FinalMask.assign(NumElts, -1);
13650 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13651 for (int j = 0; j != NumEltsPerLane; ++j) {
13652 int M = Mask[i + j];
13653 if (M >= 0 && M < NumElts) {
13654 V1Mask[i + (j / 2)] = M;
13655 FinalMask[i + j] = i + (j / 2);
13656 } else if (M >= NumElts) {
13657 V2Mask[i + (j / 2)] = M - NumElts;
13658 FinalMask[i + j] = i + (j / 2) + NumElts;
13659 }
13660 }
13661 }
13662
13663 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13664 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13665 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13666}
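The V1Mask/V2Mask/FinalMask split computed at the top of this routine can be seen on a concrete v4 mask with the standalone sketch below (illustrative only, invented names); in this particular example both per-input masks are already no-ops where defined, so only the final blend remains.

// Illustrative sketch (not from X86ISelLowering.cpp): decompose a 2-input
// shuffle mask into two single-input masks plus a blend mask.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {0, 5, 2, 7};
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1), FinalMask(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {        // element comes from V1
      V1Mask[i] = M;
      FinalMask[i] = i;
    } else if (M >= NumElts) {          // element comes from V2
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
    }
  }
  // V1Mask    = { 0, -1,  2, -1}
  // V2Mask    = {-1,  1, -1,  3}
  // FinalMask = { 0,  5,  2,  7}  (a plain blend of the two shuffled inputs)
  for (int i = 0; i != NumElts; ++i)
    std::printf("%2d %2d %2d\n", V1Mask[i], V2Mask[i], FinalMask[i]);
}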
13667
13668/// Try to lower a vector shuffle as a bit rotation.
13669///
13670/// Look for a repeated rotation pattern in each sub group.
13671/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13672static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13673 int NumElts = Mask.size();
13674 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13675
13676 int RotateAmt = -1;
13677 for (int i = 0; i != NumElts; i += NumSubElts) {
13678 for (int j = 0; j != NumSubElts; ++j) {
13679 int M = Mask[i + j];
13680 if (M < 0)
13681 continue;
13682 if (!isInRange(M, i, i + NumSubElts))
13683 return -1;
13684 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13685 if (0 <= RotateAmt && Offset != RotateAmt)
13686 return -1;
13687 RotateAmt = Offset;
13688 }
13689 }
13690 return RotateAmt;
13691}
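A standalone sketch of the same per-subgroup rotation test (illustrative only, invented names), run on a v4 subgroup:

// Illustrative sketch (not from X86ISelLowering.cpp): detect a repeated
// ISD::ROTL element rotation amount within each subgroup of the mask.
#include <cstdio>
#include <vector>

static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (size_t i = 0; i < Mask.size(); i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < (int)i || M >= (int)i + NumSubElts)
        return -1;                       // crosses the subgroup boundary
      int Offset = (NumSubElts - (M - ((int)i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;                       // inconsistent rotation amount
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  std::printf("%d\n", matchBitRotate({1, 2, 3, 0}, 4)); // 3: rotate left by 3
  std::printf("%d\n", matchBitRotate({1, 2, 0, 3}, 4)); // -1: not a rotation
}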
13692
13693static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13694 const X86Subtarget &Subtarget,
13695 ArrayRef<int> Mask) {
13696 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13697 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13698
13699 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13700 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13701 int MaxSubElts = 64 / EltSizeInBits;
13702 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13703 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13704 if (RotateAmt < 0)
13705 continue;
13706
13707 int NumElts = Mask.size();
13708 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13709 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13710 return RotateAmt * EltSizeInBits;
13711 }
13712
13713 return -1;
13714}
13715
13716/// Lower shuffle using X86ISD::VROTLI rotations.
13717static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13718 ArrayRef<int> Mask,
13719 const X86Subtarget &Subtarget,
13720 SelectionDAG &DAG) {
13721 // Only XOP + AVX512 targets have bit rotation instructions.
13722 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13723 bool IsLegal =
13724 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13725 if (!IsLegal && Subtarget.hasSSE3())
13726 return SDValue();
13727
13728 MVT RotateVT;
13729 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13730 Subtarget, Mask);
13731 if (RotateAmt < 0)
13732 return SDValue();
13733
13734 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13735 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13736 // widen to vXi16 or more, then the existing lowering will be better.
13737 if (!IsLegal) {
13738 if ((RotateAmt % 16) == 0)
13739 return SDValue();
13740 // TODO: Use getTargetVShiftByConstNode.
13741 unsigned ShlAmt = RotateAmt;
13742 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13743 V1 = DAG.getBitcast(RotateVT, V1);
13744 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13745 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13746 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13747 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13748 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13749 return DAG.getBitcast(VT, Rot);
13750 }
13751
13752 SDValue Rot =
13753 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13754 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13755 return DAG.getBitcast(VT, Rot);
13756}
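The pre-AVX512/XOP fallback above builds the rotate from a shift pair. A scalar sketch of that identity for a 16-bit element follows (illustrative only, invented name):

// Illustrative sketch (not from X86ISelLowering.cpp): ROTL expressed as
// SHL | SRL, as the VSHLI/VSRLI/OR nodes above do per element.
#include <cstdint>
#include <cstdio>

static uint16_t rotl16(uint16_t V, unsigned RotateAmt) {
  unsigned ShlAmt = RotateAmt;
  unsigned SrlAmt = 16 - RotateAmt;                 // RotateVT scalar size
  return (uint16_t)((V << ShlAmt) | (V >> SrlAmt)); // X86ISD::VSHLI | VSRLI
}

int main() {
  std::printf("0x%04x\n", rotl16(0x12ab, 8)); // 0xab12: byte swap within i16
}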
13757
13758/// Try to match a vector shuffle as an element rotation.
13759///
13760 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13761static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13762 ArrayRef<int> Mask) {
13763 int NumElts = Mask.size();
13764
13765 // We need to detect various ways of spelling a rotation:
13766 // [11, 12, 13, 14, 15, 0, 1, 2]
13767 // [-1, 12, 13, 14, -1, -1, 1, -1]
13768 // [-1, -1, -1, -1, -1, -1, 1, 2]
13769 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13770 // [-1, 4, 5, 6, -1, -1, 9, -1]
13771 // [-1, 4, 5, 6, -1, -1, -1, -1]
13772 int Rotation = 0;
13773 SDValue Lo, Hi;
13774 for (int i = 0; i < NumElts; ++i) {
13775 int M = Mask[i];
13776 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13777 "Unexpected mask index.");
13778 if (M < 0)
13779 continue;
13780
13781 // Determine where a rotated vector would have started.
13782 int StartIdx = i - (M % NumElts);
13783 if (StartIdx == 0)
13784 // The identity rotation isn't interesting, stop.
13785 return -1;
13786
13787 // If we found the tail of a vector the rotation must be the missing
13788 // front. If we found the head of a vector, it must be how much of the
13789 // head.
13790 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13791
13792 if (Rotation == 0)
13793 Rotation = CandidateRotation;
13794 else if (Rotation != CandidateRotation)
13795 // The rotations don't match, so we can't match this mask.
13796 return -1;
13797
13798 // Compute which value this mask is pointing at.
13799 SDValue MaskV = M < NumElts ? V1 : V2;
13800
13801 // Compute which of the two target values this index should be assigned
13802 // to. This reflects whether the high elements are remaining or the low
13803 // elements are remaining.
13804 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13805
13806 // Either set up this value if we've not encountered it before, or check
13807 // that it remains consistent.
13808 if (!TargetV)
13809 TargetV = MaskV;
13810 else if (TargetV != MaskV)
13811 // This may be a rotation, but it pulls from the inputs in some
13812 // unsupported interleaving.
13813 return -1;
13814 }
13815
13816 // Check that we successfully analyzed the mask, and normalize the results.
13817 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13818 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13819 if (!Lo)
13820 Lo = Hi;
13821 else if (!Hi)
13822 Hi = Lo;
13823
13824 V1 = Lo;
13825 V2 = Hi;
13826
13827 return Rotation;
13828}
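The StartIdx / CandidateRotation bookkeeping can be exercised on the first example mask from the comment with this standalone sketch (mask values only, no SDValue tracking; illustrative and invented names):

// Illustrative sketch (not from X86ISelLowering.cpp): recover the element
// rotation amount from a shuffle mask.
#include <cstdio>
#include <vector>

static int matchElementRotate(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // identity rotation is not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // mixed rotation amounts
  }
  return Rotation;
}

int main() {
  // [11,12,13,14,15,0,1,2] for v8: tail of V2 followed by head of V1 -> 3.
  std::printf("%d\n", matchElementRotate({11, 12, 13, 14, 15, 0, 1, 2}));
}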
13829
13830/// Try to lower a vector shuffle as a byte rotation.
13831///
13832/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13833/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13834/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13835 /// try to generically lower a vector shuffle through such a pattern. It
13836/// does not check for the profitability of lowering either as PALIGNR or
13837/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13838/// This matches shuffle vectors that look like:
13839///
13840/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13841///
13842/// Essentially it concatenates V1 and V2, shifts right by some number of
13843/// elements, and takes the low elements as the result. Note that while this is
13844/// specified as a *right shift* because x86 is little-endian, it is a *left
13845/// rotate* of the vector lanes.
13846static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13847 ArrayRef<int> Mask) {
13848 // Don't accept any shuffles with zero elements.
13849 if (isAnyZero(Mask))
13850 return -1;
13851
13852 // PALIGNR works on 128-bit lanes.
13853 SmallVector<int, 16> RepeatedMask;
13854 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13855 return -1;
13856
13857 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13858 if (Rotation <= 0)
13859 return -1;
13860
13861 // PALIGNR rotates bytes, so we need to scale the
13862 // rotation based on how many bytes are in the vector lane.
13863 int NumElts = RepeatedMask.size();
13864 int Scale = 16 / NumElts;
13865 return Rotation * Scale;
13866}
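The final byte scaling is simple arithmetic: an element rotation of 3 on a repeated v8i16 lane becomes a PALIGNR immediate of 3 * (16 / 8) = 6 bytes. A minimal standalone sketch (illustrative only):

// Illustrative sketch (not from X86ISelLowering.cpp): scale an element
// rotation to the byte immediate PALIGNR expects.
#include <cstdio>

int main() {
  int NumElts = 8;                 // repeated 128-bit lane mask size (v8i16)
  int ElementRotation = 3;         // from matchShuffleAsElementRotate
  int Scale = 16 / NumElts;        // bytes per element in a 128-bit lane
  std::printf("PALIGNR imm = %d\n", ElementRotation * Scale); // 6
}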
13867
13868static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13869 SDValue V2, ArrayRef<int> Mask,
13870 const X86Subtarget &Subtarget,
13871 SelectionDAG &DAG) {
13872 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13873
13874 SDValue Lo = V1, Hi = V2;
13875 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13876 if (ByteRotation <= 0)
13877 return SDValue();
13878
13879 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13880 // PSLLDQ/PSRLDQ.
13881 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13882 Lo = DAG.getBitcast(ByteVT, Lo);
13883 Hi = DAG.getBitcast(ByteVT, Hi);
13884
13885 // SSSE3 targets can use the palignr instruction.
13886 if (Subtarget.hasSSSE3()) {
13887 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13888        "512-bit PALIGNR requires BWI instructions");
13889 return DAG.getBitcast(
13890 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13891 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13892 }
13893
13894 assert(VT.is128BitVector() &&
13895        "Rotate-based lowering only supports 128-bit lowering!");
13896 assert(Mask.size() <= 16 &&
13897        "Can shuffle at most 16 bytes in a 128-bit vector!");
13898 assert(ByteVT == MVT::v16i8 &&
13899        "SSE2 rotate lowering only needed for v16i8!");
13900
13901 // Default SSE2 implementation
13902 int LoByteShift = 16 - ByteRotation;
13903 int HiByteShift = ByteRotation;
13904
13905 SDValue LoShift =
13906 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13907 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13908 SDValue HiShift =
13909 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13910 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13911 return DAG.getBitcast(VT,
13912 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13913}
13914
13915/// Try to lower a vector shuffle as a dword/qword rotation.
13916///
13917 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13918 /// rotation of the concatenation of two vectors; this routine will
13919 /// try to generically lower a vector shuffle through such a pattern.
13920///
13921/// Essentially it concatenates V1 and V2, shifts right by some number of
13922/// elements, and takes the low elements as the result. Note that while this is
13923/// specified as a *right shift* because x86 is little-endian, it is a *left
13924/// rotate* of the vector lanes.
13925static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13926 SDValue V2, ArrayRef<int> Mask,
13927 const X86Subtarget &Subtarget,
13928 SelectionDAG &DAG) {
13929 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13930        "Only 32-bit and 64-bit elements are supported!");
13931
13932 // 128/256-bit vectors are only supported with VLX.
13933 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13934        && "VLX required for 128/256-bit vectors");
13935
13936 SDValue Lo = V1, Hi = V2;
13937 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13938 if (Rotation <= 0)
13939 return SDValue();
13940
13941 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13942 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13943}
13944
13945/// Try to lower a vector shuffle as a byte shift sequence.
13946static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13947 SDValue V2, ArrayRef<int> Mask,
13948 const APInt &Zeroable,
13949 const X86Subtarget &Subtarget,
13950 SelectionDAG &DAG) {
13951 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13952 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13953
13954 // We need a shuffle that has zeros at one/both ends and a sequential
13955 // shuffle from one source within.
13956 unsigned ZeroLo = Zeroable.countr_one();
13957 unsigned ZeroHi = Zeroable.countl_one();
13958 if (!ZeroLo && !ZeroHi)
13959 return SDValue();
13960
13961 unsigned NumElts = Mask.size();
13962 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13963 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13964 return SDValue();
13965
13966 unsigned Scale = VT.getScalarSizeInBits() / 8;
13967 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13968 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13969 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13970 return SDValue();
13971
13972 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13973 Res = DAG.getBitcast(MVT::v16i8, Res);
13974
13975 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13976 // inner sequential set of elements, possibly offset:
13977 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13978 // 01234567 --> 4567zzzz --> zzzzz456
13979 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13980 if (ZeroLo == 0) {
13981 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13982 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13983 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13984 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13985 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13986 } else if (ZeroHi == 0) {
13987 unsigned Shift = Mask[ZeroLo] % NumElts;
13988 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13989 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13990 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13991 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13992 } else if (!Subtarget.hasSSSE3()) {
13993 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13994 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13995 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13996 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13997 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13998 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13999 Shift += Mask[ZeroLo] % NumElts;
14000 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14001 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14002 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14003 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14004 } else
14005 return SDValue();
14006
14007 return DAG.getBitcast(VT, Res);
14008}
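The commented shift chains in the routine above are easy to model in isolation. This standalone sketch assumes the usual little-endian layout (byte 0 is least significant), so PSLLDQ/VSHLDQ moves bytes toward higher indices and PSRLDQ/VSRLDQ toward lower ones; the helpers are illustrative stand-ins for the DAG nodes.

#include <array>
#include <cstdio>

using V16 = std::array<unsigned char, 16>;

static V16 pslldq(const V16 &V, int Imm) { // models X86ISD::VSHLDQ
  V16 R{};
  for (int i = Imm; i < 16; ++i)
    R[i] = V[i - Imm];
  return R;
}

static V16 psrldq(const V16 &V, int Imm) { // models X86ISD::VSRLDQ
  V16 R{};
  for (int i = 0; i + Imm < 16; ++i)
    R[i] = V[i + Imm];
  return R;
}

int main() {
  V16 V;
  for (int i = 0; i < 16; ++i)
    V[i] = (unsigned char)i;

  // Keep a run of source bytes and zero both ends: bytes 3..15 land at
  // positions 2..14, with zeros shifted in at positions 0, 1 and 15.
  V16 R = pslldq(psrldq(V, 3), 2);
  for (int i = 0; i < 16; ++i)
    std::printf("%d ", R[i]);
  std::printf("\n"); // 0 0 3 4 5 6 7 8 9 10 11 12 13 14 15 0
}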
14009
14010/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14011///
14012/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14013/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14014/// matches elements from one of the input vectors shuffled to the left or
14015/// right with zeroable elements 'shifted in'. It handles both the strictly
14016/// bit-wise element shifts and the byte shift across an entire 128-bit double
14017/// quad word lane.
14018///
14019/// PSHL : (little-endian) left bit shift.
14020/// [ zz, 0, zz, 2 ]
14021/// [ -1, 4, zz, -1 ]
14022/// PSRL : (little-endian) right bit shift.
14023/// [ 1, zz, 3, zz]
14024/// [ -1, -1, 7, zz]
14025/// PSLLDQ : (little-endian) left byte shift
14026/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14027/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14028/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14029/// PSRLDQ : (little-endian) right byte shift
14030/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14031/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14032/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14033static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14034 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14035 int MaskOffset, const APInt &Zeroable,
14036 const X86Subtarget &Subtarget) {
14037 int Size = Mask.size();
14038 unsigned SizeInBits = Size * ScalarSizeInBits;
14039
14040 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14041 for (int i = 0; i < Size; i += Scale)
14042 for (int j = 0; j < Shift; ++j)
14043 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14044 return false;
14045
14046 return true;
14047 };
14048
14049 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14050 for (int i = 0; i != Size; i += Scale) {
14051 unsigned Pos = Left ? i + Shift : i;
14052 unsigned Low = Left ? i : i + Shift;
14053 unsigned Len = Scale - Shift;
14054 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14055 return -1;
14056 }
14057
14058 int ShiftEltBits = ScalarSizeInBits * Scale;
14059 bool ByteShift = ShiftEltBits > 64;
14060 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14061 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14062 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14063
14064 // Normalize the scale for byte shifts to still produce an i64 element
14065 // type.
14066 Scale = ByteShift ? Scale / 2 : Scale;
14067
14068 // We need to round trip through the appropriate type for the shift.
14069 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14070 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14071 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14072 return (int)ShiftAmt;
14073 };
14074
14075 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14076 // keep doubling the size of the integer elements up to that. We can
14077 // then shift the elements of the integer vector by whole multiples of
14078 // their width within the elements of the larger integer vector. Test each
14079 // multiple to see if we can find a match with the moved element indices
14080 // and that the shifted in elements are all zeroable.
14081 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14082 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14083 for (int Shift = 1; Shift != Scale; ++Shift)
14084 for (bool Left : {true, false})
14085 if (CheckZeros(Shift, Scale, Left)) {
14086 int ShiftAmt = MatchShift(Shift, Scale, Left);
14087 if (0 < ShiftAmt)
14088 return ShiftAmt;
14089 }
14090
14091 // no match
14092 return -1;
14093}
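For intuition about the matching above, a standalone sketch of the documented PSHL example [ zz, 0, zz, 2 ] on v4i32: doubling the element size to i64 (Scale == 2) and shifting each 64-bit element left by one 32-bit slot (Shift == 1, Left == true) reproduces the mask. The values and the little-endian packing are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Src[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};

  // View the v4i32 as v2i64 (element 0 is the low half of the first i64) and
  // shift each 64-bit element left by 32 bits.
  uint64_t Q[2] = {
      (uint64_t)Src[0] | ((uint64_t)Src[1] << 32),
      (uint64_t)Src[2] | ((uint64_t)Src[3] << 32),
  };
  Q[0] <<= 32;
  Q[1] <<= 32;

  // Unpack back to v4i32: { 0, Src[0], 0, Src[2] }, i.e. the mask [ zz, 0, zz, 2 ].
  uint32_t Res[4] = {(uint32_t)Q[0], (uint32_t)(Q[0] >> 32),
                     (uint32_t)Q[1], (uint32_t)(Q[1] >> 32)};
  for (uint32_t R : Res)
    std::printf("0x%08x ", (unsigned)R);
  std::printf("\n"); // 0x00000000 0x11111111 0x00000000 0x33333333
}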
14094
14095static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14096 SDValue V2, ArrayRef<int> Mask,
14097 const APInt &Zeroable,
14098 const X86Subtarget &Subtarget,
14099 SelectionDAG &DAG, bool BitwiseOnly) {
14100 int Size = Mask.size();
14101 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14102
14103 MVT ShiftVT;
14104 SDValue V = V1;
14105 unsigned Opcode;
14106
14107 // Try to match shuffle against V1 shift.
14108 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14109 Mask, 0, Zeroable, Subtarget);
14110
14111 // If V1 failed, try to match shuffle against V2 shift.
14112 if (ShiftAmt < 0) {
14113 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14114 Mask, Size, Zeroable, Subtarget);
14115 V = V2;
14116 }
14117
14118 if (ShiftAmt < 0)
14119 return SDValue();
14120
14121 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14122 return SDValue();
14123
14124 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14125        "Illegal integer vector type");
14126 V = DAG.getBitcast(ShiftVT, V);
14127 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14128 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14129 return DAG.getBitcast(VT, V);
14130}
14131
14132// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14133// Remainder of lower half result is zero and upper half is all undef.
14134static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14135 ArrayRef<int> Mask, uint64_t &BitLen,
14136 uint64_t &BitIdx, const APInt &Zeroable) {
14137 int Size = Mask.size();
14138 int HalfSize = Size / 2;
14139 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14140 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14141
14142 // Upper half must be undefined.
14143 if (!isUndefUpperHalf(Mask))
14144 return false;
14145
14146 // Determine the extraction length from the part of the
14147 // lower half that isn't zeroable.
14148 int Len = HalfSize;
14149 for (; Len > 0; --Len)
14150 if (!Zeroable[Len - 1])
14151 break;
14152 assert(Len > 0 && "Zeroable shuffle mask");
14153
14154 // Attempt to match first Len sequential elements from the lower half.
14155 SDValue Src;
14156 int Idx = -1;
14157 for (int i = 0; i != Len; ++i) {
14158 int M = Mask[i];
14159 if (M == SM_SentinelUndef)
14160 continue;
14161 SDValue &V = (M < Size ? V1 : V2);
14162 M = M % Size;
14163
14164 // The extracted elements must start at a valid index and all mask
14165 // elements must be in the lower half.
14166 if (i > M || M >= HalfSize)
14167 return false;
14168
14169 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14170 Src = V;
14171 Idx = M - i;
14172 continue;
14173 }
14174 return false;
14175 }
14176
14177 if (!Src || Idx < 0)
14178 return false;
14179
14180 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14181 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14182 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14183 V1 = Src;
14184 return true;
14185}
14186
14187// INSERTQ: Extract lowest Len elements from lower half of second source and
14188// insert over first source, starting at Idx.
14189// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14190static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14191 ArrayRef<int> Mask, uint64_t &BitLen,
14192 uint64_t &BitIdx) {
14193 int Size = Mask.size();
14194 int HalfSize = Size / 2;
14195 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14196
14197 // Upper half must be undefined.
14198 if (!isUndefUpperHalf(Mask))
14199 return false;
14200
14201 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14202 SDValue Base;
14203
14204 // Attempt to match first source from mask before insertion point.
14205 if (isUndefInRange(Mask, 0, Idx)) {
14206 /* EMPTY */
14207 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14208 Base = V1;
14209 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14210 Base = V2;
14211 } else {
14212 continue;
14213 }
14214
14215 // Extend the extraction length looking to match both the insertion of
14216 // the second source and the remaining elements of the first.
14217 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14218 SDValue Insert;
14219 int Len = Hi - Idx;
14220
14221 // Match insertion.
14222 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14223 Insert = V1;
14224 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14225 Insert = V2;
14226 } else {
14227 continue;
14228 }
14229
14230 // Match the remaining elements of the lower half.
14231 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14232 /* EMPTY */
14233 } else if ((!Base || (Base == V1)) &&
14234 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14235 Base = V1;
14236 } else if ((!Base || (Base == V2)) &&
14237 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14238 Size + Hi)) {
14239 Base = V2;
14240 } else {
14241 continue;
14242 }
14243
14244 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14245 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14246 V1 = Base;
14247 V2 = Insert;
14248 return true;
14249 }
14250 }
14251
14252 return false;
14253}
14254
14255/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14256static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14257 SDValue V2, ArrayRef<int> Mask,
14258 const APInt &Zeroable, SelectionDAG &DAG) {
14259 uint64_t BitLen, BitIdx;
14260 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14261 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14262 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14263 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14264
14265 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14266 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14267 V2 ? V2 : DAG.getUNDEF(VT),
14268 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14269 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14270
14271 return SDValue();
14272}
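A small sketch of the immediate encoding shared by the two matchers above, assuming v8i16 (16-bit scalars) and masks chosen purely for illustration: an EXTRQ of elements { 2, 3 } and an INSERTQ of two V2 elements at position 1. Both fields are bit counts masked to 6 bits, as in the code above.

#include <cstdio>

int main() {
  const unsigned ScalarBits = 16;

  // EXTRQ: lower half of the result is { 2, 3, zz, zz }, upper half undef.
  unsigned Len = 2, Idx = 2;
  std::printf("EXTRQ   BitLen=%u BitIdx=%u\n",
              (Len * ScalarBits) & 0x3f,  // 32
              (Idx * ScalarBits) & 0x3f); // 32

  // INSERTQ: lower half is { 0, 8, 9, 3 }, i.e. A[0], B[0], B[1], A[3].
  Len = 2; Idx = 1;
  std::printf("INSERTQ BitLen=%u BitIdx=%u\n",
              (Len * ScalarBits) & 0x3f,  // 32
              (Idx * ScalarBits) & 0x3f); // 16
}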
14273
14274/// Lower a vector shuffle as a zero or any extension.
14275///
14276/// Given a specific number of elements, element bit width, and extension
14277/// stride, produce either a zero or any extension based on the available
14278 /// features of the subtarget. The extended elements are consecutive and
14279 /// can start from an offset element index in the input; to avoid excess
14280 /// shuffling, the offset must either be in the bottom lane or at the start
14281 /// of a higher lane. All extended elements must be from
14282 /// the same lane.
14283static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14284 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14285 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14286 assert(Scale > 1 && "Need a scale to extend.");
14287 int EltBits = VT.getScalarSizeInBits();
14288 int NumElements = VT.getVectorNumElements();
14289 int NumEltsPerLane = 128 / EltBits;
14290 int OffsetLane = Offset / NumEltsPerLane;
14291 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14292        "Only 8, 16, and 32 bit elements can be extended.");
14293 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14294 assert(0 <= Offset && "Extension offset must be positive.");
14295 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14296        "Extension offset must be in the first lane or start an upper lane.");
14297
14298 // Check that an index is in same lane as the base offset.
14299 auto SafeOffset = [&](int Idx) {
14300 return OffsetLane == (Idx / NumEltsPerLane);
14301 };
14302
14303 // Shift along an input so that the offset base moves to the first element.
14304 auto ShuffleOffset = [&](SDValue V) {
14305 if (!Offset)
14306 return V;
14307
14308 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14309 for (int i = 0; i * Scale < NumElements; ++i) {
14310 int SrcIdx = i + Offset;
14311 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14312 }
14313 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14314 };
14315
14316 // Found a valid a/zext mask! Try various lowering strategies based on the
14317 // input type and available ISA extensions.
14318 if (Subtarget.hasSSE41()) {
14319 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14320 // PUNPCK will catch this in a later shuffle match.
14321 if (Offset && Scale == 2 && VT.is128BitVector())
14322 return SDValue();
14323 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14324 NumElements / Scale);
14325 InputV = DAG.getBitcast(VT, InputV);
14326 InputV = ShuffleOffset(InputV);
14327 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14328 DL, ExtVT, InputV, DAG);
14329 return DAG.getBitcast(VT, InputV);
14330 }
14331
14332 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14333 InputV = DAG.getBitcast(VT, InputV);
14334
14335 // For any extends we can cheat for larger element sizes and use shuffle
14336 // instructions that can fold with a load and/or copy.
14337 if (AnyExt && EltBits == 32) {
14338 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14339 -1};
14340 return DAG.getBitcast(
14341 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14342 DAG.getBitcast(MVT::v4i32, InputV),
14343 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14344 }
14345 if (AnyExt && EltBits == 16 && Scale > 2) {
14346 int PSHUFDMask[4] = {Offset / 2, -1,
14347 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14348 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14349 DAG.getBitcast(MVT::v4i32, InputV),
14350 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14351 int PSHUFWMask[4] = {1, -1, -1, -1};
14352 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14353 return DAG.getBitcast(
14354 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14355 DAG.getBitcast(MVT::v8i16, InputV),
14356 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14357 }
14358
14359 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14360 // to 64-bits.
14361 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14362 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14363 assert(VT.is128BitVector() && "Unexpected vector width!");
14364
14365 int LoIdx = Offset * EltBits;
14366 SDValue Lo = DAG.getBitcast(
14367 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14368 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14369 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14370
14371 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14372 return DAG.getBitcast(VT, Lo);
14373
14374 int HiIdx = (Offset + 1) * EltBits;
14375 SDValue Hi = DAG.getBitcast(
14376 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14377 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14378 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14379 return DAG.getBitcast(VT,
14380 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14381 }
14382
14383 // If this would require more than 2 unpack instructions to expand, use
14384 // pshufb when available. We can only use more than 2 unpack instructions
14385 // when zero extending i8 elements which also makes it easier to use pshufb.
14386 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14387 assert(NumElements == 16 && "Unexpected byte vector width!");
14388 SDValue PSHUFBMask[16];
14389 for (int i = 0; i < 16; ++i) {
14390 int Idx = Offset + (i / Scale);
14391 if ((i % Scale == 0 && SafeOffset(Idx))) {
14392 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14393 continue;
14394 }
14395 PSHUFBMask[i] =
14396 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14397 }
14398 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14399 return DAG.getBitcast(
14400 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14401 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14402 }
14403
14404 // If we are extending from an offset, ensure we start on a boundary that
14405 // we can unpack from.
14406 int AlignToUnpack = Offset % (NumElements / Scale);
14407 if (AlignToUnpack) {
14408 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14409 for (int i = AlignToUnpack; i < NumElements; ++i)
14410 ShMask[i - AlignToUnpack] = i;
14411 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14412 Offset -= AlignToUnpack;
14413 }
14414
14415 // Otherwise emit a sequence of unpacks.
14416 do {
14417 unsigned UnpackLoHi = X86ISD::UNPCKL;
14418 if (Offset >= (NumElements / 2)) {
14419 UnpackLoHi = X86ISD::UNPCKH;
14420 Offset -= (NumElements / 2);
14421 }
14422
14423 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14424 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14425 : getZeroVector(InputVT, Subtarget, DAG, DL);
14426 InputV = DAG.getBitcast(InputVT, InputV);
14427 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14428 Scale /= 2;
14429 EltBits *= 2;
14430 NumElements /= 2;
14431 } while (Scale > 1);
14432 return DAG.getBitcast(VT, InputV);
14433}
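As a standalone illustration of the unpack fallback at the end of the routine above: zero-extending the low eight bytes of a v16i8 to v8i16 is a single UNPCKL with a zero vector, modeled here with plain arrays under the usual little-endian layout (the data values are illustrative).

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t In[16];
  for (int i = 0; i < 16; ++i)
    In[i] = (uint8_t)(i + 1);

  // punpcklbw In, zero: interleave the low 8 bytes of In with zero bytes.
  uint8_t Inter[16];
  for (int i = 0; i < 8; ++i) {
    Inter[2 * i] = In[i];
    Inter[2 * i + 1] = 0;
  }

  // Reading the result as v8i16 gives the zero-extended values 1..8.
  for (int i = 0; i < 8; ++i) {
    unsigned W = (unsigned)Inter[2 * i] | ((unsigned)Inter[2 * i + 1] << 8);
    std::printf("%u ", W);
  }
  std::printf("\n"); // 1 2 3 4 5 6 7 8
}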
14434
14435/// Try to lower a vector shuffle as a zero extension on any microarch.
14436///
14437/// This routine will try to do everything in its power to cleverly lower
14438/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14439/// check for the profitability of this lowering, it tries to aggressively
14440/// match this pattern. It will use all of the micro-architectural details it
14441/// can to emit an efficient lowering. It handles both blends with all-zero
14442/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14443/// masking out later).
14444///
14445/// The reason we have dedicated lowering for zext-style shuffles is that they
14446/// are both incredibly common and often quite performance sensitive.
14447static SDValue lowerShuffleAsZeroOrAnyExtend(
14448 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14449 const APInt &Zeroable, const X86Subtarget &Subtarget,
14450 SelectionDAG &DAG) {
14451 int Bits = VT.getSizeInBits();
14452 int NumLanes = Bits / 128;
14453 int NumElements = VT.getVectorNumElements();
14454 int NumEltsPerLane = NumElements / NumLanes;
14455 assert(VT.getScalarSizeInBits() <= 32 &&
14456        "Exceeds 32-bit integer zero extension limit");
14457 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14458
14459 // Define a helper function to check a particular ext-scale and lower to it if
14460 // valid.
14461 auto Lower = [&](int Scale) -> SDValue {
14462 SDValue InputV;
14463 bool AnyExt = true;
14464 int Offset = 0;
14465 int Matches = 0;
14466 for (int i = 0; i < NumElements; ++i) {
14467 int M = Mask[i];
14468 if (M < 0)
14469 continue; // Valid anywhere but doesn't tell us anything.
14470 if (i % Scale != 0) {
14471 // Each of the extended elements needs to be zeroable.
14472 if (!Zeroable[i])
14473 return SDValue();
14474
14475 // We no longer are in the anyext case.
14476 AnyExt = false;
14477 continue;
14478 }
14479
14480 // Each of the base elements needs to be consecutive indices into the
14481 // same input vector.
14482 SDValue V = M < NumElements ? V1 : V2;
14483 M = M % NumElements;
14484 if (!InputV) {
14485 InputV = V;
14486 Offset = M - (i / Scale);
14487 } else if (InputV != V)
14488 return SDValue(); // Flip-flopping inputs.
14489
14490 // Offset must start in the lowest 128-bit lane or at the start of an
14491 // upper lane.
14492 // FIXME: Is it ever worth allowing a negative base offset?
14493 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14494 (Offset % NumEltsPerLane) == 0))
14495 return SDValue();
14496
14497 // If we are offsetting, all referenced entries must come from the same
14498 // lane.
14499 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14500 return SDValue();
14501
14502 if ((M % NumElements) != (Offset + (i / Scale)))
14503 return SDValue(); // Non-consecutive strided elements.
14504 Matches++;
14505 }
14506
14507 // If we fail to find an input, we have a zero-shuffle which should always
14508 // have already been handled.
14509 // FIXME: Maybe handle this here in case during blending we end up with one?
14510 if (!InputV)
14511 return SDValue();
14512
14513 // If we are offsetting, don't extend if we only match a single input, we
14514 // can always do better by using a basic PSHUF or PUNPCK.
14515 if (Offset != 0 && Matches < 2)
14516 return SDValue();
14517
14518 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14519 InputV, Mask, Subtarget, DAG);
14520 };
14521
14522 // The widest scale possible for extending is to a 64-bit integer.
14523 assert(Bits % 64 == 0 &&
14524        "The number of bits in a vector must be divisible by 64 on x86!");
14525 int NumExtElements = Bits / 64;
14526
14527 // Each iteration, try extending the elements half as much, but into twice as
14528 // many elements.
14529 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14530 assert(NumElements % NumExtElements == 0 &&
14531        "The input vector size must be divisible by the extended size.");
14532 if (SDValue V = Lower(NumElements / NumExtElements))
14533 return V;
14534 }
14535
14536 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14537 if (Bits != 128)
14538 return SDValue();
14539
14540 // Returns one of the source operands if the shuffle can be reduced to a
14541 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14542 auto CanZExtLowHalf = [&]() {
14543 for (int i = NumElements / 2; i != NumElements; ++i)
14544 if (!Zeroable[i])
14545 return SDValue();
14546 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14547 return V1;
14548 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14549 return V2;
14550 return SDValue();
14551 };
14552
14553 if (SDValue V = CanZExtLowHalf()) {
14554 V = DAG.getBitcast(MVT::v2i64, V);
14555 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14556 return DAG.getBitcast(VT, V);
14557 }
14558
14559 // No viable ext lowering found.
14560 return SDValue();
14561}
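A tiny sketch of the scale search in the loop above, assuming a 128-bit v16i8 shuffle: extension candidates start at two 64-bit elements and double each iteration, so Lower() is tried with Scale = 8, then 4, then 2.

#include <cstdio>

int main() {
  int Bits = 128, NumElements = 16;
  for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
       NumExtElements *= 2)
    std::printf("try Scale = %d\n", NumElements / NumExtElements); // 8, 4, 2
}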
14562
14563/// Try to get a scalar value for a specific element of a vector.
14564///
14565/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14566static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14567 SelectionDAG &DAG) {
14568 MVT VT = V.getSimpleValueType();
14569 MVT EltVT = VT.getVectorElementType();
14570 V = peekThroughBitcasts(V);
14571
14572 // If the bitcasts shift the element size, we can't extract an equivalent
14573 // element from it.
14574 MVT NewVT = V.getSimpleValueType();
14575 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14576 return SDValue();
14577
14578 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14579 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14580 // Ensure the scalar operand is the same size as the destination.
14581 // FIXME: Add support for scalar truncation where possible.
14582 SDValue S = V.getOperand(Idx);
14583 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14584 return DAG.getBitcast(EltVT, S);
14585 }
14586
14587 return SDValue();
14588}
14589
14590/// Helper to test for a load that can be folded with x86 shuffles.
14591///
14592/// This is particularly important because the set of instructions varies
14593/// significantly based on whether the operand is a load or not.
14594static bool isShuffleFoldableLoad(SDValue V) {
14595 return V->hasOneUse() &&
14596 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14597}
14598
14599template<typename T>
14600static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14601 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14602}
14603
14604template<typename T>
14605bool X86TargetLowering::isSoftFP16(T VT) const {
14606 return ::isSoftFP16(VT, Subtarget);
14607}
14608
14609/// Try to lower insertion of a single element into a zero vector.
14610///
14611/// This is a common pattern that we have especially efficient patterns to lower
14612/// across all subtarget feature sets.
14613static SDValue lowerShuffleAsElementInsertion(
14614 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14615 const APInt &Zeroable, const X86Subtarget &Subtarget,
14616 SelectionDAG &DAG) {
14617 MVT ExtVT = VT;
14618 MVT EltVT = VT.getVectorElementType();
14619
14620 if (isSoftFP16(EltVT, Subtarget))
14621 return SDValue();
14622
14623 int V2Index =
14624 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14625 Mask.begin();
14626 bool IsV1Zeroable = true;
14627 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14628 if (i != V2Index && !Zeroable[i]) {
14629 IsV1Zeroable = false;
14630 break;
14631 }
14632
14633 // Check for a single input from a SCALAR_TO_VECTOR node.
14634 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14635 // all the smarts here sunk into that routine. However, the current
14636 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14637 // vector shuffle lowering is dead.
14638 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14639 DAG);
14640 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14641 // We need to zext the scalar if it is smaller than an i32.
14642 V2S = DAG.getBitcast(EltVT, V2S);
14643 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14644 // Using zext to expand a narrow element won't work for non-zero
14645 // insertions.
14646 if (!IsV1Zeroable)
14647 return SDValue();
14648
14649 // Zero-extend directly to i32.
14650 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14651 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14652 }
14653 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14654 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14655 EltVT == MVT::i16) {
14656 // Either not inserting from the low element of the input or the input
14657 // element size is too small to use VZEXT_MOVL to clear the high bits.
14658 return SDValue();
14659 }
14660
14661 if (!IsV1Zeroable) {
14662 // If V1 can't be treated as a zero vector we have fewer options to lower
14663 // this. We can't support integer vectors or non-zero targets cheaply, and
14664 // the V1 elements can't be permuted in any way.
14665 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14666 if (!VT.isFloatingPoint() || V2Index != 0)
14667 return SDValue();
14668 SmallVector<int, 8> V1Mask(Mask);
14669 V1Mask[V2Index] = -1;
14670 if (!isNoopShuffleMask(V1Mask))
14671 return SDValue();
14672 if (!VT.is128BitVector())
14673 return SDValue();
14674
14675 // Otherwise, use MOVSD, MOVSS or MOVSH.
14676 unsigned MovOpc = 0;
14677 if (EltVT == MVT::f16)
14678 MovOpc = X86ISD::MOVSH;
14679 else if (EltVT == MVT::f32)
14680 MovOpc = X86ISD::MOVSS;
14681 else if (EltVT == MVT::f64)
14682 MovOpc = X86ISD::MOVSD;
14683 else
14684 llvm_unreachable("Unsupported floating point element type to handle!");
14685 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14686 }
14687
14688 // This lowering only works for the low element with floating point vectors.
14689 if (VT.isFloatingPoint() && V2Index != 0)
14690 return SDValue();
14691
14692 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14693 if (ExtVT != VT)
14694 V2 = DAG.getBitcast(VT, V2);
14695
14696 if (V2Index != 0) {
14697 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14698 // the desired position. Otherwise it is more efficient to do a vector
14699 // shift left. We know that we can do a vector shift left because all
14700 // the inputs are zero.
14701 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14702 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14703 V2Shuffle[V2Index] = 0;
14704 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14705 } else {
14706 V2 = DAG.getBitcast(MVT::v16i8, V2);
14707 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14708 DAG.getTargetConstant(
14709 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14710 V2 = DAG.getBitcast(VT, V2);
14711 }
14712 }
14713 return V2;
14714}
14715
14716/// Try to lower broadcast of a single - truncated - integer element,
14717/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14718///
14719/// This assumes we have AVX2.
14720static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14721 int BroadcastIdx,
14722 const X86Subtarget &Subtarget,
14723 SelectionDAG &DAG) {
14724 assert(Subtarget.hasAVX2() &&
14725        "We can only lower integer broadcasts with AVX2!");
14726
14727 MVT EltVT = VT.getVectorElementType();
14728 MVT V0VT = V0.getSimpleValueType();
14729
14730 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14731 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14732
14733 MVT V0EltVT = V0VT.getVectorElementType();
14734 if (!V0EltVT.isInteger())
14735 return SDValue();
14736
14737 const unsigned EltSize = EltVT.getSizeInBits();
14738 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14739
14740 // This is only a truncation if the original element type is larger.
14741 if (V0EltSize <= EltSize)
14742 return SDValue();
14743
14744 assert(((V0EltSize % EltSize) == 0) &&
14745        "Scalar type sizes must all be powers of 2 on x86!");
14746
14747 const unsigned V0Opc = V0.getOpcode();
14748 const unsigned Scale = V0EltSize / EltSize;
14749 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14750
14751 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14752 V0Opc != ISD::BUILD_VECTOR)
14753 return SDValue();
14754
14755 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14756
14757 // If we're extracting non-least-significant bits, shift so we can truncate.
14758 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14759 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14760 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14761 if (const int OffsetIdx = BroadcastIdx % Scale)
14762 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14763 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14764
14765 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14766 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14767}
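A standalone sketch of the index arithmetic above, assuming we broadcast i16 element 3 of a vector built from i64 scalars (Scale == 4): the value lives in bits [48, 64) of 64-bit source element 0, so the scalar is shifted right before truncating. The constants are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Scalar = 0x4444333322221111ull;  // 64-bit source element 0
  unsigned EltBits = 16, BroadcastIdx = 3, Scale = 64 / EltBits;
  unsigned SrcElt = BroadcastIdx / Scale;     // 0
  unsigned OffsetIdx = BroadcastIdx % Scale;  // 3
  uint16_t Splat = (uint16_t)(Scalar >> (OffsetIdx * EltBits));
  std::printf("source element %u, broadcast value 0x%04x\n", SrcElt,
              (unsigned)Splat); // source element 0, broadcast value 0x4444
}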
14768
14769/// Test whether this can be lowered with a single SHUFPS instruction.
14770///
14771/// This is used to disable more specialized lowerings when the shufps lowering
14772/// will happen to be efficient.
14773static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14774 // This routine only handles 128-bit shufps.
14775 assert(Mask.size() == 4 && "Unsupported mask size!");
14776 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14777 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14778 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14779 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14780
14781 // To lower with a single SHUFPS we need to have the low half and high half
14782 // each requiring a single input.
14783 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14784 return false;
14785 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14786 return false;
14787
14788 return true;
14789}
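The same test, written as a standalone checker over plain arrays (indices 0..3 select V1, 4..7 select V2, -1 is undef); the function name is illustrative.

#include <cstdio>

static bool isSingleShufpsSketch(const int M[4]) {
  // Each half of the mask must draw from a single source.
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  int A[4] = {0, 3, 4, 7}; // low half from V1, high half from V2: one SHUFPS
  int B[4] = {0, 4, 1, 5}; // both halves mix inputs: needs more than one op
  std::printf("%d %d\n", isSingleShufpsSketch(A), isSingleShufpsSketch(B)); // 1 0
}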
14790
14791/// Test whether the specified input (0 or 1) is in-place blended by the
14792/// given mask.
14793///
14794/// This returns true if the elements from a particular input are already in the
14795/// slot required by the given mask and require no permutation.
14796static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14797 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14798 int Size = Mask.size();
14799 for (int i = 0; i < Size; ++i)
14800 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14801 return false;
14802
14803 return true;
14804}
14805
14806/// If we are extracting two 128-bit halves of a vector and shuffling the
14807/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14808/// multi-shuffle lowering.
14809static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14810 SDValue N1, ArrayRef<int> Mask,
14811 SelectionDAG &DAG) {
14812 MVT VT = N0.getSimpleValueType();
14813 assert((VT.is128BitVector() &&
14814         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14815        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14816
14817 // Check that both sources are extracts of the same source vector.
14818 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14819 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14820 N0.getOperand(0) != N1.getOperand(0) ||
14821 !N0.hasOneUse() || !N1.hasOneUse())
14822 return SDValue();
14823
14824 SDValue WideVec = N0.getOperand(0);
14825 MVT WideVT = WideVec.getSimpleValueType();
14826 if (!WideVT.is256BitVector())
14827 return SDValue();
14828
14829 // Match extracts of each half of the wide source vector. Commute the shuffle
14830 // if the extract of the low half is N1.
14831 unsigned NumElts = VT.getVectorNumElements();
14832 SmallVector<int, 4> NewMask(Mask);
14833 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14834 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14835 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14836 ShuffleVectorSDNode::commuteMask(NewMask);
14837 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14838 return SDValue();
14839
14840 // Final bailout: if the mask is simple, we are better off using an extract
14841 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14842 // because that avoids a constant load from memory.
14843 if (NumElts == 4 &&
14844 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14845 return SDValue();
14846
14847 // Extend the shuffle mask with undef elements.
14848 NewMask.append(NumElts, -1);
14849
14850 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14851 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14852 NewMask);
14853 // This is free: ymm -> xmm.
14854 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14855 DAG.getIntPtrConstant(0, DL));
14856}
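// Illustrative sketch (added commentary, not part of the analyzed source):
// for the 4-element callers, given a narrow mask over two 128-bit halves
// extracted from the same 256-bit source, the transform above only needs to
// (a) commute operand references when the low-half extract arrived as N1 and
// (b) pad the mask with undef lanes. The helper name and the use of plain
// ints instead of SDValues are assumptions; -1 denotes an undef lane.
static void buildWideVpermMask(const int NarrowMask[4], bool LowHalfIsSecond,
                               int WideMask[8]) {
  for (int i = 0; i != 4; ++i) {
    int M = NarrowMask[i];
    // Swap which half each defined index refers to if the operands arrived
    // commuted (low-half extract as the second shuffle input).
    if (LowHalfIsSecond && M >= 0)
      M = (M < 4) ? M + 4 : M - 4;
    // Indices 0..7 now address the wide 8-element source directly.
    WideMask[i] = M;
  }
  // The upper half of the wide shuffle result is never read: leave it undef.
  for (int i = 4; i != 8; ++i)
    WideMask[i] = -1;
}
// E.g. {1, 5, 3, 7} with the extracts in canonical order becomes
// {1, 5, 3, 7, -1, -1, -1, -1}, i.e. a single cross-lane shuffle mask.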
14857
14858/// Try to lower broadcast of a single element.
14859///
14860/// For convenience, this code also bundles all of the subtarget feature set
14861/// filtering. While a little annoying to re-dispatch on type here, there isn't
14862/// a convenient way to factor it out.
14863static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14864 SDValue V2, ArrayRef<int> Mask,
14865 const X86Subtarget &Subtarget,
14866 SelectionDAG &DAG) {
14867 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14868 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14869 (Subtarget.hasAVX2() && VT.isInteger())))
14870 return SDValue();
14871
14872 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14873 // we can only broadcast from a register with AVX2.
14874 unsigned NumEltBits = VT.getScalarSizeInBits();
14875 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14876 ? X86ISD::MOVDDUP
14877 : X86ISD::VBROADCAST;
14878 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14879
14880 // Check that the mask is a broadcast.
14881 int BroadcastIdx = getSplatIndex(Mask);
14882 if (BroadcastIdx < 0)
14883 return SDValue();
14884 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14885 "a sorted mask where the broadcast "
14886 "comes from V1.");
14887
14888 // Go up the chain of (vector) values to find a scalar load that we can
14889 // combine with the broadcast.
14890 // TODO: Combine this logic with findEltLoadSrc() used by
14891 // EltsFromConsecutiveLoads().
14892 int BitOffset = BroadcastIdx * NumEltBits;
14893 SDValue V = V1;
14894 for (;;) {
14895 switch (V.getOpcode()) {
14896 case ISD::BITCAST: {
14897 V = V.getOperand(0);
14898 continue;
14899 }
14900 case ISD::CONCAT_VECTORS: {
14901 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14902 int OpIdx = BitOffset / OpBitWidth;
14903 V = V.getOperand(OpIdx);
14904 BitOffset %= OpBitWidth;
14905 continue;
14906 }
14907 case ISD::EXTRACT_SUBVECTOR: {
14908 // The extraction index adds to the existing offset.
14909 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14910 unsigned Idx = V.getConstantOperandVal(1);
14911 unsigned BeginOffset = Idx * EltBitWidth;
14912 BitOffset += BeginOffset;
14913 V = V.getOperand(0);
14914 continue;
14915 }
14916 case ISD::INSERT_SUBVECTOR: {
14917 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14918 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14919 int Idx = (int)V.getConstantOperandVal(2);
14920 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14921 int BeginOffset = Idx * EltBitWidth;
14922 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14923 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14924 BitOffset -= BeginOffset;
14925 V = VInner;
14926 } else {
14927 V = VOuter;
14928 }
14929 continue;
14930 }
14931 }
14932 break;
14933 }
14934 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14935 BroadcastIdx = BitOffset / NumEltBits;
14936
14937 // Do we need to bitcast the source to retrieve the original broadcast index?
14938 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14939
14940 // Check if this is a broadcast of a scalar. We special case lowering
14941 // for scalars so that we can more effectively fold with loads.
14942 // If the original value has a larger element type than the shuffle, the
14943 // broadcast element is in essence truncated. Make that explicit to ease
14944 // folding.
14945 if (BitCastSrc && VT.isInteger())
14946 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14947 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14948 return TruncBroadcast;
14949
14950 // Also check the simpler case, where we can directly reuse the scalar.
14951 if (!BitCastSrc &&
14952 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14953 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14954 V = V.getOperand(BroadcastIdx);
14955
14956 // If we can't broadcast from a register, check that the input is a load.
14957 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14958 return SDValue();
14959 } else if (ISD::isNormalLoad(V.getNode()) &&
14960 cast<LoadSDNode>(V)->isSimple()) {
14961 // We do not check for one-use of the vector load because a broadcast load
14962 // is expected to be a win for code size, register pressure, and possibly
14963 // uops even if the original vector load is not eliminated.
14964
14965 // Reduce the vector load and shuffle to a broadcasted scalar load.
14966 LoadSDNode *Ld = cast<LoadSDNode>(V);
14967 SDValue BaseAddr = Ld->getOperand(1);
14968 MVT SVT = VT.getScalarType();
14969 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14970 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14971 SDValue NewAddr =
14972 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14973
14974 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14975 // than MOVDDUP.
14976 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14977 if (Opcode == X86ISD::VBROADCAST) {
14978 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14979 SDValue Ops[] = {Ld->getChain(), NewAddr};
14980 V = DAG.getMemIntrinsicNode(
14981 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14982 DAG.getMachineFunction().getMachineMemOperand(
14983 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14984 DAG.makeEquivalentMemoryOrdering(Ld, V);
14985 return DAG.getBitcast(VT, V);
14986 }
14987 assert(SVT == MVT::f64 && "Unexpected VT!");
14988 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14989 DAG.getMachineFunction().getMachineMemOperand(
14990 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14991 DAG.makeEquivalentMemoryOrdering(Ld, V);
14992 } else if (!BroadcastFromReg) {
14993 // We can't broadcast from a vector register.
14994 return SDValue();
14995 } else if (BitOffset != 0) {
14996 // We can only broadcast from the zero-element of a vector register,
14997 // but it can be advantageous to broadcast from the zero-element of a
14998 // subvector.
14999 if (!VT.is256BitVector() && !VT.is512BitVector())
15000 return SDValue();
15001
15002 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15003 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15004 return SDValue();
15005
15006 // Only broadcast the zero-element of a 128-bit subvector.
15007 if ((BitOffset % 128) != 0)
15008 return SDValue();
15009
15010 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15011 "Unexpected bit-offset");
15012 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15013 "Unexpected vector size");
15014 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15015 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15016 }
15017
15018 // On AVX we can use VBROADCAST directly for scalar sources.
15019 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15020 V = DAG.getBitcast(MVT::f64, V);
15021 if (Subtarget.hasAVX()) {
15022 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15023 return DAG.getBitcast(VT, V);
15024 }
15025 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15026 }
15027
15028 // If this is a scalar, do the broadcast on this type and bitcast.
15029 if (!V.getValueType().isVector()) {
15030 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15031 "Unexpected scalar size");
15032 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15033 VT.getVectorNumElements());
15034 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15035 }
15036
15037 // We only support broadcasting from 128-bit vectors to minimize the
15038 // number of patterns we need to deal with in isel. So extract down to
15039 // 128-bits, removing as many bitcasts as possible.
15040 if (V.getValueSizeInBits() > 128)
15041 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15042
15043 // Otherwise cast V to a vector with the same element type as VT, but
15044 // possibly narrower than VT. Then perform the broadcast.
15045 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15046 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15047 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15048}
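// Standalone sketch of the bit-offset bookkeeping performed by the loop above
// (the struct and member names are assumptions, not LLVM APIs): walking into
// an extract_subvector adds the subvector's starting element offset, while
// walking into a concat_vectors selects one operand and reduces the offset
// modulo that operand's width.
struct BroadcastOffsetWalk {
  int BitOffset = 0;
  // Start from the broadcasted element index and the shuffle's element width.
  void start(int BroadcastIdx, int EltBits) { BitOffset = BroadcastIdx * EltBits; }
  // extract_subvector starting at element SubIdx of an EltBits-wide element type.
  void throughExtractSubvector(int SubIdx, int EltBits) {
    BitOffset += SubIdx * EltBits;
  }
  // concat_vectors of operands that are each OpBits wide; returns which
  // operand now holds the broadcasted element.
  int throughConcat(int OpBits) {
    int OpIdx = BitOffset / OpBits;
    BitOffset %= OpBits;
    return OpIdx;
  }
};
// E.g. broadcasting element 5 of a v8i32 built as concat(v4i32 A, v4i32 B):
// start(5, 32) gives BitOffset 160, throughConcat(128) selects operand 1 (B)
// and leaves BitOffset 32, i.e. element 1 of B.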
15049
15050// Check for whether we can use INSERTPS to perform the shuffle. We only use
15051// INSERTPS when the V1 elements are already in the correct locations
15052// because otherwise we can just always use two SHUFPS instructions which
15053// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15054// perform INSERTPS if a single V1 element is out of place and all V2
15055// elements are zeroable.
15056static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15057 unsigned &InsertPSMask,
15058 const APInt &Zeroable,
15059 ArrayRef<int> Mask, SelectionDAG &DAG) {
15060 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15061 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15062 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15063
15064 // Attempt to match INSERTPS with one element from VA or VB being
15065 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15066 // are updated.
15067 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15068 ArrayRef<int> CandidateMask) {
15069 unsigned ZMask = 0;
15070 int VADstIndex = -1;
15071 int VBDstIndex = -1;
15072 bool VAUsedInPlace = false;
15073
15074 for (int i = 0; i < 4; ++i) {
15075 // Synthesize a zero mask from the zeroable elements (includes undefs).
15076 if (Zeroable[i]) {
15077 ZMask |= 1 << i;
15078 continue;
15079 }
15080
15081 // Flag if we use any VA inputs in place.
15082 if (i == CandidateMask[i]) {
15083 VAUsedInPlace = true;
15084 continue;
15085 }
15086
15087 // We can only insert a single non-zeroable element.
15088 if (VADstIndex >= 0 || VBDstIndex >= 0)
15089 return false;
15090
15091 if (CandidateMask[i] < 4) {
15092 // VA input out of place for insertion.
15093 VADstIndex = i;
15094 } else {
15095 // VB input for insertion.
15096 VBDstIndex = i;
15097 }
15098 }
15099
15100 // Don't bother if we have no (non-zeroable) element for insertion.
15101 if (VADstIndex < 0 && VBDstIndex < 0)
15102 return false;
15103
15104 // Determine element insertion src/dst indices. The src index is from the
15105 // start of the inserted vector, not the start of the concatenated vector.
15106 unsigned VBSrcIndex = 0;
15107 if (VADstIndex >= 0) {
15108 // If we have a VA input out of place, we use VA as the V2 element
15109 // insertion and don't use the original V2 at all.
15110 VBSrcIndex = CandidateMask[VADstIndex];
15111 VBDstIndex = VADstIndex;
15112 VB = VA;
15113 } else {
15114 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15115 }
15116
15117 // If no V1 inputs are used in place, then the result is created only from
15118 // the zero mask and the V2 insertion - so remove V1 dependency.
15119 if (!VAUsedInPlace)
15120 VA = DAG.getUNDEF(MVT::v4f32);
15121
15122 // Update V1, V2 and InsertPSMask accordingly.
15123 V1 = VA;
15124 V2 = VB;
15125
15126 // Insert the V2 element into the desired position.
15127 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15128 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15129 return true;
15130 };
15131
15132 if (matchAsInsertPS(V1, V2, Mask))
15133 return true;
15134
15135 // Commute and try again.
15136 SmallVector<int, 4> CommutedMask(Mask);
15137 ShuffleVectorSDNode::commuteMask(CommutedMask);
15138 if (matchAsInsertPS(V2, V1, CommutedMask))
15139 return true;
15140
15141 return false;
15142}
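// Minimal sketch of the INSERTPS immediate layout assembled above (the helper
// name is an assumption): bits [7:6] select the source element of the second
// operand, bits [5:4] select the destination lane, and bits [3:0] form the
// zero mask applied to the result.
static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}
// E.g. inserting element 2 of V2 into lane 1 of V1 while zeroing lane 3:
// encodeInsertPSImm(2, 1, 0x8) == 0x98.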
15143
15144static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15145 ArrayRef<int> Mask, const APInt &Zeroable,
15146 SelectionDAG &DAG) {
15147 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15148 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15149
15150 // Attempt to match the insertps pattern.
15151 unsigned InsertPSMask = 0;
15152 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15153 return SDValue();
15154
15155 // Insert the V2 element into the desired position.
15156 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15157 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15158}
15159
15160/// Handle lowering of 2-lane 64-bit floating point shuffles.
15161///
15162/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15163/// support for floating point shuffles but not integer shuffles. These
15164/// instructions will incur a domain crossing penalty on some chips though so
15165/// it is better to avoid lowering through this for integer vectors where
15166/// possible.
15167static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15168 const APInt &Zeroable, SDValue V1, SDValue V2,
15169 const X86Subtarget &Subtarget,
15170 SelectionDAG &DAG) {
15171 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15172 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15173 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15174
15175 if (V2.isUndef()) {
15176 // Check for being able to broadcast a single element.
15177 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15178 Mask, Subtarget, DAG))
15179 return Broadcast;
15180
15181 // Straight shuffle of a single input vector. Simulate this by using the
15182 // single input as both of the "inputs" to this instruction.
15183 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15184
15185 if (Subtarget.hasAVX()) {
15186 // If we have AVX, we can use VPERMILPS which will allow folding a load
15187 // into the shuffle.
15188 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15189 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15190 }
15191
15192 return DAG.getNode(
15193 X86ISD::SHUFP, DL, MVT::v2f64,
15194 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15195 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15196 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15197 }
15198 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15199 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15200 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15201 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15202
15203 if (Subtarget.hasAVX2())
15204 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15205 return Extract;
15206
15207 // When loading a scalar and then shuffling it into a vector we can often do
15208 // the insertion cheaply.
15209 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15210 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15211 return Insertion;
15212 // Try inverting the insertion since for v2 masks it is easy to do and we
15213 // can't reliably sort the mask one way or the other.
15214 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15215 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15216 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15217 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15218 return Insertion;
15219
15220 // Try to use one of the special instruction patterns to handle two common
15221 // blend patterns if a zero-blend above didn't work.
15222 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15223 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15224 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15225 // We can either use a special instruction to load over the low double or
15226 // to move just the low double.
15227 return DAG.getNode(
15228 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15229 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15230
15231 if (Subtarget.hasSSE41())
15232 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15233 Zeroable, Subtarget, DAG))
15234 return Blend;
15235
15236 // Use dedicated unpack instructions for masks that match their pattern.
15237 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15238 return V;
15239
15240 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15241 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15242 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15243}
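// Small standalone sketch (assumed names) of the two-bit SHUFPD immediate
// built above for the two-input case: bit 0 picks lane Mask[0] (0 or 1) of
// V1 for the low result element, bit 1 picks lane Mask[1]-2 of V2 for the
// high result element.
static unsigned encodeShufPDImm(int Mask0, int Mask1) {
  return (unsigned)(Mask0 == 1) | ((unsigned)(Mask1 - 2 == 1) << 1);
}
// E.g. the mask {1, 2} (V1[1] then V2[0]) yields encodeShufPDImm(1, 2) == 1.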
15244
15245/// Handle lowering of 2-lane 64-bit integer shuffles.
15246///
15247/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15248/// the integer unit to minimize domain crossing penalties. However, for blends
15249/// it falls back to the floating point shuffle operation with appropriate bit
15250/// casting.
15251static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15252 const APInt &Zeroable, SDValue V1, SDValue V2,
15253 const X86Subtarget &Subtarget,
15254 SelectionDAG &DAG) {
15255 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15256 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15257 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15258
15259 if (V2.isUndef()) {
15260 // Check for being able to broadcast a single element.
15261 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15262 Mask, Subtarget, DAG))
15263 return Broadcast;
15264
15265 // Straight shuffle of a single input vector. For everything from SSE2
15266 // onward this has a single fast instruction with no scary immediates.
15267 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15268 V1 = DAG.getBitcast(MVT::v4i32, V1);
15269 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15270 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15271 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15272 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15273 return DAG.getBitcast(
15274 MVT::v2i64,
15275 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15276 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15277 }
15278 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15279 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15280 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15281 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15282
15283 if (Subtarget.hasAVX2())
15284 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15285 return Extract;
15286
15287 // Try to use shift instructions.
15288 if (SDValue Shift =
15289 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15290 DAG, /*BitwiseOnly*/ false))
15291 return Shift;
15292
15293 // When loading a scalar and then shuffling it into a vector we can often do
15294 // the insertion cheaply.
15295 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15296 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15297 return Insertion;
15298 // Try inverting the insertion since for v2 masks it is easy to do and we
15299 // can't reliably sort the mask one way or the other.
15300 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15301 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15302 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15303 return Insertion;
15304
15305 // We have different paths for blend lowering, but they all must use the
15306 // *exact* same predicate.
15307 bool IsBlendSupported = Subtarget.hasSSE41();
15308 if (IsBlendSupported)
15309 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15310 Zeroable, Subtarget, DAG))
15311 return Blend;
15312
15313 // Use dedicated unpack instructions for masks that match their pattern.
15314 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15315 return V;
15316
15317 // Try to use byte rotation instructions.
15318 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15319 if (Subtarget.hasSSSE3()) {
15320 if (Subtarget.hasVLX())
15321 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15322 Subtarget, DAG))
15323 return Rotate;
15324
15325 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15326 Subtarget, DAG))
15327 return Rotate;
15328 }
15329
15330 // If we have direct support for blends, we should lower by decomposing into
15331 // a permute. That will be faster than the domain cross.
15332 if (IsBlendSupported)
15333 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15334 Subtarget, DAG);
15335
15336 // We implement this with SHUFPD which is pretty lame because it will likely
15337 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15338 // However, all the alternatives are still more cycles and newer chips don't
15339 // have this problem. It would be really nice if x86 had better shuffles here.
15340 V1 = DAG.getBitcast(MVT::v2f64, V1);
15341 V2 = DAG.getBitcast(MVT::v2f64, V2);
15342 return DAG.getBitcast(MVT::v2i64,
15343 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15344}
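// Illustrative sketch (assumed standalone helper) of the mask widening used
// for the single-input v2i64 case above: each 64-bit lane index M expands to
// the 32-bit lane pair {2*M, 2*M+1}, with undef (-1) propagated, so the
// shuffle can be issued as a v4i32 PSHUFD.
static void widenV2ToV4Mask(const int Mask[2], int Widened[4]) {
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}
// E.g. the v2i64 mask {1, 0} becomes the v4i32 mask {2, 3, 0, 1}.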
15345
15346/// Lower a vector shuffle using the SHUFPS instruction.
15347///
15348/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15349 /// It makes no assumptions about whether this is the *best* lowering; it simply
15350 /// uses it.
15351static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15352 ArrayRef<int> Mask, SDValue V1,
15353 SDValue V2, SelectionDAG &DAG) {
15354 SDValue LowV = V1, HighV = V2;
15355 SmallVector<int, 4> NewMask(Mask);
15356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15357
15358 if (NumV2Elements == 1) {
15359 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15360
15361 // Compute the index adjacent to V2Index and in the same half by toggling
15362 // the low bit.
15363 int V2AdjIndex = V2Index ^ 1;
15364
15365 if (Mask[V2AdjIndex] < 0) {
15366 // Handles all the cases where we have a single V2 element and an undef.
15367 // This will only ever happen in the high lanes because we commute the
15368 // vector otherwise.
15369 if (V2Index < 2)
15370 std::swap(LowV, HighV);
15371 NewMask[V2Index] -= 4;
15372 } else {
15373 // Handle the case where the V2 element ends up adjacent to a V1 element.
15374 // To make this work, blend them together as the first step.
15375 int V1Index = V2AdjIndex;
15376 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15377 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15378 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15379
15380 // Now proceed to reconstruct the final blend as we have the necessary
15381 // high or low half formed.
15382 if (V2Index < 2) {
15383 LowV = V2;
15384 HighV = V1;
15385 } else {
15386 HighV = V2;
15387 }
15388 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15389 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15390 }
15391 } else if (NumV2Elements == 2) {
15392 if (Mask[0] < 4 && Mask[1] < 4) {
15393 // Handle the easy case where we have V1 in the low lanes and V2 in the
15394 // high lanes.
15395 NewMask[2] -= 4;
15396 NewMask[3] -= 4;
15397 } else if (Mask[2] < 4 && Mask[3] < 4) {
15398 // We also handle the reversed case because this utility may get called
15399 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15400 // arrange things in the right direction.
15401 NewMask[0] -= 4;
15402 NewMask[1] -= 4;
15403 HighV = V1;
15404 LowV = V2;
15405 } else {
15406 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15407 // trying to place elements directly, just blend them and set up the final
15408 // shuffle to place them.
15409
15410 // The first two blend mask elements are for V1, the second two are for
15411 // V2.
15412 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15413 Mask[2] < 4 ? Mask[2] : Mask[3],
15414 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15415 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15416 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15417 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15418
15419 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15420 // a blend.
15421 LowV = HighV = V1;
15422 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15423 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15424 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15425 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15426 }
15427 } else if (NumV2Elements == 3) {
15428 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15429 // we can get here due to other paths (e.g repeated mask matching) that we
15430 // don't want to do another round of lowerVECTOR_SHUFFLE.
15431 ShuffleVectorSDNode::commuteMask(NewMask);
15432 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15433 }
15434 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15435 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15436}
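// Sketch of the 8-bit immediate format targeted by getV4X86ShuffleImm8ForMask
// (this helper is an illustrative stand-in, not the LLVM routine): each
// destination lane i takes a 2-bit source selector at bits [2*i+1 : 2*i].
// Undef lanes are mapped to 0 here, which is one valid choice.
static unsigned encodeV4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : (Mask[i] & 3); // 2-bit selector per lane
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}
// E.g. the reversal mask {3, 2, 1, 0} encodes as 0x1B.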
15437
15438/// Lower 4-lane 32-bit floating point shuffles.
15439///
15440/// Uses instructions exclusively from the floating point unit to minimize
15441/// domain crossing penalties, as these are sufficient to implement all v4f32
15442/// shuffles.
15443static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15444 const APInt &Zeroable, SDValue V1, SDValue V2,
15445 const X86Subtarget &Subtarget,
15446 SelectionDAG &DAG) {
15447 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15448 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15449 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15450
15451 if (Subtarget.hasSSE41())
15452 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15453 Zeroable, Subtarget, DAG))
15454 return Blend;
15455
15456 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15457
15458 if (NumV2Elements == 0) {
15459 // Check for being able to broadcast a single element.
15460 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15461 Mask, Subtarget, DAG))
15462 return Broadcast;
15463
15464 // Use even/odd duplicate instructions for masks that match their pattern.
15465 if (Subtarget.hasSSE3()) {
15466 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15467 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15468 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15469 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15470 }
15471
15472 if (Subtarget.hasAVX()) {
15473 // If we have AVX, we can use VPERMILPS which will allow folding a load
15474 // into the shuffle.
15475 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15476 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15477 }
15478
15479 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15480 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15481 if (!Subtarget.hasSSE2()) {
15482 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15483 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15484 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15485 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15486 }
15487
15488 // Otherwise, use a straight shuffle of a single input vector. We pass the
15489 // input vector to both operands to simulate this with a SHUFPS.
15490 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15491 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15492 }
15493
15494 if (Subtarget.hasSSE2())
15495 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15496 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15497 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15498 return ZExt;
15499 }
15500
15501 if (Subtarget.hasAVX2())
15502 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15503 return Extract;
15504
15505 // There are special ways we can lower some single-element blends. However, we
15506 // have custom ways we can lower more complex single-element blends below that
15507 // we defer to if both this and BLENDPS fail to match, so restrict this to
15508 // when the V2 input is targeting element 0 of the mask -- that is the fast
15509 // case here.
15510 if (NumV2Elements == 1 && Mask[0] >= 4)
15511 if (SDValue V = lowerShuffleAsElementInsertion(
15512 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15513 return V;
15514
15515 if (Subtarget.hasSSE41()) {
15516 // Use INSERTPS if we can complete the shuffle efficiently.
15517 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15518 return V;
15519
15520 if (!isSingleSHUFPSMask(Mask))
15521 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15522 V2, Mask, DAG))
15523 return BlendPerm;
15524 }
15525
15526 // Use low/high mov instructions. These are only valid in SSE1 because
15527 // otherwise they are widened to v2f64 and never get here.
15528 if (!Subtarget.hasSSE2()) {
15529 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15530 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15531 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15532 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15533 }
15534
15535 // Use dedicated unpack instructions for masks that match their pattern.
15536 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15537 return V;
15538
15539 // Otherwise fall back to a SHUFPS lowering strategy.
15540 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15541}
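// Minimal sketch (assumed standalone helper) of the shape of check that
// isShuffleEquivalent performs for the MOVSLDUP/MOVSHDUP and MOVLHPS/MOVHLPS
// cases above: every defined mask element must equal the expected pattern,
// while undef (-1) lanes may match anything. The real routine also looks
// through equivalent build_vector operands, which is omitted here.
static bool matchesMaskAllowingUndef(const int Mask[4], const int Expected[4]) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false;
  return true;
}
// E.g. {0, -1, 2, 2} matches the MOVSLDUP pattern {0, 0, 2, 2}.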
15542
15543/// Lower 4-lane i32 vector shuffles.
15544///
15545/// We try to handle these with integer-domain shuffles where we can, but for
15546/// blends we use the floating point domain blend instructions.
15547static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15548 const APInt &Zeroable, SDValue V1, SDValue V2,
15549 const X86Subtarget &Subtarget,
15550 SelectionDAG &DAG) {
15551 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15552 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15553 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15554
15555 // Whenever we can lower this as a zext, that instruction is strictly faster
15556 // than any alternative. It also allows us to fold memory operands into the
15557 // shuffle in many cases.
15558 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15559 Zeroable, Subtarget, DAG))
15560 return ZExt;
15561
15562 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15563
15564 // Try to use shift instructions if fast.
15565 if (Subtarget.preferLowerShuffleAsShift()) {
15566 if (SDValue Shift =
15567 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15568 Subtarget, DAG, /*BitwiseOnly*/ true))
15569 return Shift;
15570 if (NumV2Elements == 0)
15571 if (SDValue Rotate =
15572 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15573 return Rotate;
15574 }
15575
15576 if (NumV2Elements == 0) {
15577 // Try to use broadcast unless the mask only has one non-undef element.
15578 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15579 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15580 Mask, Subtarget, DAG))
15581 return Broadcast;
15582 }
15583
15584 // Straight shuffle of a single input vector. For everything from SSE2
15585 // onward this has a single fast instruction with no scary immediates.
15586 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15587 // but we aren't actually going to use the UNPCK instruction because doing
15588 // so prevents folding a load into this instruction or making a copy.
15589 const int UnpackLoMask[] = {0, 0, 1, 1};
15590 const int UnpackHiMask[] = {2, 2, 3, 3};
15591 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15592 Mask = UnpackLoMask;
15593 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15594 Mask = UnpackHiMask;
15595
15596 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15597 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15598 }
15599
15600 if (Subtarget.hasAVX2())
15601 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15602 return Extract;
15603
15604 // Try to use shift instructions.
15605 if (SDValue Shift =
15606 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15607 DAG, /*BitwiseOnly*/ false))
15608 return Shift;
15609
15610 // There are special ways we can lower some single-element blends.
15611 if (NumV2Elements == 1)
15612 if (SDValue V = lowerShuffleAsElementInsertion(
15613 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15614 return V;
15615
15616 // We have different paths for blend lowering, but they all must use the
15617 // *exact* same predicate.
15618 bool IsBlendSupported = Subtarget.hasSSE41();
15619 if (IsBlendSupported)
15620 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15621 Zeroable, Subtarget, DAG))
15622 return Blend;
15623
15624 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15625 Zeroable, Subtarget, DAG))
15626 return Masked;
15627
15628 // Use dedicated unpack instructions for masks that match their pattern.
15629 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15630 return V;
15631
15632 // Try to use byte rotation instructions.
15633 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15634 if (Subtarget.hasSSSE3()) {
15635 if (Subtarget.hasVLX())
15636 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15637 Subtarget, DAG))
15638 return Rotate;
15639
15640 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15641 Subtarget, DAG))
15642 return Rotate;
15643 }
15644
15645 // Assume that a single SHUFPS is faster than an alternative sequence of
15646 // multiple instructions (even if the CPU has a domain penalty).
15647 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15648 if (!isSingleSHUFPSMask(Mask)) {
15649 // If we have direct support for blends, we should lower by decomposing into
15650 // a permute. That will be faster than the domain cross.
15651 if (IsBlendSupported)
15652 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15653 Subtarget, DAG);
15654
15655 // Try to lower by permuting the inputs into an unpack instruction.
15656 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15657 Mask, Subtarget, DAG))
15658 return Unpack;
15659 }
15660
15661 // We implement this with SHUFPS because it can blend from two vectors.
15662 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15663 // up the inputs, bypassing domain shift penalties that we would incur if we
15664 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15665 // relevant.
15666 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15667 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15668 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15669 return DAG.getBitcast(MVT::v4i32, ShufPS);
15670}
15671
15672/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15673/// shuffle lowering, and the most complex part.
15674///
15675/// The lowering strategy is to try to form pairs of input lanes which are
15676/// targeted at the same half of the final vector, and then use a dword shuffle
15677/// to place them onto the right half, and finally unpack the paired lanes into
15678/// their final position.
15679///
15680/// The exact breakdown of how to form these dword pairs and align them on the
15681/// correct sides is really tricky. See the comments within the function for
15682/// more of the details.
15683///
15684/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15685/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15686/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15687/// vector, form the analogous 128-bit 8-element Mask.
15688static SDValue lowerV8I16GeneralSingleInputShuffle(
15689 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15690 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15691 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15692 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15693
15694 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15695 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15696 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15697
15698 // Attempt to directly match PSHUFLW or PSHUFHW.
15699 if (isUndefOrInRange(LoMask, 0, 4) &&
15700 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15701 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15702 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15703 }
15704 if (isUndefOrInRange(HiMask, 4, 8) &&
15705 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15706 for (int i = 0; i != 4; ++i)
15707 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15708 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15709 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15710 }
15711
15712 SmallVector<int, 4> LoInputs;
15713 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15714 array_pod_sort(LoInputs.begin(), LoInputs.end());
15715 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15716 SmallVector<int, 4> HiInputs;
15717 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15718 array_pod_sort(HiInputs.begin(), HiInputs.end());
15719 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15720 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15721 int NumHToL = LoInputs.size() - NumLToL;
15722 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15723 int NumHToH = HiInputs.size() - NumLToH;
15724 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15725 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15726 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15727 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15728
15729 // If we are shuffling values from one half - check how many different DWORD
15730 // pairs we need to create. If only 1 or 2 then we can perform this as a
15731 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15732 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15733 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15734 V = DAG.getNode(ShufWOp, DL, VT, V,
15735 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15736 V = DAG.getBitcast(PSHUFDVT, V);
15737 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15738 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15739 return DAG.getBitcast(VT, V);
15740 };
15741
15742 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15743 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15744 SmallVector<std::pair<int, int>, 4> DWordPairs;
15745 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15746
15747 // Collect the different DWORD pairs.
15748 for (int DWord = 0; DWord != 4; ++DWord) {
15749 int M0 = Mask[2 * DWord + 0];
15750 int M1 = Mask[2 * DWord + 1];
15751 M0 = (M0 >= 0 ? M0 % 4 : M0);
15752 M1 = (M1 >= 0 ? M1 % 4 : M1);
15753 if (M0 < 0 && M1 < 0)
15754 continue;
15755
15756 bool Match = false;
15757 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15758 auto &DWordPair = DWordPairs[j];
15759 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15760 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15761 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15762 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15763 PSHUFDMask[DWord] = DOffset + j;
15764 Match = true;
15765 break;
15766 }
15767 }
15768 if (!Match) {
15769 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15770 DWordPairs.push_back(std::make_pair(M0, M1));
15771 }
15772 }
15773
15774 if (DWordPairs.size() <= 2) {
15775 DWordPairs.resize(2, std::make_pair(-1, -1));
15776 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15777 DWordPairs[1].first, DWordPairs[1].second};
15778 if ((NumHToL + NumHToH) == 0)
15779 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15780 if ((NumLToL + NumLToH) == 0)
15781 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15782 }
15783 }
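  // Worked example (added for illustration, not in the original source): for
  // the low-half-only mask {0, 1, 0, 1, 2, 3, 2, 3} the loop above collects
  // two distinct word pairs, (0,1) and (2,3), so PSHUFDMask becomes
  // {0, 0, 1, 1} and PSHUFHalfMask becomes {0, 1, 2, 3}; a (here trivial)
  // PSHUFLW followed by that PSHUFD reproduces the whole shuffle without the
  // longer chain below.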
15784
15785 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15786 // such inputs we can swap two of the dwords across the half mark and end up
15787 // with <=2 inputs to each half in each half. Once there, we can fall through
15788 // to the generic code below. For example:
15789 //
15790 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15791 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15792 //
15793 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15794 // and an existing 2-into-2 on the other half. In this case we may have to
15795 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15796 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15797 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15798 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15799 // half than the one we target for fixing) will be fixed when we re-enter this
15800 // path. We will also combine any resulting sequence of PSHUFD instructions
15801 // into a single instruction. Here is an example of the tricky case:
15802 //
15803 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15804 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15805 //
15806 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15807 //
15808 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15809 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15810 //
15811 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15812 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15813 //
15814 // The result is fine to be handled by the generic logic.
15815 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15816 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15817 int AOffset, int BOffset) {
15818 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15819 "Must call this with A having 3 or 1 inputs from the A half.");
15820 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15821 "Must call this with B having 1 or 3 inputs from the B half.");
15822 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15823 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15824
15825 bool ThreeAInputs = AToAInputs.size() == 3;
15826
15827 // Compute the index of dword with only one word among the three inputs in
15828 // a half by taking the sum of the half with three inputs and subtracting
15829 // the sum of the actual three inputs. The difference is the remaining
15830 // slot.
15831 int ADWord = 0, BDWord = 0;
15832 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15833 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15834 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15835 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15836 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15837 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15838 int TripleNonInputIdx =
15839 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15840 TripleDWord = TripleNonInputIdx / 2;
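// For example, if the half with three inputs is the low half (offset 0) and
// those inputs sit at word indices {0, 2, 3}, the full sum is 0+1+2+3 == 6,
// the input sum is 5, so the remaining slot is word 1 and TripleDWord == 0.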
15841
15842 // We use xor with one to compute the adjacent DWord to whichever one the
15843 // OneInput is in.
15844 OneInputDWord = (OneInput / 2) ^ 1;
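// For example, OneInput == 5 lives in dword 2, so OneInputDWord == 2 ^ 1 == 3,
// i.e. the other dword of the same 64-bit half.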
15845
15846 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15847 // and BToA inputs. If there is also such a problem with the BToB and AToB
15848 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15849 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15850 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15851 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15852 // Compute how many inputs will be flipped by swapping these DWords. We
15853 // need to balance this to ensure we don't form a 3-1 shuffle in the
15854 // other half.
15856 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15857 llvm::count(AToBInputs, 2 * ADWord + 1);
15858 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15859 llvm::count(BToBInputs, 2 * BDWord + 1);
15860 if ((NumFlippedAToBInputs == 1 &&
15861 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15862 (NumFlippedBToBInputs == 1 &&
15863 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15864 // We choose whether to fix the A half or B half based on whether that
15865 // half has zero flipped inputs. At zero, we may not be able to fix it
15866 // with that half. We also bias towards fixing the B half because that
15867 // will more commonly be the high half, and we have to bias one way.
15868 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15869 ArrayRef<int> Inputs) {
15870 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15871 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15872 // Determine whether the free index is in the flipped dword or the
15873 // unflipped dword based on where the pinned index is. We use this bit
15874 // in an xor to conditionally select the adjacent dword.
15875 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
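// For example, with DWord == 1: a pinned index of 3 (inside dword 1) gives
// FixFreeIdx == 2 * (1 ^ 1) == 0, while a pinned index of 5 (outside dword 1)
// gives FixFreeIdx == 2 * (1 ^ 0) == 2.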
15876 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15877 if (IsFixIdxInput == IsFixFreeIdxInput)
15878 FixFreeIdx += 1;
15879 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15880 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15881 "We need to be changing the number of flipped inputs!");
15882 int PSHUFHalfMask[] = {0, 1, 2, 3};
15883 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15884 V = DAG.getNode(
15885 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15886 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15887 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
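// Word indices 0-3 sit in the low 64 bits of each 128-bit lane, which PSHUFLW
// permutes, and indices 4-7 sit in the high 64 bits, which PSHUFHW permutes,
// hence the opcode selection on FixIdx < 4.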
15888
15889 for (int &M : Mask)
15890 if (M >= 0 && M == FixIdx)
15891 M = FixFreeIdx;
15892 else if (M >= 0 && M == FixFreeIdx)
15893 M = FixIdx;
15894 };
15895 if (NumFlippedBToBInputs != 0) {
15896 int BPinnedIdx =
15897 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15898 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15899 } else {
15900 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15901 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15902 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15903 }
15904 }
15905 }
15906
15907 int PSHUFDMask[] = {0, 1, 2, 3};
15908 PSHUFDMask[ADWord] = BDWord;
15909 PSHUFDMask[BDWord] = ADWord;
15910 V = DAG.getBitcast(
15911 VT,
15912 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15913 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15914
15915 // Adjust the mask to match the new locations of A and B.
15916 for (int &M : Mask)
15917 if (M >= 0 && M/2 == ADWord)
15918 M = 2 * BDWord + M % 2;
15919 else if (M >= 0 && M/2 == BDWord)
15920 M = 2 * ADWord + M % 2;
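// For example, if ADWord == 0 and BDWord == 2, a mask element 1 (word 1 of
// dword 0) is rewritten to 2 * 2 + 1 == 5 and a mask element 4 (word 0 of
// dword 2) is rewritten to 2 * 0 + 0 == 0, mirroring the PSHUFD swap above.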
15921
15922 // Recurse back into this routine to re-compute state now that this isn't
15923 // a 3 and 1 problem.
15924 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15925 };
15926 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15927 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15928 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15929 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15930
15931 // At this point there are at most two inputs to the low and high halves from
15932 // each half. That means the inputs can always be grouped into dwords and
15933 // those dwords can then be moved to the correct half with a dword shuffle.
15934 // We use at most one low and one high word shuffle to collect these paired
15935 // inputs into dwords, and finally a dword shuffle to place them.
15936 int PSHUFLMask[4] = {-1, -1, -1, -1};
15937 int PSHUFHMask[4] = {-1, -1, -1, -1};
15938 int PSHUFDMask[4] = {-1, -1, -1, -1};
15939
15940 // First fix the masks for all the inputs that are staying in their
15941 // original halves. This will then dictate the targets of the cross-half
15942 // shuffles.
15943 auto fixInPlaceInputs =
15944 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15945 MutableArrayRef<int> SourceHalfMask,
15946 MutableArrayRef<int> HalfMask, int HalfOffset) {
15947 if (InPlaceInputs.empty())
15948 return;
15949 if (InPlaceInputs.size() == 1) {
15950 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15951 InPlaceInputs[0] - HalfOffset;
15952 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15953 return;
15954 }
15955 if (IncomingInputs.empty()) {
15956 // Just fix all of the in place inputs.
15957 for (int Input : InPlaceInputs) {
15958 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15959 PSHUFDMask[Input / 2] = Input / 2;
15960 }
15961 return;
15962 }
15963
15964 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15965 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15966 InPlaceInputs[0] - HalfOffset;
15967 // Put the second input next to the first so that they are packed into
15968 // a dword. We find the adjacent index by toggling the low bit.
15969 int AdjIndex = InPlaceInputs[0] ^ 1;
15970 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15971 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15972 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
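// For example, with HalfOffset == 0 and InPlaceInputs == {1, 3}: word 1 stays
// in slot 1, AdjIndex == 0, so word 3 is pulled into slot 0, every use of 3 in
// HalfMask is renamed to 0, and dword 0 is marked as staying in place.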
15973 };
15974 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15975 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15976
15977 // Now gather the cross-half inputs and place them into a free dword of
15978 // their target half.
15979 // FIXME: This operation could almost certainly be simplified dramatically to
15980 // look more like the 3-1 fixing operation.
15981 auto moveInputsToRightHalf = [&PSHUFDMask](
15982 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15983 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15984 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15985 int DestOffset) {
15986 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15987 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15988 };
15989 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15990 int Word) {
15991 int LowWord = Word & ~1;
15992 int HighWord = Word | 1;
15993 return isWordClobbered(SourceHalfMask, LowWord) ||
15994 isWordClobbered(SourceHalfMask, HighWord);
15995 };
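// For example, Word == 5 gives LowWord == 4 and HighWord == 5, so the check
// covers both 16-bit halves of dword 2.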
15996
15997 if (IncomingInputs.empty())
15998 return;
15999
16000 if (ExistingInputs.empty()) {
16001 // Map any dwords with inputs from them into the right half.
16002 for (int Input : IncomingInputs) {
16003 // If the source half mask maps over the inputs, turn those into
16004 // swaps and use the swapped lane.
16005 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16006 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16007 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16008 Input - SourceOffset;
16009 // We have to swap the uses in our half mask in one sweep.
16010 for (int &M : HalfMask)
16011 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16012 M = Input;
16013 else if (M == Input)
16014 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16015 } else {
16016 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16017 Input - SourceOffset &&
16018 "Previous placement doesn't match!");
16019 }
16020 // Note that this correctly re-maps both when we do a swap and when
16021 // we observe the other side of the swap above. We rely on that to
16022 // avoid swapping the members of the input list directly.
16023 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16024 }
16025
16026 // Map the input's dword into the correct half.
16027 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16028 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16029 else
16030 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16031 Input / 2 &&
16032 "Previous placement doesn't match!");
16033 }
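// For example, with SourceOffset == 4 and DestOffset == 0, an incoming input
// at word 5 sets PSHUFDMask[(5 - 4 + 0) / 2], i.e. PSHUFDMask[0], to
// 5 / 2 == 2: source dword 2 will be moved into destination dword 0.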
16034
16035 // And just directly shift any other-half mask elements to be same-half
16036 // as we will have mirrored the dword containing the element into the
16037 // same position within that half.
16038 for (int &M : HalfMask)
16039 if (M >= SourceOffset && M < SourceOffset + 4) {
16040 M = M - SourceOffset + DestOffset;
16041 assert(M >= 0 && "This should never wrap below zero!");
16042 }
16043 return;
16044 }
16045
16046 // Ensure we have the input in a viable dword of its current half. This
16047 // is particularly tricky because the original position may be clobbered
16048 // by inputs being moved and *staying* in that half.
16049 if (IncomingInputs.size() == 1) {
16050 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16051 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16052 SourceOffset;
16053 SourceHalfMask[InputFixed - SourceOffset] =
16054 IncomingInputs[0] - SourceOffset;
16055 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16056 InputFixed);
16057 IncomingInputs[0] = InputFixed;
16058 }
16059 } else if (IncomingInputs.size() == 2) {
16060 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16061 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16062 // We have two non-adjacent or clobbered inputs we need to extract from
16063 // the source half. To do this, we need to map them into some adjacent
16064 // dword slot in the source mask.
16065 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16066 IncomingInputs[1] - SourceOffset};
16067
16068 // If there is a free slot in the source half mask adjacent to one of
16069 // the inputs, place the other input in it. We use (Index XOR 1) to
16070 // compute an adjacent index.
16071 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16072 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16073 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16074 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16075 InputsFixed[1] = InputsFixed[0] ^ 1;
16076 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16077 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16078 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16079 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16080 InputsFixed[0] = InputsFixed[1] ^ 1;
16081 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16082 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16083 // The two inputs are in the same DWord but it is clobbered and the
16084 // adjacent DWord isn't used at all. Move both inputs to the free
16085 // slot.
16086 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16087 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16088 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16089 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16090 } else {
16091 // The only way we hit this point is if there is no clobbering
16092 // (because there are no off-half inputs to this half) and there is no
16093 // free slot adjacent to one of the inputs. In this case, we have to
16094 // swap an input with a non-input.
16095 for (int i = 0; i < 4; ++i)
16096 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16097 "We can't handle any clobbers here!");
16098 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16099 "Cannot have adjacent inputs here!");
16100
16101 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16102 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16103
16104 // We also have to update the final source mask in this case because
16105 // it may need to undo the above swap.
16106 for (int &M : FinalSourceHalfMask)
16107 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16108 M = InputsFixed[1] + SourceOffset;
16109 else if (M == InputsFixed[1] + SourceOffset)
16110 M = (InputsFixed[0] ^ 1) + SourceOffset;
16111
16112 InputsFixed[1] = InputsFixed[0] ^ 1;
16113 }
16114
16115 // Point everything at the fixed inputs.
16116 for (int &M : HalfMask)
16117 if (M == IncomingInputs[0])
16118 M = InputsFixed[0] + SourceOffset;
16119 else if (M == IncomingInputs[1])
16120 M = InputsFixed[1] + SourceOffset;
16121
16122 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16123 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16124 }
16125 } else {
16126 llvm_unreachable("Unhandled input size!");
16127 }
16128
16129 // Now hoist the DWord down to the right half.
16130 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
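// For example, with DestOffset == 0 this picks dword 0 if PSHUFDMask[0] is
// still free and dword 1 otherwise; with DestOffset == 4 it picks dword 2 or 3.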
16131 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16132 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16133 for (int &M : HalfMask)
16134 for (int Input : IncomingInputs)
16135 if (M == Input)
16136 M = FreeDWord * 2 + Input % 2;
16137 };
16138 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16139 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16140 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16141 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16142
16143 // Now enact all the shuffles we've computed to move the inputs into their
16144 // target half.
16145 if (!isNoopShuffleMask(PSHUFLMask))
16146 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16147 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16148 if (!isNoopShuffleMask(PSHUFHMask))
16149 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16150 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16151 if (!isNoopShuffleMask(PSHUFDMask))
16152 V = DAG.getBitcast(
16153 VT,
16154 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16155 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16156
16157 // At this point, each half should contain all its inputs, and we can then
16158 // just shuffle them into their final position.
16159 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16160 "Failed to lift all the high half inputs to the low mask!");
16161 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16162 "Failed to lift all the low half inputs to the high mask!");
16163
16164 // Do a half shuffle for the low mask.
16165 if (!isNoopShuffleMask(LoMask))
16166 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16167 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16168
16169 // Do a half shuffle with the high mask after shifting its values down.
16170 for (int &M : HiMask)
16171 if (M >= 0)
16172 M -= 4;
16173 if (!isNoopShuffleMask(HiMask))
16174 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16175 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16176
16177 return V;
16178}
16179
16180/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16181/// blend if only one input is used.
16182static SDValue lowerShuffleAsBlendOfPSHUFBs(
16183 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16184 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16185 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16186 "Lane crossing shuffle masks not supported");
16187
16188 int NumBytes = VT.getSizeInBits() / 8;
16189 int Size = Mask.size();
16190 int Scale = NumBytes / Size;
16191
16192 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16193 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16194 V1InUse = false;
16195 V2InUse = false;
16196
16197 for (int i = 0; i < NumBytes; ++i) {
16198 int M = Mask[i / Scale];
16199 if (M < 0)
16200 continue;
16201
16202 const int ZeroMask = 0x80;
16203 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16204 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16205 if (Zeroable[i / Scale])
16206 V1Idx = V2Idx = ZeroMask;
16207
16208 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16209 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16210 V1InUse |= (ZeroMask != V1Idx);
16211 V2InUse |= (ZeroMask != V2Idx);
16212 }
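// For example, for a v8i16 shuffle (Size == 8, NumBytes == 16, Scale == 2), a
// mask element M == 3 expands to byte indices 6 and 7 of V1, while M == 9
// expands to byte indices 2 and 3 of V2 with the matching V1 bytes set to the
// 0x80 zeroing index.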
16213
16214 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16215 if (V1InUse)
16216 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16217 DAG.getBuildVector(ShufVT, DL, V1Mask));
16218 if (V2InUse)
16219 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16220 DAG.getBuildVector(ShufVT, DL, V2Mask));
16221
16222 // If we need shuffled inputs from both, blend the two.
16223 SDValue V;
16224 if (V1InUse && V2InUse)
16225 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16226 else
16227 V = V1InUse ? V1 : V2;
16228
16229 // Cast the result back to the correct type.
16230 return DAG.getBitcast(VT, V);
16231}
16232
16233/// Generic lowering of 8-lane i16 shuffles.
16234///
16235/// This handles both single-input shuffles and combined shuffle/blends with
16236/// two inputs. The single input shuffles are immediately delegated to
16237/// a dedicated lowering routine.
16238///
16239/// The blends are lowered in one of three fundamental ways. If there are few
16240/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16241/// of the input is significantly cheaper when lowered as an interleaving of
16242/// the two inputs, try to interleave them. Otherwise, blend the low and high
16243/// halves of the inputs separately (making them have relatively few inputs)
16244/// and then concatenate them.
16245static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16246 const APInt &Zeroable, SDValue V1, SDValue V2,
16247 const X86Subtarget &Subtarget,
16248 SelectionDAG &DAG) {
16249 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16250 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16251 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16252
16253 // Whenever we can lower this as a zext, that instruction is strictly faster
16254 // than any alternative.
16255 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16256 Zeroable, Subtarget, DAG))
16257 return ZExt;
16258
16259 // Try to lower using a truncation.
16260 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16261 Subtarget, DAG))
16262 return V;
16263
16264 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16265
16266 if (NumV2Inputs == 0) {
16267 // Try to use shift instructions.
16268 if (SDValue Shift =
16269 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16270 Subtarget, DAG, /*BitwiseOnly*/ false))
16271 return Shift;
16272
16273 // Check for being able to broadcast a single element.
16274 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16275 Mask, Subtarget, DAG))
16276 return Broadcast;
16277
16278 // Try to use bit rotation instructions.
16279 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16280 Subtarget, DAG))
16281 return Rotate;
16282
16283 // Use dedicated unpack instructions for masks that match their pattern.
16284 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16285 return V;
16286
16287 // Use dedicated pack instructions for masks that match their pattern.
16288 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16289 Subtarget))
16290 return V;
16291
16292 // Try to use byte rotation instructions.
16293 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16294 Subtarget, DAG))
16295 return Rotate;
16296
16297 // Make a copy of the mask so it can be modified.
16298 SmallVector<int, 8> MutableMask(Mask);
16299 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16300 Subtarget, DAG);
16301 }
16302
16303 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16304 "All single-input shuffles should be canonicalized to be V1-input "
16305 "shuffles.");
16306
16307 // Try to use shift instructions.
16308 if (SDValue Shift =
16309 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16310 DAG, /*BitwiseOnly*/ false))
16311 return Shift;
16312
16313 // See if we can use SSE4A Extraction / Insertion.
16314 if (Subtarget.hasSSE4A())
16315 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16316 Zeroable, DAG))
16317 return V;
16318
16319 // There are special ways we can lower some single-element blends.
16320 if (NumV2Inputs == 1)
16321 if (SDValue V = lowerShuffleAsElementInsertion(
16322 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16323 return V;
16324
16325 // We have different paths for blend lowering, but they all must use the
16326 // *exact* same predicate.
16327 bool IsBlendSupported = Subtarget.hasSSE41();
16328 if (IsBlendSupported)
16329 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16330 Zeroable, Subtarget, DAG))
16331 return Blend;
16332
16333 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16334 Zeroable, Subtarget, DAG))
16335 return Masked;
16336
16337 // Use dedicated unpack instructions for masks that match their pattern.
16338 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16339 return V;
16340
16341 // Use dedicated pack instructions for masks that match their pattern.
16342 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16343 Subtarget))
16344 return V;
16345
16346 // Try to lower using a truncation.
16347 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16348 Subtarget, DAG))
16349 return V;
16350
16351 // Try to use byte rotation instructions.
16352 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16353 Subtarget, DAG))
16354 return Rotate;
16355
16356 if (SDValue BitBlend =
16357 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16358 return BitBlend;
16359
16360 // Try to use byte shift instructions to mask.
16361 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16362 Zeroable, Subtarget, DAG))
16363 return V;
16364
16365 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
16366 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16367 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16368 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16369 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16370 !Subtarget.hasVLX()) {
16371 // Check if this is part of a 256-bit vector truncation.
16372 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16373 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16374 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16375 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16376 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16377 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16378 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16379 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16380 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16381 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16382 } else {
16383 SmallVector<SDValue, 4> DWordClearOps(4,
16384 DAG.getConstant(0, DL, MVT::i32));
16385 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16386 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16387 SDValue DWordClearMask =
16388 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16389 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16390 DWordClearMask);
16391 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16392 DWordClearMask);
16393 }
16394 // Now pack things back together.
16395 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16396 if (NumEvenDrops == 2) {
16397 Result = DAG.getBitcast(MVT::v4i32, Result);
16398 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16399 }
16400 return Result;
16401 }
16402
16403 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16404 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
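// The shift right by 16 below moves each odd word into the low (even) slot of
// its dword, so the PACK keeps exactly the odd elements; on pre-SSE41 targets
// the arithmetic shift sign-extends the word and PACKSS's signed saturation
// then returns it unchanged.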
16405 if (NumOddDrops == 1) {
16406 bool HasSSE41 = Subtarget.hasSSE41();
16407 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16408 DAG.getBitcast(MVT::v4i32, V1),
16409 DAG.getTargetConstant(16, DL, MVT::i8));
16410 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16411 DAG.getBitcast(MVT::v4i32, V2),
16412 DAG.getTargetConstant(16, DL, MVT::i8));
16413 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16414 MVT::v8i16, V1, V2);
16415 }
16416
16417 // Try to lower by permuting the inputs into an unpack instruction.
16418 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16419 Mask, Subtarget, DAG))
16420 return Unpack;
16421
16422 // If we can't directly blend but can use PSHUFB, that will be better as it
16423 // can both shuffle and set up the inefficient blend.
16424 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16425 bool V1InUse, V2InUse;
16426 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16427 Zeroable, DAG, V1InUse, V2InUse);
16428 }
16429
16430 // We can always bit-blend if we have to, so the fallback strategy is to
16431 // decompose into single-input permutes and blends/unpacks.
16432 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16433 Mask, Subtarget, DAG);
16434}
16435
16436/// Lower 8-lane 16-bit floating point shuffles.
16437static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16438 const APInt &Zeroable, SDValue V1, SDValue V2,
16439 const X86Subtarget &Subtarget,
16440 SelectionDAG &DAG) {
16441 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16442 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16443 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16444 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16445
16446 if (Subtarget.hasFP16()) {
16447 if (NumV2Elements == 0) {
16448 // Check for being able to broadcast a single element.
16449 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16450 Mask, Subtarget, DAG))
16451 return Broadcast;
16452 }
16453 if (NumV2Elements == 1 && Mask[0] >= 8)
16454 if (SDValue V = lowerShuffleAsElementInsertion(
16455 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16456 return V;
16457 }
16458
16459 V1 = DAG.getBitcast(MVT::v8i16, V1);
16460 V2 = DAG.getBitcast(MVT::v8i16, V2);
16461 return DAG.getBitcast(MVT::v8f16,
16462 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16463}
16464
16465 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16466 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
16467 // the active subvector is extracted.
16468static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16469 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16470 const X86Subtarget &Subtarget,
16471 SelectionDAG &DAG) {
16472 MVT MaskVT = VT.changeTypeToInteger();
16473 SDValue MaskNode;
16474 MVT ShuffleVT = VT;
16475 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16476 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16477 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16478 ShuffleVT = V1.getSimpleValueType();
16479
16480 // Adjust mask to correct indices for the second input.
16481 int NumElts = VT.getVectorNumElements();
16482 unsigned Scale = 512 / VT.getSizeInBits();
16483 SmallVector<int, 32> AdjustedMask(Mask);
16484 for (int &M : AdjustedMask)
16485 if (NumElts <= M)
16486 M += (Scale - 1) * NumElts;
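// For example, widening a v16i8 shuffle to v64i8 gives Scale == 4 and
// NumElts == 16, so a mask element of 17 (element 1 of V2) becomes
// 17 + 3 * 16 == 65, which selects element 1 of the widened V2 operand.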
16487 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16488 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16489 } else {
16490 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16491 }
16492
16493 SDValue Result;
16494 if (V2.isUndef())
16495 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16496 else
16497 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16498
16499 if (VT != ShuffleVT)
16500 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16501
16502 return Result;
16503}
16504
16505/// Generic lowering of v16i8 shuffles.
16506///
16507/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16508/// detect any complexity reducing interleaving. If that doesn't help, it uses
16509/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16510/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16511/// back together.
16512static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16513 const APInt &Zeroable, SDValue V1, SDValue V2,
16514 const X86Subtarget &Subtarget,
16515 SelectionDAG &DAG) {
16516 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16517 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16518 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16519
16520 // Try to use shift instructions.
16521 if (SDValue Shift =
16522 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16523 DAG, /*BitwiseOnly*/ false))
16524 return Shift;
16525
16526 // Try to use byte rotation instructions.
16527 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16528 Subtarget, DAG))
16529 return Rotate;
16530
16531 // Use dedicated pack instructions for masks that match their pattern.
16532 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16533 Subtarget))
16534 return V;
16535
16536 // Try to use a zext lowering.
16537 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16538 Zeroable, Subtarget, DAG))
16539 return ZExt;
16540
16541 // Try to lower using a truncation.
16542 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16543 Subtarget, DAG))
16544 return V;
16545
16546 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16547 Subtarget, DAG))
16548 return V;
16549
16550 // See if we can use SSE4A Extraction / Insertion.
16551 if (Subtarget.hasSSE4A())
16552 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16553 Zeroable, DAG))
16554 return V;
16555
16556 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16557
16558 // For single-input shuffles, there are some nicer lowering tricks we can use.
16559 if (NumV2Elements == 0) {
16560 // Check for being able to broadcast a single element.
16561 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16562 Mask, Subtarget, DAG))
16563 return Broadcast;
16564
16565 // Try to use bit rotation instructions.
16566 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16567 Subtarget, DAG))
16568 return Rotate;
16569
16570 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16571 return V;
16572
16573 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16574 // Notably, this handles splat and partial-splat shuffles more efficiently.
16575 // However, it only makes sense if the pre-duplication shuffle simplifies
16576 // things significantly. Currently, this means we need to be able to
16577 // express the pre-duplication shuffle as an i16 shuffle.
16578 //
16579 // FIXME: We should check for other patterns which can be widened into an
16580 // i16 shuffle as well.
16581 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16582 for (int i = 0; i < 16; i += 2)
16583 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16584 return false;
16585
16586 return true;
16587 };
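// For example, a pair-duplicating mask such as <0,0, 3,3, 5,5, ...> passes
// this check because each byte pair reads a single source byte, while
// <0,1, ...> fails because the first pair reads two different bytes.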
16588 auto tryToWidenViaDuplication = [&]() -> SDValue {
16589 if (!canWidenViaDuplication(Mask))
16590 return SDValue();
16591 SmallVector<int, 4> LoInputs;
16592 copy_if(Mask, std::back_inserter(LoInputs),
16593 [](int M) { return M >= 0 && M < 8; });
16594 array_pod_sort(LoInputs.begin(), LoInputs.end());
16595 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16596 LoInputs.end());
16597 SmallVector<int, 4> HiInputs;
16598 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16599 array_pod_sort(HiInputs.begin(), HiInputs.end());
16600 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16601 HiInputs.end());
16602
16603 bool TargetLo = LoInputs.size() >= HiInputs.size();
16604 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16605 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16606
16607 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16608 SmallDenseMap<int, int, 8> LaneMap;
16609 for (int I : InPlaceInputs) {
16610 PreDupI16Shuffle[I/2] = I/2;
16611 LaneMap[I] = I;
16612 }
16613 int j = TargetLo ? 0 : 4, je = j + 4;
16614 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16615 // Check if j is already a shuffle of this input. This happens when
16616 // there are two adjacent bytes after we move the low one.
16617 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16618 // If we haven't yet mapped the input, search for a slot into which
16619 // we can map it.
16620 while (j < je && PreDupI16Shuffle[j] >= 0)
16621 ++j;
16622
16623 if (j == je)
16624 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16625 return SDValue();
16626
16627 // Map this input with the i16 shuffle.
16628 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16629 }
16630
16631 // Update the lane map based on the mapping we ended up with.
16632 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16633 }
16634 V1 = DAG.getBitcast(
16635 MVT::v16i8,
16636 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16637 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16638
16639 // Unpack the bytes to form the i16s that will be shuffled into place.
16640 bool EvenInUse = false, OddInUse = false;
16641 for (int i = 0; i < 16; i += 2) {
16642 EvenInUse |= (Mask[i + 0] >= 0);
16643 OddInUse |= (Mask[i + 1] >= 0);
16644 if (EvenInUse && OddInUse)
16645 break;
16646 }
16647 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16648 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16649 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16650
16651 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16652 for (int i = 0; i < 16; ++i)
16653 if (Mask[i] >= 0) {
16654 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16655 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16656 if (PostDupI16Shuffle[i / 2] < 0)
16657 PostDupI16Shuffle[i / 2] = MappedMask;
16658 else
16659 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16660 "Conflicting entries in the original shuffle!");
16661 }
16662 return DAG.getBitcast(
16663 MVT::v16i8,
16664 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16665 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16666 };
16667 if (SDValue V = tryToWidenViaDuplication())
16668 return V;
16669 }
16670
16671 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16672 Zeroable, Subtarget, DAG))
16673 return Masked;
16674
16675 // Use dedicated unpack instructions for masks that match their pattern.
16676 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16677 return V;
16678
16679 // Try to use byte shift instructions to mask.
16680 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16681 Zeroable, Subtarget, DAG))
16682 return V;
16683
16684 // Check for compaction patterns.
16685 bool IsSingleInput = V2.isUndef();
16686 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16687
16688 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16689 // with PSHUFB. It is important to do this before we attempt to generate any
16690 // blends but after all of the single-input lowerings. If the single input
16691 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16692 // want to preserve that and we can DAG combine any longer sequences into
16693 // a PSHUFB in the end. But once we start blending from multiple inputs,
16694 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16695 // and there are *very* few patterns that would actually be faster than the
16696 // PSHUFB approach because of its ability to zero lanes.
16697 //
16698 // If the mask is a binary compaction, we can more efficiently perform this
16699 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16700 //
16701 // FIXME: The only exceptions to the above are blends which are exact
16702 // interleavings with direct instructions supporting them. We currently don't
16703 // handle those well here.
16704 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16705 bool V1InUse = false;
16706 bool V2InUse = false;
16707
16708 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16709 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16710
16711 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16712 // do so. This avoids using them to handle blends-with-zero which is
16713 // important as a single pshufb is significantly faster for that.
16714 if (V1InUse && V2InUse) {
16715 if (Subtarget.hasSSE41())
16716 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16717 Zeroable, Subtarget, DAG))
16718 return Blend;
16719
16720 // We can use an unpack to do the blending rather than an or in some
16721 // cases. Even though the or may be (very minorly) more efficient, we
16722 // prefer this lowering because there are common cases where part of
16723 // the complexity of the shuffles goes away when we do the final blend as
16724 // an unpack.
16725 // FIXME: It might be worth trying to detect if the unpack-feeding
16726 // shuffles will both be pshufb, in which case we shouldn't bother with
16727 // this.
16728 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16729 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16730 return Unpack;
16731
16732 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16733 if (Subtarget.hasVBMI())
16734 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16735 DAG);
16736
16737 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16738 if (Subtarget.hasXOP()) {
16739 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16740 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16741 }
16742
16743 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16744 // PALIGNR will be cheaper than the second PSHUFB+OR.
16745 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16746 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16747 return V;
16748 }
16749
16750 return PSHUFB;
16751 }
16752
16753 // There are special ways we can lower some single-element blends.
16754 if (NumV2Elements == 1)
16755 if (SDValue V = lowerShuffleAsElementInsertion(
16756 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16757 return V;
16758
16759 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16760 return Blend;
16761
16762 // Check whether a compaction lowering can be done. This handles shuffles
16763 // which take every Nth element for some even N. See the helper function for
16764 // details.
16765 //
16766 // We special case these as they can be particularly efficiently handled with
16767 // the PACKUSWB instruction on x86 and they show up in common patterns of
16768 // rearranging bytes to truncate wide elements.
16769 if (NumEvenDrops) {
16770 // NumEvenDrops is the power of two stride of the elements. Another way of
16771 // thinking about it is that we need to drop the even elements this many
16772 // times to get the original input.
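// For example, a two-input byte mask of <0, 2, 4, ..., 30> keeps every second
// byte, so NumEvenDrops == 1 and a single PACKUS of the two masked inputs
// produces the result; each further doubling of the stride costs one more
// PACKUS round below.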
16773
16774 // First we need to zero all the dropped bytes.
16775 assert(NumEvenDrops <= 3 &&
16776 "No support for dropping even elements more than 3 times.");
16777 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16778 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16779 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16780 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16781 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16782 WordClearMask);
16783 if (!IsSingleInput)
16784 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16785 WordClearMask);
16786
16787 // Now pack things back together.
16788 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16789 IsSingleInput ? V1 : V2);
16790 for (int i = 1; i < NumEvenDrops; ++i) {
16791 Result = DAG.getBitcast(MVT::v8i16, Result);
16792 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16793 }
16794 return Result;
16795 }
16796
16797 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16798 if (NumOddDrops == 1) {
16799 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16800 DAG.getBitcast(MVT::v8i16, V1),
16801 DAG.getTargetConstant(8, DL, MVT::i8));
16802 if (!IsSingleInput)
16803 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16804 DAG.getBitcast(MVT::v8i16, V2),
16805 DAG.getTargetConstant(8, DL, MVT::i8));
16806 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16807 IsSingleInput ? V1 : V2);
16808 }
16809
16810 // Handle multi-input cases by blending/unpacking single-input shuffles.
16811 if (NumV2Elements > 0)
16812 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16813 Subtarget, DAG);
16814
16815 // The fallback path for single-input shuffles widens this into two v8i16
16816 // vectors with unpacks, shuffles those, and then pulls them back together
16817 // with a pack.
16818 SDValue V = V1;
16819
16820 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16821 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16822 for (int i = 0; i < 16; ++i)
16823 if (Mask[i] >= 0)
16824 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16825
16826 SDValue VLoHalf, VHiHalf;
16827 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16828 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16829 // i16s.
16830 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16831 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16832 // Use a mask to drop the high bytes.
16833 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16834 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16835 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16836
16837 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16838 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16839
16840 // Squash the masks to point directly into VLoHalf.
16841 for (int &M : LoBlendMask)
16842 if (M >= 0)
16843 M /= 2;
16844 for (int &M : HiBlendMask)
16845 if (M >= 0)
16846 M /= 2;
16847 } else {
16848 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16849 // VHiHalf so that we can blend them as i16s.
16850 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16851
16852 VLoHalf = DAG.getBitcast(
16853 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16854 VHiHalf = DAG.getBitcast(
16855 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16856 }
16857
16858 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16859 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16860
16861 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16862}
16863
16864/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16865///
16866/// This routine breaks down the specific type of 128-bit shuffle and
16867/// dispatches to the lowering routines accordingly.
16868static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16869 MVT VT, SDValue V1, SDValue V2,
16870 const APInt &Zeroable,
16871 const X86Subtarget &Subtarget,
16872 SelectionDAG &DAG) {
16873 switch (VT.SimpleTy) {
16874 case MVT::v2i64:
16875 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16876 case MVT::v2f64:
16877 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16878 case MVT::v4i32:
16879 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16880 case MVT::v4f32:
16881 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16882 case MVT::v8i16:
16883 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16884 case MVT::v8f16:
16885 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16886 case MVT::v16i8:
16887 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16888
16889 default:
16890 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16890)
;
16891 }
16892}
16893
16894/// Generic routine to split vector shuffle into half-sized shuffles.
16895///
16896/// This routine just extracts two subvectors, shuffles them independently, and
16897/// then concatenates them back together. This should work effectively with all
16898/// AVX vector shuffle types.
16899static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16900 SDValue V2, ArrayRef<int> Mask,
16901 SelectionDAG &DAG, bool SimpleOnly) {
16902 assert(VT.getSizeInBits() >= 256 &&
16903        "Only for 256-bit or wider vector shuffles!");
16904 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16905 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16906
16907 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16908 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16909
16910 int NumElements = VT.getVectorNumElements();
16911 int SplitNumElements = NumElements / 2;
16912 MVT ScalarVT = VT.getVectorElementType();
16913 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16914
16915 // Use splitVector/extractSubVector so that split build-vectors just build two
16916 // narrower build vectors. This helps shuffling with splats and zeros.
16917 auto SplitVector = [&](SDValue V) {
16918 SDValue LoV, HiV;
16919 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16920 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16921 DAG.getBitcast(SplitVT, HiV));
16922 };
16923
16924 SDValue LoV1, HiV1, LoV2, HiV2;
16925 std::tie(LoV1, HiV1) = SplitVector(V1);
16926 std::tie(LoV2, HiV2) = SplitVector(V2);
16927
16928 // Now create two 4-way blends of these half-width vectors.
16929 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
16930 bool &UseHiV1, bool &UseLoV2,
16931 bool &UseHiV2) {
16932 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
16933 for (int i = 0; i < SplitNumElements; ++i) {
16934 int M = HalfMask[i];
16935 if (M >= NumElements) {
16936 if (M >= NumElements + SplitNumElements)
16937 UseHiV2 = true;
16938 else
16939 UseLoV2 = true;
16940 } else if (M >= 0) {
16941 if (M >= SplitNumElements)
16942 UseHiV1 = true;
16943 else
16944 UseLoV1 = true;
16945 }
16946 }
16947 };
16948
16949 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
16950 if (!SimpleOnly)
16951 return true;
16952
16953 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
16954 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
16955
16956 return !(UseHiV1 || UseHiV2);
16957 };
16958
16959 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16960 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16961 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16962 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16963 for (int i = 0; i < SplitNumElements; ++i) {
16964 int M = HalfMask[i];
16965 if (M >= NumElements) {
16966 V2BlendMask[i] = M - NumElements;
16967 BlendMask[i] = SplitNumElements + i;
16968 } else if (M >= 0) {
16969 V1BlendMask[i] = M;
16970 BlendMask[i] = i;
16971 }
16972 }
16973
16974 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
16975 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
16976
16977 // Because the lowering happens after all combining takes place, we need to
16978 // manually combine these blend masks as much as possible so that we create
16979 // a minimal number of high-level vector shuffle nodes.
16980 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
16981
16982 // First try just blending the halves of V1 or V2.
16983 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16984 return DAG.getUNDEF(SplitVT);
16985 if (!UseLoV2 && !UseHiV2)
16986 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16987 if (!UseLoV1 && !UseHiV1)
16988 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16989
16990 SDValue V1Blend, V2Blend;
16991 if (UseLoV1 && UseHiV1) {
16992 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16993 } else {
16994 // We only use half of V1 so map the usage down into the final blend mask.
16995 V1Blend = UseLoV1 ? LoV1 : HiV1;
16996 for (int i = 0; i < SplitNumElements; ++i)
16997 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16998 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16999 }
17000 if (UseLoV2 && UseHiV2) {
17001 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17002 } else {
17003 // We only use half of V2 so map the usage down into the final blend mask.
17004 V2Blend = UseLoV2 ? LoV2 : HiV2;
17005 for (int i = 0; i < SplitNumElements; ++i)
17006 if (BlendMask[i] >= SplitNumElements)
17007 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17008 }
17009 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17010 };
17011
17012 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17013 return SDValue();
17014
17015 SDValue Lo = HalfBlend(LoMask);
17016 SDValue Hi = HalfBlend(HiMask);
17017 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17018}
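// Worked example (illustrative sketch; the mask is assumed): a v8i32 shuffle
// with Mask = <0,8,1,9,6,14,7,15> splits into Lo = shuffle(LoV1, LoV2,
// <0,4,1,5>) and Hi = shuffle(HiV1, HiV2, <2,6,3,7>), which are concatenated
// back to v8i32. Each half-width blend only touches one half of each input,
// so no extra cross-half shuffles are generated.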
17019
17020/// Either split a vector in halves or decompose the shuffles and the
17021/// blend/unpack.
17022///
17023/// This is provided as a good fallback for many lowerings of non-single-input
17024/// shuffles with more than one 128-bit lane. In those cases, we want to select
17025/// between splitting the shuffle into 128-bit components and stitching those
17026/// back together vs. extracting the single-input shuffles and blending those
17027/// results.
17028static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17029 SDValue V2, ArrayRef<int> Mask,
17030 const X86Subtarget &Subtarget,
17031 SelectionDAG &DAG) {
17032 assert(!V2.isUndef() && "This routine must not be used to lower single-input "(static_cast <bool> (!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? void (0) :
__assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17033, __extension__
__PRETTY_FUNCTION__))
20
'?' condition is true
17033 "shuffles as it could then recurse on itself.")(static_cast <bool> (!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? void (0) :
__assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17033, __extension__
__PRETTY_FUNCTION__))
;
17034 int Size = Mask.size();
17035
17036 // If this can be modeled as a broadcast of two elements followed by a blend,
17037 // prefer that lowering. This is especially important because broadcasts can
17038 // often fold with memory operands.
17039 auto DoBothBroadcast = [&] {
17040 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17041 for (int M : Mask)
17042 if (M >= Size) {
17043 if (V2BroadcastIdx < 0)
17044 V2BroadcastIdx = M - Size;
17045 else if (M - Size != V2BroadcastIdx)
17046 return false;
17047 } else if (M >= 0) {
17048 if (V1BroadcastIdx < 0)
17049 V1BroadcastIdx = M;
17050 else if (M != V1BroadcastIdx)
17051 return false;
17052 }
17053 return true;
17054 };
17055 if (DoBothBroadcast())
21: Taking true branch
17056 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
22: Calling 'lowerShuffleAsDecomposedShuffleMerge'
17057 DAG);
17058
17059 // If the inputs all stem from a single 128-bit lane of each input, then we
17060 // split them rather than blending because the split will decompose to
17061 // unusually few instructions.
17062 int LaneCount = VT.getSizeInBits() / 128;
17063 int LaneSize = Size / LaneCount;
17064 SmallBitVector LaneInputs[2];
17065 LaneInputs[0].resize(LaneCount, false);
17066 LaneInputs[1].resize(LaneCount, false);
17067 for (int i = 0; i < Size; ++i)
17068 if (Mask[i] >= 0)
17069 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17070 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17071 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17072 /*SimpleOnly*/ false);
17073
17074 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17075 // requires that the decomposed single-input shuffles don't end up here.
17076 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17077 DAG);
17078}
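// Worked example (illustrative sketch; the mask is assumed): a v8f32 mask such
// as <2,10,2,10,2,10,2,10> reads a single element from each source (V1[2] and
// V2[2]), so DoBothBroadcast() returns true and the shuffle is decomposed into
// two single-input broadcasts that are then blended together.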
17079
17080// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17081// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17082static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17083 SDValue V1, SDValue V2,
17084 ArrayRef<int> Mask,
17085 SelectionDAG &DAG) {
17086 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17087
17088 int LHSMask[4] = {-1, -1, -1, -1};
17089 int RHSMask[4] = {-1, -1, -1, -1};
17090 unsigned SHUFPMask = 0;
17091
17092 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17093 // perform the shuffle once the lanes have been shuffled in place.
17094 for (int i = 0; i != 4; ++i) {
17095 int M = Mask[i];
17096 if (M < 0)
17097 continue;
17098 int LaneBase = i & ~1;
17099 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17100 LaneMask[LaneBase + (M & 1)] = M;
17101 SHUFPMask |= (M & 1) << i;
17102 }
17103
17104 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17105 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17106 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17107 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17108}
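// Worked example (illustrative sketch; the mask is assumed): for a v4f64
// Mask = <1,5,2,7> the loop builds LHSMask = <-1,1,2,-1>, RHSMask =
// <-1,5,-1,7> and SHUFPMask = 0xB, so SHUFP(LHS, RHS, 0xB) reassembles
// <1,5,2,7> once the two lane shuffles have put 1,2 and 5,7 in place.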
17109
17110/// Lower a vector shuffle crossing multiple 128-bit lanes as
17111/// a lane permutation followed by a per-lane permutation.
17112///
17113/// This is mainly for cases where we can have non-repeating permutes
17114/// in each lane.
17115///
17116/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17117/// we should investigate merging them.
17118static SDValue lowerShuffleAsLanePermuteAndPermute(
17119 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17120 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17121 int NumElts = VT.getVectorNumElements();
17122 int NumLanes = VT.getSizeInBits() / 128;
17123 int NumEltsPerLane = NumElts / NumLanes;
17124 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17125
17126 /// Attempts to find a sublane permute with the given size
17127 /// that gets all elements into their target lanes.
17128 ///
17129 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered shuffle.
17130 /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
17131 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17132 int NumSublanesPerLane = NumSublanes / NumLanes;
17133 int NumEltsPerSublane = NumElts / NumSublanes;
17134
17135 SmallVector<int, 16> CrossLaneMask;
17136 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17137 // CrossLaneMask but one entry == one sublane.
17138 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17139
17140 for (int i = 0; i != NumElts; ++i) {
17141 int M = Mask[i];
17142 if (M < 0)
17143 continue;
17144
17145 int SrcSublane = M / NumEltsPerSublane;
17146 int DstLane = i / NumEltsPerLane;
17147
17148 // We only need to get the elements into the right lane, not sublane.
17149 // So search all sublanes that make up the destination lane.
17150 bool Found = false;
17151 int DstSubStart = DstLane * NumSublanesPerLane;
17152 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17153 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17154 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17155 continue;
17156
17157 Found = true;
17158 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17159 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17160 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17161 break;
17162 }
17163 if (!Found)
17164 return SDValue();
17165 }
17166
17167 // Fill CrossLaneMask using CrossLaneMaskLarge.
17168 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17169
17170 if (!CanUseSublanes) {
17171 // If we're only shuffling a single lowest lane and the rest are identity
17172 // then don't bother.
17173 // TODO - isShuffleMaskInputInPlace could be extended to something like
17174 // this.
17175 int NumIdentityLanes = 0;
17176 bool OnlyShuffleLowestLane = true;
17177 for (int i = 0; i != NumLanes; ++i) {
17178 int LaneOffset = i * NumEltsPerLane;
17179 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17180 i * NumEltsPerLane))
17181 NumIdentityLanes++;
17182 else if (CrossLaneMask[LaneOffset] != 0)
17183 OnlyShuffleLowestLane = false;
17184 }
17185 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17186 return SDValue();
17187 }
17188
17189 // Avoid returning the same shuffle operation. For example,
17190 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17191 // undef:v16i16
17192 if (CrossLaneMask == Mask || InLaneMask == Mask)
17193 return SDValue();
17194
17195 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17196 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17197 InLaneMask);
17198 };
17199
17200 // First attempt a solution with full lanes.
17201 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17202 return V;
17203
17204 // The rest of the solutions use sublanes.
17205 if (!CanUseSublanes)
17206 return SDValue();
17207
17208 // Then attempt a solution with 64-bit sublanes (vpermq).
17209 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17210 return V;
17211
17212 // If that doesn't work and we have fast variable cross-lane shuffle,
17213 // attempt 32-bit sublanes (vpermd).
17214 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17215 return SDValue();
17216
17217 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17218}
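// Worked example (illustrative sketch; the mask is assumed): a full reverse of
// v8f32, Mask = <7,6,5,4,3,2,1,0>, succeeds with whole-lane sublanes:
// CrossLaneMask = <4,5,6,7,0,1,2,3> swaps the 128-bit lanes and
// InLaneMask = <3,2,1,0,7,6,5,4> then reverses the elements within each lane.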
17219
17220 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
17221static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17222 SmallVector<int> &InLaneMask) {
17223 int Size = Mask.size();
17224 InLaneMask.assign(Mask.begin(), Mask.end());
17225 for (int i = 0; i < Size; ++i) {
17226 int &M = InLaneMask[i];
17227 if (M < 0)
17228 continue;
17229 if (((M % Size) / LaneSize) != (i / LaneSize))
17230 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17231 }
17232}
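// Worked example (illustrative sketch; the mask is assumed): for v8f32
// (LaneSize = 4) and Mask = <6,7,4,5,2,3,0,1> every element crosses a lane, so
// each entry is remapped to the same in-lane slot of a second operand (offset
// by Size), giving InLaneMask = <10,11,8,9,14,15,12,13>. The caller supplies
// that operand by lane-flipping V1, making the shuffle purely in-lane.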
17233
17234/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17235/// source with a lane permutation.
17236///
17237/// This lowering strategy results in four instructions in the worst case for a
17238/// single-input cross lane shuffle which is lower than any other fully general
17239/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17240/// shuffle pattern should be handled prior to trying this lowering.
17241static SDValue lowerShuffleAsLanePermuteAndShuffle(
17242 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17243 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17244 // FIXME: This should probably be generalized for 512-bit vectors as well.
17245 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17246 int Size = Mask.size();
17247 int LaneSize = Size / 2;
17248
17249 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17250 // Only do this if the elements aren't all from the lower lane,
17251 // otherwise we're (probably) better off doing a split.
17252 if (VT == MVT::v4f64 &&
17253 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17254 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17255
17256 // If there are only inputs from one 128-bit lane, splitting will in fact be
17257 // less expensive. The flags track whether the given lane contains an element
17258 // that crosses to another lane.
17259 bool AllLanes;
17260 if (!Subtarget.hasAVX2()) {
17261 bool LaneCrossing[2] = {false, false};
17262 for (int i = 0; i < Size; ++i)
17263 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17264 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17265 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17266 } else {
17267 bool LaneUsed[2] = {false, false};
17268 for (int i = 0; i < Size; ++i)
17269 if (Mask[i] >= 0)
17270 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17271 AllLanes = LaneUsed[0] && LaneUsed[1];
17272 }
17273
17274 // TODO - we could support shuffling V2 in the Flipped input.
17275 assert(V2.isUndef() &&
17276        "This last part of this routine only works on single input shuffles");
17277
17278 SmallVector<int> InLaneMask;
17279 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17280
17281 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17282        "In-lane shuffle mask expected");
17283
17284 // If we're not using both lanes in each lane and the inlane mask is not
17285 // repeating, then we're better off splitting.
17286 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17287 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17288 /*SimpleOnly*/ false);
17289
17290 // Flip the lanes, and shuffle the results which should now be in-lane.
17291 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17292 SDValue Flipped = DAG.getBitcast(PVT, V1);
17293 Flipped =
17294 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17295 Flipped = DAG.getBitcast(VT, Flipped);
17296 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17297}
17298
17299/// Handle lowering 2-lane 128-bit shuffles.
17300static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17301 SDValue V2, ArrayRef<int> Mask,
17302 const APInt &Zeroable,
17303 const X86Subtarget &Subtarget,
17304 SelectionDAG &DAG) {
17305 if (V2.isUndef()) {
17306 // Attempt to match VBROADCAST*128 subvector broadcast load.
17307 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17308 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17309 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17310 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17311 MVT MemVT = VT.getHalfNumVectorElementsVT();
17312 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17313 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17314 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17315 VT, MemVT, Ld, Ofs, DAG))
17316 return BcstLd;
17317 }
17318
17319 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17320 if (Subtarget.hasAVX2())
17321 return SDValue();
17322 }
17323
17324 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17325
17326 SmallVector<int, 4> WidenedMask;
17327 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17328 return SDValue();
17329
17330 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17331 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17332
17333 // Try to use an insert into a zero vector.
17334 if (WidenedMask[0] == 0 && IsHighZero) {
17335 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17336 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17337 DAG.getIntPtrConstant(0, DL));
17338 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17339 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17340 DAG.getIntPtrConstant(0, DL));
17341 }
17342
17343 // TODO: If minimizing size and one of the inputs is a zero vector and the
17344 // zero vector has only one use, we could use a VPERM2X128 to save the
17345 // instruction bytes needed to explicitly generate the zero vector.
17346
17347 // Blends are faster and handle all the non-lane-crossing cases.
17348 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17349 Subtarget, DAG))
17350 return Blend;
17351
17352 // If either input operand is a zero vector, use VPERM2X128 because its mask
17353 // allows us to replace the zero input with an implicit zero.
17354 if (!IsLowZero && !IsHighZero) {
17355 // Check for patterns which can be matched with a single insert of a 128-bit
17356 // subvector.
17357 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17358 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17359
17360 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17361 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17362 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17363 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17364 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17365 OnlyUsesV1 ? V1 : V2,
17366 DAG.getIntPtrConstant(0, DL));
17367 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17368 DAG.getIntPtrConstant(2, DL));
17369 }
17370 }
17371
17372 // Try to use SHUF128 if possible.
17373 if (Subtarget.hasVLX()) {
17374 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17375 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17376 ((WidenedMask[1] % 2) << 1);
17377 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17378 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17379 }
17380 }
17381 }
17382
17383 // Otherwise form a 128-bit permutation. After accounting for undefs,
17384 // convert the 64-bit shuffle mask selection values into 128-bit
17385 // selection bits by dividing the indexes by 2 and shifting into positions
17386 // defined by a vperm2*128 instruction's immediate control byte.
17387
17388 // The immediate permute control byte looks like this:
17389 // [1:0] - select 128 bits from sources for low half of destination
17390 // [2] - ignore
17391 // [3] - zero low half of destination
17392 // [5:4] - select 128 bits from sources for high half of destination
17393 // [6] - ignore
17394 // [7] - zero high half of destination
17395
17396 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17397        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17398
17399 unsigned PermMask = 0;
17400 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17401 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17402
17403 // Check the immediate mask and replace unused sources with undef.
17404 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17405 V1 = DAG.getUNDEF(VT);
17406 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17407 V2 = DAG.getUNDEF(VT);
17408
17409 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17410 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17411}
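// Worked example (illustrative sketch; the mask is assumed): a v4f64
// Mask = <2,3,4,5> widens to <1,2>, so PermMask = (1 << 0) | (2 << 4) = 0x21,
// the usual vperm2f128 selection of the high half of V1 and the low half of V2.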
17412
17413/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17414/// shuffling each lane.
17415///
17416/// This attempts to create a repeated lane shuffle where each lane uses one
17417/// or two of the lanes of the inputs. The lanes of the input vectors are
17418/// shuffled in one or two independent shuffles to get the lanes into the
17419/// position needed by the final shuffle.
17420static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17421 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17422 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17423 assert(!V2.isUndef() && "This is only useful with multiple inputs.")(static_cast <bool> (!V2.isUndef() && "This is only useful with multiple inputs."
) ? void (0) : __assert_fail ("!V2.isUndef() && \"This is only useful with multiple inputs.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17423, __extension__
__PRETTY_FUNCTION__))
;
17424
17425 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17426 return SDValue();
17427
17428 int NumElts = Mask.size();
17429 int NumLanes = VT.getSizeInBits() / 128;
17430 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17431 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17432 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17433
17434 // First pass will try to fill in the RepeatMask from lanes that need two
17435 // sources.
17436 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17437 int Srcs[2] = {-1, -1};
17438 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17439 for (int i = 0; i != NumLaneElts; ++i) {
17440 int M = Mask[(Lane * NumLaneElts) + i];
17441 if (M < 0)
17442 continue;
17443 // Determine which of the possible input lanes (NumLanes from each source)
17444 // this element comes from. Assign that as one of the sources for this
17445 // lane. We can assign up to 2 sources for this lane. If we run out of
17446 // sources we can't do anything.
17447 int LaneSrc = M / NumLaneElts;
17448 int Src;
17449 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17450 Src = 0;
17451 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17452 Src = 1;
17453 else
17454 return SDValue();
17455
17456 Srcs[Src] = LaneSrc;
17457 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17458 }
17459
17460 // If this lane has two sources, see if it fits with the repeat mask so far.
17461 if (Srcs[1] < 0)
17462 continue;
17463
17464 LaneSrcs[Lane][0] = Srcs[0];
17465 LaneSrcs[Lane][1] = Srcs[1];
17466
17467 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17468 assert(M1.size() == M2.size() && "Unexpected mask size");
17469 for (int i = 0, e = M1.size(); i != e; ++i)
17470 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17471 return false;
17472 return true;
17473 };
17474
17475 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17476 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17477 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17478 int M = Mask[i];
17479 if (M < 0)
17480 continue;
17481 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17482        "Unexpected mask element");
17483 MergedMask[i] = M;
17484 }
17485 };
17486
17487 if (MatchMasks(InLaneMask, RepeatMask)) {
17488 // Merge this lane mask into the final repeat mask.
17489 MergeMasks(InLaneMask, RepeatMask);
17490 continue;
17491 }
17492
17493 // Didn't find a match. Swap the operands and try again.
17494 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17495 ShuffleVectorSDNode::commuteMask(InLaneMask);
17496
17497 if (MatchMasks(InLaneMask, RepeatMask)) {
17498 // Merge this lane mask into the final repeat mask.
17499 MergeMasks(InLaneMask, RepeatMask);
17500 continue;
17501 }
17502
17503 // Couldn't find a match with the operands in either order.
17504 return SDValue();
17505 }
17506
17507 // Now handle any lanes with only one source.
17508 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17509 // If this lane has already been processed, skip it.
17510 if (LaneSrcs[Lane][0] >= 0)
17511 continue;
17512
17513 for (int i = 0; i != NumLaneElts; ++i) {
17514 int M = Mask[(Lane * NumLaneElts) + i];
17515 if (M < 0)
17516 continue;
17517
17518 // If RepeatMask isn't defined yet we can define it ourself.
17519 if (RepeatMask[i] < 0)
17520 RepeatMask[i] = M % NumLaneElts;
17521
17522 if (RepeatMask[i] < NumElts) {
17523 if (RepeatMask[i] != M % NumLaneElts)
17524 return SDValue();
17525 LaneSrcs[Lane][0] = M / NumLaneElts;
17526 } else {
17527 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17528 return SDValue();
17529 LaneSrcs[Lane][1] = M / NumLaneElts;
17530 }
17531 }
17532
17533 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17534 return SDValue();
17535 }
17536
17537 SmallVector<int, 16> NewMask(NumElts, -1);
17538 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17539 int Src = LaneSrcs[Lane][0];
17540 for (int i = 0; i != NumLaneElts; ++i) {
17541 int M = -1;
17542 if (Src >= 0)
17543 M = Src * NumLaneElts + i;
17544 NewMask[Lane * NumLaneElts + i] = M;
17545 }
17546 }
17547 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17548 // Ensure we didn't get back the shuffle we started with.
17549 // FIXME: This is a hack to make up for some splat handling code in
17550 // getVectorShuffle.
17551 if (isa<ShuffleVectorSDNode>(NewV1) &&
17552 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17553 return SDValue();
17554
17555 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17556 int Src = LaneSrcs[Lane][1];
17557 for (int i = 0; i != NumLaneElts; ++i) {
17558 int M = -1;
17559 if (Src >= 0)
17560 M = Src * NumLaneElts + i;
17561 NewMask[Lane * NumLaneElts + i] = M;
17562 }
17563 }
17564 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17565 // Ensure we didn't get back the shuffle we started with.
17566 // FIXME: This is a hack to make up for some splat handling code in
17567 // getVectorShuffle.
17568 if (isa<ShuffleVectorSDNode>(NewV2) &&
17569 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17570 return SDValue();
17571
17572 for (int i = 0; i != NumElts; ++i) {
17573 if (Mask[i] < 0) {
17574 NewMask[i] = -1;
17575 continue;
17576 }
17577 NewMask[i] = RepeatMask[i % NumLaneElts];
17578 if (NewMask[i] < 0)
17579 continue;
17580
17581 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17582 }
17583 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17584}
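// Worked example (illustrative sketch; the mask is assumed): for v8f32
// Mask = <0,12,1,13,4,8,5,9> both lanes repeat the pattern <0,8,1,9>, with lane
// sources {V1 lane 0, V2 lane 1} and {V1 lane 1, V2 lane 0}. NewV1 stays V1,
// NewV2 permutes V2's lanes via <12,13,14,15,8,9,10,11>, and the final
// lane-repeated mask <0,8,1,9,4,12,5,13> is an UNPCKLPS of the two.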
17585
17586/// If the input shuffle mask results in a vector that is undefined in all upper
17587/// or lower half elements and that mask accesses only 2 halves of the
17588/// shuffle's operands, return true. A mask of half the width with mask indexes
17589/// adjusted to access the extracted halves of the original shuffle operands is
17590/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17591/// lower half of each input operand is accessed.
17592static bool
17593getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17594 int &HalfIdx1, int &HalfIdx2) {
17595 assert((Mask.size() == HalfMask.size() * 2) &&
17596        "Expected input mask to be twice as long as output");
17597
17598 // Exactly one half of the result must be undef to allow narrowing.
17599 bool UndefLower = isUndefLowerHalf(Mask);
17600 bool UndefUpper = isUndefUpperHalf(Mask);
17601 if (UndefLower == UndefUpper)
17602 return false;
17603
17604 unsigned HalfNumElts = HalfMask.size();
17605 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17606 HalfIdx1 = -1;
17607 HalfIdx2 = -1;
17608 for (unsigned i = 0; i != HalfNumElts; ++i) {
17609 int M = Mask[i + MaskIndexOffset];
17610 if (M < 0) {
17611 HalfMask[i] = M;
17612 continue;
17613 }
17614
17615 // Determine which of the 4 half vectors this element is from.
17616 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17617 int HalfIdx = M / HalfNumElts;
17618
17619 // Determine the element index into its half vector source.
17620 int HalfElt = M % HalfNumElts;
17621
17622 // We can shuffle with up to 2 half vectors, set the new 'half'
17623 // shuffle mask accordingly.
17624 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17625 HalfMask[i] = HalfElt;
17626 HalfIdx1 = HalfIdx;
17627 continue;
17628 }
17629 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17630 HalfMask[i] = HalfElt + HalfNumElts;
17631 HalfIdx2 = HalfIdx;
17632 continue;
17633 }
17634
17635 // Too many half vectors referenced.
17636 return false;
17637 }
17638
17639 return true;
17640}
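// Worked example (illustrative sketch; the mask is assumed): for a v8i32
// Mask = <u,u,u,u,0,12,1,13> the lower half of the result is undef, the defined
// half touches only the low half of V1 (HalfIdx1 = 0) and the high half of V2
// (HalfIdx2 = 3), and HalfMask becomes <0,4,1,5>.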
17641
17642/// Given the output values from getHalfShuffleMask(), create a half width
17643/// shuffle of extracted vectors followed by an insert back to full width.
17644static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17645 ArrayRef<int> HalfMask, int HalfIdx1,
17646 int HalfIdx2, bool UndefLower,
17647 SelectionDAG &DAG, bool UseConcat = false) {
17648 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17649 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17650
17651 MVT VT = V1.getSimpleValueType();
17652 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17653 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17654
17655 auto getHalfVector = [&](int HalfIdx) {
17656 if (HalfIdx < 0)
17657 return DAG.getUNDEF(HalfVT);
17658 SDValue V = (HalfIdx < 2 ? V1 : V2);
17659 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17660 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17661 DAG.getIntPtrConstant(HalfIdx, DL));
17662 };
17663
17664 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17665 SDValue Half1 = getHalfVector(HalfIdx1);
17666 SDValue Half2 = getHalfVector(HalfIdx2);
17667 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17668 if (UseConcat) {
17669 SDValue Op0 = V;
17670 SDValue Op1 = DAG.getUNDEF(HalfVT);
17671 if (UndefLower)
17672 std::swap(Op0, Op1);
17673 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17674 }
17675
17676 unsigned Offset = UndefLower ? HalfNumElts : 0;
17677 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17678 DAG.getIntPtrConstant(Offset, DL));
17679}
17680
17681/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17682/// This allows for fast cases such as subvector extraction/insertion
17683/// or shuffling smaller vector types which can lower more efficiently.
17684static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17685 SDValue V2, ArrayRef<int> Mask,
17686 const X86Subtarget &Subtarget,
17687 SelectionDAG &DAG) {
17688 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17689        "Expected 256-bit or 512-bit vector");
17690
17691 bool UndefLower = isUndefLowerHalf(Mask);
17692 if (!UndefLower && !isUndefUpperHalf(Mask))
17693 return SDValue();
17694
17695 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17696        "Completely undef shuffle mask should have been simplified already");
17697
17698 // Upper half is undef and lower half is whole upper subvector.
17699 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17700 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17701 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17702 if (!UndefLower &&
17703 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17704 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17705 DAG.getIntPtrConstant(HalfNumElts, DL));
17706 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17707 DAG.getIntPtrConstant(0, DL));
17708 }
17709
17710 // Lower half is undef and upper half is whole lower subvector.
17711 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17712 if (UndefLower &&
17713 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17714 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17715 DAG.getIntPtrConstant(0, DL));
17716 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17717 DAG.getIntPtrConstant(HalfNumElts, DL));
17718 }
17719
17720 int HalfIdx1, HalfIdx2;
17721 SmallVector<int, 8> HalfMask(HalfNumElts);
17722 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17723 return SDValue();
17724
17725 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17726
17727 // Only shuffle the halves of the inputs when useful.
17728 unsigned NumLowerHalves =
17729 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17730 unsigned NumUpperHalves =
17731 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17732 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17733
17734 // Determine the larger pattern of undef/halves, then decide if it's worth
17735 // splitting the shuffle based on subtarget capabilities and types.
17736 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17737 if (!UndefLower) {
17738 // XXXXuuuu: no insert is needed.
17739 // Always extract lowers when setting lower - these are all free subreg ops.
17740 if (NumUpperHalves == 0)
17741 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17742 UndefLower, DAG);
17743
17744 if (NumUpperHalves == 1) {
17745 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17746 if (Subtarget.hasAVX2()) {
17747 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17748 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17749 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17750 (!isSingleSHUFPSMask(HalfMask) ||
17751 Subtarget.hasFastVariableCrossLaneShuffle()))
17752 return SDValue();
17753 // If this is a unary shuffle (assume that the 2nd operand is
17754 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17755 // are better off extracting the upper half of 1 operand and using a
17756 // narrow shuffle.
17757 if (EltWidth == 64 && V2.isUndef())
17758 return SDValue();
17759 }
17760 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17761 if (Subtarget.hasAVX512() && VT.is512BitVector())
17762 return SDValue();
17763 // Extract + narrow shuffle is better than the wide alternative.
17764 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17765 UndefLower, DAG);
17766 }
17767
17768 // Don't extract both uppers, instead shuffle and then extract.
17769 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17770 return SDValue();
17771 }
17772
17773 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17774 if (NumUpperHalves == 0) {
17775 // AVX2 has efficient 64-bit element cross-lane shuffles.
17776 // TODO: Refine to account for unary shuffle, splat, and other masks?
17777 if (Subtarget.hasAVX2() && EltWidth == 64)
17778 return SDValue();
17779 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17780 if (Subtarget.hasAVX512() && VT.is512BitVector())
17781 return SDValue();
17782 // Narrow shuffle + insert is better than the wide alternative.
17783 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17784 UndefLower, DAG);
17785 }
17786
17787 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17788 return SDValue();
17789}
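// Worked example (illustrative sketch; the mask is assumed): a v8f32
// Mask = <2,10,3,11,u,u,u,u> has an undef upper half and uses only the low
// halves of V1 and V2 (NumUpperHalves == 0), so it lowers to two free
// subvector extracts, a v4f32 shuffle <2,6,3,7>, and an insert at offset 0.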
17790
17791/// Handle case where shuffle sources are coming from the same 128-bit lane and
17792/// every lane can be represented as the same repeating mask - allowing us to
17793/// shuffle the sources with the repeating shuffle and then permute the result
17794/// to the destination lanes.
17795static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17796 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17797 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17798 int NumElts = VT.getVectorNumElements();
17799 int NumLanes = VT.getSizeInBits() / 128;
17800 int NumLaneElts = NumElts / NumLanes;
17801
17802 // On AVX2 we may be able to just shuffle the lowest elements and then
17803 // broadcast the result.
17804 if (Subtarget.hasAVX2()) {
17805 for (unsigned BroadcastSize : {16, 32, 64}) {
17806 if (BroadcastSize <= VT.getScalarSizeInBits())
17807 continue;
17808 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17809
17810 // Attempt to match a repeating pattern every NumBroadcastElts,
17811 // accounting for UNDEFs, but only referencing the lowest 128-bit
17812 // lane of the inputs.
17813 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17814 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17815 for (int j = 0; j != NumBroadcastElts; ++j) {
17816 int M = Mask[i + j];
17817 if (M < 0)
17818 continue;
17819 int &R = RepeatMask[j];
17820 if (0 != ((M % NumElts) / NumLaneElts))
17821 return false;
17822 if (0 <= R && R != M)
17823 return false;
17824 R = M;
17825 }
17826 return true;
17827 };
17828
17829 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17830 if (!FindRepeatingBroadcastMask(RepeatMask))
17831 continue;
17832
17833 // Shuffle the (lowest) repeated elements in place for broadcast.
17834 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17835
17836 // Shuffle the actual broadcast.
17837 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17838 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17839 for (int j = 0; j != NumBroadcastElts; ++j)
17840 BroadcastMask[i + j] = j;
17841 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17842 BroadcastMask);
17843 }
17844 }
17845
17846 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17847 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17848 return SDValue();
17849
17850 // Bail if we already have a repeated lane shuffle mask.
17851 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17852 return SDValue();
17853
17854 // Helper to look for a repeated mask in each split sublane, and check that
17855 // those sublanes can then be permuted into place.
17856 auto ShuffleSubLanes = [&](int SubLaneScale) {
17857 int NumSubLanes = NumLanes * SubLaneScale;
17858 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17859
17860 // Check that all the sources are coming from the same lane and see if we
17861 // can form a repeating shuffle mask (local to each sub-lane). At the same
17862 // time, determine the source sub-lane for each destination sub-lane.
17863 int TopSrcSubLane = -1;
17864 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17865 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17866 SubLaneScale,
17867 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17868
17869 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17870 // Extract the sub-lane mask, check that it all comes from the same lane
17871 // and normalize the mask entries to come from the first lane.
17872 int SrcLane = -1;
17873 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17874 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17875 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17876 if (M < 0)
17877 continue;
17878 int Lane = (M % NumElts) / NumLaneElts;
17879 if ((0 <= SrcLane) && (SrcLane != Lane))
17880 return SDValue();
17881 SrcLane = Lane;
17882 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17883 SubLaneMask[Elt] = LocalM;
17884 }
17885
17886 // Whole sub-lane is UNDEF.
17887 if (SrcLane < 0)
17888 continue;
17889
17890 // Attempt to match against the candidate repeated sub-lane masks.
17891 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17892 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17893 for (int i = 0; i != NumSubLaneElts; ++i) {
17894 if (M1[i] < 0 || M2[i] < 0)
17895 continue;
17896 if (M1[i] != M2[i])
17897 return false;
17898 }
17899 return true;
17900 };
17901
17902 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17903 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17904 continue;
17905
17906 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17907 for (int i = 0; i != NumSubLaneElts; ++i) {
17908 int M = SubLaneMask[i];
17909 if (M < 0)
17910 continue;
17911 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17912        "Unexpected mask element");
17913 RepeatedSubLaneMask[i] = M;
17914 }
17915
17916 // Track the top most source sub-lane - by setting the remaining to
17917 // UNDEF we can greatly simplify shuffle matching.
17918 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17919 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17920 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17921 break;
17922 }
17923
17924 // Bail if we failed to find a matching repeated sub-lane mask.
17925 if (Dst2SrcSubLanes[DstSubLane] < 0)
17926 return SDValue();
17927 }
17928 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17929        "Unexpected source lane");
17930
17931 // Create a repeating shuffle mask for the entire vector.
17932 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17933 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17934 int Lane = SubLane / SubLaneScale;
17935 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17936 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17937 int M = RepeatedSubLaneMask[Elt];
17938 if (M < 0)
17939 continue;
17940 int Idx = (SubLane * NumSubLaneElts) + Elt;
17941 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17942 }
17943 }
17944
17945 // Shuffle each source sub-lane to its destination.
17946 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17947 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17948 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17949 if (SrcSubLane < 0)
17950 continue;
17951 for (int j = 0; j != NumSubLaneElts; ++j)
17952 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17953 }
17954
17955 // Avoid returning the same shuffle operation.
17956 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17957 if (RepeatedMask == Mask || SubLaneMask == Mask)
17958 return SDValue();
17959
17960 SDValue RepeatedShuffle =
17961 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17962
17963 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17964 SubLaneMask);
17965 };
17966
17967 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17968 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
17969 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
17970 // Otherwise we can only permute whole 128-bit lanes.
17971 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
17972 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
17973 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
17974 MinSubLaneScale = 2;
17975 MaxSubLaneScale =
17976 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
17977 }
17978 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17979 MinSubLaneScale = MaxSubLaneScale = 4;
17980
17981 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
17982 if (SDValue Shuffle = ShuffleSubLanes(Scale))
17983 return Shuffle;
17984
17985 return SDValue();
17986}
17987
17988static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17989 bool &ForceV1Zero, bool &ForceV2Zero,
17990 unsigned &ShuffleImm, ArrayRef<int> Mask,
17991 const APInt &Zeroable) {
17992 int NumElts = VT.getVectorNumElements();
17993 assert(VT.getScalarSizeInBits() == 64 &&
17994        (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17995        "Unexpected data type for VSHUFPD");
17996 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17997        "Illegal shuffle mask");
17998
17999 bool ZeroLane[2] = { true, true };
18000 for (int i = 0; i < NumElts; ++i)
18001 ZeroLane[i & 1] &= Zeroable[i];
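// (Descriptive note, not in the original source: after this loop ZeroLane[0]
//  is true only if every even-indexed mask element is zeroable and ZeroLane[1]
//  only if every odd-indexed one is, i.e. the corresponding SHUFPD operand
//  (V1 feeds the even result slots, V2 the odd ones) can be replaced by a
//  zero vector below.)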
18002
18003 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18004 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
18005 ShuffleImm = 0;
18006 bool ShufpdMask = true;
18007 bool CommutableMask = true;
18008 for (int i = 0; i < NumElts; ++i) {
18009 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18010 continue;
18011 if (Mask[i] < 0)
18012 return false;
18013 int Val = (i & 6) + NumElts * (i & 1);
18014 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18015 if (Mask[i] < Val || Mask[i] > Val + 1)
18016 ShufpdMask = false;
18017 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18018 CommutableMask = false;
18019 ShuffleImm |= (Mask[i] % 2) << i;
18020 }
18021
18022 if (!ShufpdMask && !CommutableMask)
18023 return false;
18024
18025 if (!ShufpdMask && CommutableMask)
18026 std::swap(V1, V2);
18027
18028 ForceV1Zero = ZeroLane[0];
18029 ForceV2Zero = ZeroLane[1];
18030 return true;
18031}
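The following is a minimal standalone sketch (plain integers and printf, not LLVM code) of the immediate math in matchShuffleWithSHUFPD above, assuming a v4f64 mask of {1, 5, 2, 7}:

#include <cstdio>

int main() {
  const int NumElts = 4;
  int Mask[NumElts] = {1, 5, 2, 7};            // result element i takes source Mask[i]
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);     // SHUFPD slot i may only read {Val, Val+1}
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;          // bit i picks the low/high element of that pair
  }
  std::printf("ShufpdMask=%d ShuffleImm=0x%x\n", ShufpdMask, ShuffleImm); // prints 1 and 0xb
  return 0;
}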
18032
18033static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18034 SDValue V2, ArrayRef<int> Mask,
18035 const APInt &Zeroable,
18036 const X86Subtarget &Subtarget,
18037 SelectionDAG &DAG) {
18038 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18039        "Unexpected data type for VSHUFPD");
18040
18041 unsigned Immediate = 0;
18042 bool ForceV1Zero = false, ForceV2Zero = false;
18043 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18044 Mask, Zeroable))
18045 return SDValue();
18046
18047 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18048 if (ForceV1Zero)
18049 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18050 if (ForceV2Zero)
18051 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18052
18053 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18054 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18055}
18056
18057// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
18058// by zeroable elements in the remaining 24 elements. Turn this into two
18059// vmovqb instructions shuffled together.
18060static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18061 SDValue V1, SDValue V2,
18062 ArrayRef<int> Mask,
18063 const APInt &Zeroable,
18064 SelectionDAG &DAG) {
18065 assert(VT == MVT::v32i8 && "Unexpected type!");
18066
18067 // The first 8 indices should be every 8th element.
18068 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18069 return SDValue();
18070
18071 // Remaining elements need to be zeroable.
18072 if (Zeroable.countl_one() < (Mask.size() - 8))
18073 return SDValue();
18074
18075 V1 = DAG.getBitcast(MVT::v4i64, V1);
18076 V2 = DAG.getBitcast(MVT::v4i64, V2);
18077
18078 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18079 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18080
18081 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18082 // the upper bits of the result using an unpckldq.
18083 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18084 { 0, 1, 2, 3, 16, 17, 18, 19,
18085 4, 5, 6, 7, 20, 21, 22, 23 });
18086 // Insert the unpckldq into a zero vector to widen to v32i8.
18087 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18088 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18089 DAG.getIntPtrConstant(0, DL));
18090}
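As a rough standalone model (plain index arrays, not LLVM code) of the byte flow described above, label V1's bytes 0..31 and V2's bytes 32..63 and run them through the truncate-then-interleave steps:

#include <array>
#include <cstdio>

int main() {
  std::array<int, 32> V1, V2;                  // V1 bytes labelled 0..31, V2 bytes 32..63
  for (int I = 0; I < 32; ++I) { V1[I] = I; V2[I] = 32 + I; }

  std::array<int, 16> T1, T2;                  // VTRUNC v4i64 -> v16i8: keep the low byte of
  for (int I = 0; I < 4; ++I) {                // each qword; the upper 12 bytes become zero
    T1[I] = V1[I * 8];                         // (zeros marked as -1 here)
    T2[I] = V2[I * 8];
  }
  for (int I = 4; I < 16; ++I) { T1[I] = -1; T2[I] = -1; }

  std::array<int, 16> Unpack;                  // unpckldq-style interleave of 4-byte chunks
  for (int I = 0; I < 4; ++I) {
    Unpack[I] = T1[I];
    Unpack[I + 4] = T2[I];
    Unpack[I + 8] = T1[I + 4];
    Unpack[I + 12] = T2[I + 4];
  }
  for (int I = 0; I < 16; ++I)
    std::printf("%d ", Unpack[I]);             // 0 8 16 24 32 40 48 56 -1 ... -1
  std::printf("\n");
  return 0;
}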
18091
18092// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18093// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18094// =>
18095// ul = unpckl v1, v2
18096// uh = unpckh v1, v2
18097// a = vperm ul, uh
18098// b = vperm ul, uh
18099//
18100// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18101// and permute. We cannot directly match v3 because it is split into two
18102// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18103// pair of 256-bit shuffles and makes sure the masks are consecutive.
18104//
18105// Once unpck and permute nodes are created, the permute corresponding to this
18106// shuffle is returned, while the other permute replaces the other half of the
18107// shuffle in the selection dag.
18108static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18109 SDValue V1, SDValue V2,
18110 ArrayRef<int> Mask,
18111 SelectionDAG &DAG) {
18112 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18113 VT != MVT::v32i8)
18114 return SDValue();
18115 // <B0, B1, B0+1, B1+1, ..., >
18116 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18117 unsigned Begin1) {
18118 size_t Size = Mask.size();
18119 assert(Size % 2 == 0 && "Expected even mask size");
18120 for (unsigned I = 0; I < Size; I += 2) {
18121 if (Mask[I] != (int)(Begin0 + I / 2) ||
18122 Mask[I + 1] != (int)(Begin1 + I / 2))
18123 return false;
18124 }
18125 return true;
18126 };
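  // (Illustrative note, not in the original source: for v8f32 the "first half"
  //  pattern IsInterleavingPattern(Mask, 0, 8) matches <0, 8, 1, 9, 2, 10, 3, 11>
  //  and the "second half" pattern IsInterleavingPattern(Mask, 4, 12) matches
  //  <4, 12, 5, 13, 6, 14, 7, 15>.)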
18127 // Check which half is this shuffle node
18128 int NumElts = VT.getVectorNumElements();
18129 size_t FirstQtr = NumElts / 2;
18130 size_t ThirdQtr = NumElts + NumElts / 2;
18131 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18132 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18133 if (!IsFirstHalf && !IsSecondHalf)
18134 return SDValue();
18135
18136 // Find the intersection between shuffle users of V1 and V2.
18137 SmallVector<SDNode *, 2> Shuffles;
18138 for (SDNode *User : V1->uses())
18139 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18140 User->getOperand(1) == V2)
18141 Shuffles.push_back(User);
18142 // Limit user size to two for now.
18143 if (Shuffles.size() != 2)
18144 return SDValue();
18145 // Find out which half of the 512-bit shuffles is each smaller shuffle
18146 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18147 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18148 SDNode *FirstHalf;
18149 SDNode *SecondHalf;
18150 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18151 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18152 FirstHalf = Shuffles[0];
18153 SecondHalf = Shuffles[1];
18154 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18155 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18156 FirstHalf = Shuffles[1];
18157 SecondHalf = Shuffles[0];
18158 } else {
18159 return SDValue();
18160 }
18161 // Lower into unpck and perm. Return the perm of this shuffle and replace
18162 // the other.
18163 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18164 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18165 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18166 DAG.getTargetConstant(0x20, DL, MVT::i8));
18167 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18168 DAG.getTargetConstant(0x31, DL, MVT::i8));
18169 if (IsFirstHalf) {
18170 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18171 return Perm1;
18172 }
18173 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18174 return Perm2;
18175}
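A small standalone check (plain index arrays, not LLVM code) that the UNPCKL/UNPCKH plus VPERM2X128 immediates 0x20 and 0x31 used above reproduce the two interleaving masks for a v8f32-sized pair:

#include <array>
#include <cstdio>

int main() {
  // Label V1's elements 0..7 and V2's 8..15, matching shuffle-mask numbering.
  std::array<int, 8> Unpckl, Unpckh, Perm20, Perm31;
  for (int Lane = 0; Lane < 2; ++Lane)
    for (int I = 0; I < 2; ++I) {
      Unpckl[Lane * 4 + 2 * I]     = Lane * 4 + I;      // V1 element
      Unpckl[Lane * 4 + 2 * I + 1] = 8 + Lane * 4 + I;  // V2 element
      Unpckh[Lane * 4 + 2 * I]     = Lane * 4 + 2 + I;
      Unpckh[Lane * 4 + 2 * I + 1] = 8 + Lane * 4 + 2 + I;
    }
  for (int I = 0; I < 4; ++I) {
    Perm20[I] = Unpckl[I];                               // imm 0x20: low 128 of each unpck
    Perm20[I + 4] = Unpckh[I];
    Perm31[I] = Unpckl[I + 4];                           // imm 0x31: high 128 of each unpck
    Perm31[I + 4] = Unpckh[I + 4];
  }
  for (int I = 0; I < 8; ++I) std::printf("%d ", Perm20[I]); // 0 8 1 9 2 10 3 11
  std::printf("\n");
  for (int I = 0; I < 8; ++I) std::printf("%d ", Perm31[I]); // 4 12 5 13 6 14 7 15
  std::printf("\n");
  return 0;
}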
18176
18177/// Handle lowering of 4-lane 64-bit floating point shuffles.
18178///
18179/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18180/// isn't available.
18181static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18182 const APInt &Zeroable, SDValue V1, SDValue V2,
18183 const X86Subtarget &Subtarget,
18184 SelectionDAG &DAG) {
18185 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18186 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18187 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18188
18189 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18190 Subtarget, DAG))
18191 return V;
18192
18193 if (V2.isUndef()) {
18194 // Check for being able to broadcast a single element.
18195 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18196 Mask, Subtarget, DAG))
18197 return Broadcast;
18198
18199 // Use low duplicate instructions for masks that match their pattern.
18200 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18201 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18202
18203 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18204 // Non-half-crossing single input shuffles can be lowered with an
18205 // interleaved permutation.
18206 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18207 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18208 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18209 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18210 }
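      // (Illustrative note, not in the original source: for the in-lane swap
      //  mask {1, 0, 3, 2}, the VPERMILPMask expression above evaluates to
      //  1 | 0 | 4 | 0, i.e. a VPERMILPD immediate of 0b0101.)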
18211
18212 // With AVX2 we have direct support for this permutation.
18213 if (Subtarget.hasAVX2())
18214 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18215 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18216
18217 // Try to create an in-lane repeating shuffle mask and then shuffle the
18218 // results into the target lanes.
18219 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18220 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18221 return V;
18222
18223 // Try to permute the lanes and then use a per-lane permute.
18224 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18225 Mask, DAG, Subtarget))
18226 return V;
18227
18228 // Otherwise, fall back.
18229 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18230 DAG, Subtarget);
18231 }
18232
18233 // Use dedicated unpack instructions for masks that match their pattern.
18234 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18235 return V;
18236
18237 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18238 Zeroable, Subtarget, DAG))
18239 return Blend;
18240
18241 // Check if the blend happens to exactly fit that of SHUFPD.
18242 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18243 Zeroable, Subtarget, DAG))
18244 return Op;
18245
18246 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18247 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18248
18249 // If we have lane crossing shuffles AND they don't all come from the lower
18250 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18251 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18252 // canonicalize to a blend of splat which isn't necessary for this combine.
18253 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18254 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18255 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18256 (V2.getOpcode() != ISD::BUILD_VECTOR))
18257 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18258
18259 // If we have one input in place, then we can permute the other input and
18260 // blend the result.
18261 if (V1IsInPlace || V2IsInPlace)
18262 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18263 Subtarget, DAG);
18264
18265 // Try to create an in-lane repeating shuffle mask and then shuffle the
18266 // results into the target lanes.
18267 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18268 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18269 return V;
18270
18271 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18272 // shuffle. However, if we have AVX2 and either inputs are already in place,
18273 // we will be able to shuffle even across lanes the other input in a single
18274 // instruction so skip this pattern.
18275 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18276 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18277 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18278 return V;
18279
18280 // If we have VLX support, we can use VEXPAND.
18281 if (Subtarget.hasVLX())
18282 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18283 DAG, Subtarget))
18284 return V;
18285
18286 // If we have AVX2 then we always want to lower with a blend because at v4 we
18287 // can fully permute the elements.
18288 if (Subtarget.hasAVX2())
18289 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18290 Subtarget, DAG);
18291
18292 // Otherwise fall back on generic lowering.
18293 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18294 Subtarget, DAG);
18295}
18296
18297/// Handle lowering of 4-lane 64-bit integer shuffles.
18298///
18299/// This routine is only called when we have AVX2 and thus a reasonable
18300/// instruction set for v4i64 shuffling.
18301static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18302 const APInt &Zeroable, SDValue V1, SDValue V2,
18303 const X86Subtarget &Subtarget,
18304 SelectionDAG &DAG) {
18305 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18306 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18307 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18308 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18309
18310 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18311 Subtarget, DAG))
18312 return V;
18313
18314 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18315 Zeroable, Subtarget, DAG))
18316 return Blend;
18317
18318 // Check for being able to broadcast a single element.
18319 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18320 Subtarget, DAG))
18321 return Broadcast;
18322
18323 // Try to use shift instructions if fast.
18324 if (Subtarget.preferLowerShuffleAsShift())
18325 if (SDValue Shift =
18326 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18327 Subtarget, DAG, /*BitwiseOnly*/ true))
18328 return Shift;
18329
18330 if (V2.isUndef()) {
18331 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18332 // can use lower latency instructions that will operate on both lanes.
18333 SmallVector<int, 2> RepeatedMask;
18334 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18335 SmallVector<int, 4> PSHUFDMask;
18336 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18337 return DAG.getBitcast(
18338 MVT::v4i64,
18339 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18340 DAG.getBitcast(MVT::v8i32, V1),
18341 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18342 }
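    // (Illustrative note, not in the original source: for Mask = {1, 0, 3, 2}
    //  the per-128-bit-lane repeated mask is {1, 0}, which narrowShuffleMaskElts
    //  expands to the 32-bit-element PSHUFD mask {2, 3, 0, 1}, i.e. the
    //  immediate 0x4E.)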
18343
18344 // AVX2 provides a direct instruction for permuting a single input across
18345 // lanes.
18346 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18347 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18348 }
18349
18350 // Try to use shift instructions.
18351 if (SDValue Shift =
18352 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18353 DAG, /*BitwiseOnly*/ false))
18354 return Shift;
18355
18356 // If we have VLX support, we can use VALIGN or VEXPAND.
18357 if (Subtarget.hasVLX()) {
18358 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18359 Subtarget, DAG))
18360 return Rotate;
18361
18362 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18363 DAG, Subtarget))
18364 return V;
18365 }
18366
18367 // Try to use PALIGNR.
18368 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18369 Subtarget, DAG))
18370 return Rotate;
18371
18372 // Use dedicated unpack instructions for masks that match their pattern.
18373 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18374 return V;
18375
18376 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18377 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18378
18379 // If we have one input in place, then we can permute the other input and
18380 // blend the result.
18381 if (V1IsInPlace || V2IsInPlace)
18382 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18383 Subtarget, DAG);
18384
18385 // Try to create an in-lane repeating shuffle mask and then shuffle the
18386 // results into the target lanes.
18387 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18388 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18389 return V;
18390
18391 // Try to lower to PERMQ(BLENDD(V1,V2)).
18392 if (SDValue V =
18393 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18394 return V;
18395
18396 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18397 // shuffle. However, if we have AVX2 and either inputs are already in place,
18398 // we will be able to shuffle even across lanes the other input in a single
18399 // instruction so skip this pattern.
18400 if (!V1IsInPlace && !V2IsInPlace)
18401 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18402 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18403 return Result;
18404
18405 // Otherwise fall back on generic blend lowering.
18406 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18407 Subtarget, DAG);
18408}
18409
18410/// Handle lowering of 8-lane 32-bit floating point shuffles.
18411///
18412/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18413/// isn't available.
18414static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18415 const APInt &Zeroable, SDValue V1, SDValue V2,
18416 const X86Subtarget &Subtarget,
18417 SelectionDAG &DAG) {
18418 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
1: '?' condition is true
18419 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
2: '?' condition is true
18420 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
3: Assuming the condition is true
4: '?' condition is true
18421
18422 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
5: Taking false branch
18423 Zeroable, Subtarget, DAG))
18424 return Blend;
18425
18426 // Check for being able to broadcast a single element.
18427 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
6: Taking false branch
18428 Subtarget, DAG))
18429 return Broadcast;
18430
18431 if (!Subtarget.hasAVX2()) {
7: Taking true branch
18432 SmallVector<int> InLaneMask;
18433 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18434
18435 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
8: Assuming the condition is false
9: Taking false branch
18436 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18437 /*SimpleOnly*/ true))
18438 return R;
18439 }
18440 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
10: Taking false branch
18441 Zeroable, Subtarget, DAG))
18442 return DAG.getBitcast(MVT::v8f32, ZExt);
18443
18444 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18445 // options to efficiently lower the shuffle.
18446 SmallVector<int, 4> RepeatedMask;
18447 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11: Assuming the condition is false
12: Taking false branch
18448 assert(RepeatedMask.size() == 4 &&
18449        "Repeated masks must be half the mask width!");
18450
18451 // Use even/odd duplicate instructions for masks that match their pattern.
18452 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18453 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18454 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18455 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
18456
18457 if (V2.isUndef())
18458 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18459 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18460
18461 // Use dedicated unpack instructions for masks that match their pattern.
18462 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18463 return V;
18464
18465 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18466 // have already handled any direct blends.
18467 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18468 }
18469
18470 // Try to create an in-lane repeating shuffle mask and then shuffle the
18471 // results into the target lanes.
18472 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13: Taking false branch
18473 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18474 return V;
18475
18476 // If we have a single input shuffle with different shuffle patterns in the
18477 // two 128-bit lanes use the variable mask to VPERMILPS.
18478 if (V2.isUndef()) {
14: Taking false branch
18479 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18480 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18481 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18482 }
18483 if (Subtarget.hasAVX2()) {
18484 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18485 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18486 }
18487 // Otherwise, fall back.
18488 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18489 DAG, Subtarget);
18490 }
18491
18492 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18493 // shuffle.
18494 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15: Taking false branch
18495 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18496 return Result;
18497
18498 // If we have VLX support, we can use VEXPAND.
18499 if (Subtarget.hasVLX())
16: Assuming the condition is false
18500 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18501 DAG, Subtarget))
18502 return V;
18503
18504 // Try to match an interleave of two v8f32s and lower them as unpck and
18505 // permutes using ymms. This needs to go before we try to split the vectors.
18506 //
18507 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18508 // this path inadvertently.
18509 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18510 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18511 Mask, DAG))
18512 return V;
18513
18514 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18515 // since after split we get a more efficient code using vpunpcklwd and
18516 // vpunpckhwd instrs than vblend.
18517 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
17: Assuming the condition is true
18: Taking true branch
18518 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
19: Calling 'lowerShuffleAsSplitOrBlend'
18519 DAG);
18520
18521 // If we have AVX2 then we always want to lower with a blend because at v8 we
18522 // can fully permute the elements.
18523 if (Subtarget.hasAVX2())
18524 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18525 Subtarget, DAG);
18526
18527 // Otherwise fall back on generic lowering.
18528 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18529 Subtarget, DAG);
18530}
18531
18532/// Handle lowering of 8-lane 32-bit integer shuffles.
18533///
18534/// This routine is only called when we have AVX2 and thus a reasonable
18535/// instruction set for v8i32 shuffling.
18536static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18537 const APInt &Zeroable, SDValue V1, SDValue V2,
18538 const X86Subtarget &Subtarget,
18539 SelectionDAG &DAG) {
18540 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18541 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18542 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18543 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18544
18545 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18546
18547 // Whenever we can lower this as a zext, that instruction is strictly faster
18548 // than any alternative. It also allows us to fold memory operands into the
18549 // shuffle in many cases.
18550 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18551 Zeroable, Subtarget, DAG))
18552 return ZExt;
18553
18554 // Try to match an interleave of two v8i32s and lower them as unpck and
18555 // permutes using ymms. This needs to go before we try to split the vectors.
18556 if (!Subtarget.hasAVX512())
18557 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18558 Mask, DAG))
18559 return V;
18560
18561 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18562 // since after split we get a more efficient code than vblend by using
18563 // vpunpcklwd and vpunpckhwd instrs.
18564 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18565 !Subtarget.hasAVX512())
18566 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18567 DAG);
18568
18569 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18570 Zeroable, Subtarget, DAG))
18571 return Blend;
18572
18573 // Check for being able to broadcast a single element.
18574 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18575 Subtarget, DAG))
18576 return Broadcast;
18577
18578 // Try to use shift instructions if fast.
18579 if (Subtarget.preferLowerShuffleAsShift()) {
18580 if (SDValue Shift =
18581 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18582 Subtarget, DAG, /*BitwiseOnly*/ true))
18583 return Shift;
18584 if (NumV2Elements == 0)
18585 if (SDValue Rotate =
18586 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18587 return Rotate;
18588 }
18589
18590 // If the shuffle mask is repeated in each 128-bit lane we can use more
18591 // efficient instructions that mirror the shuffles across the two 128-bit
18592 // lanes.
18593 SmallVector<int, 4> RepeatedMask;
18594 bool Is128BitLaneRepeatedShuffle =
18595 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18596 if (Is128BitLaneRepeatedShuffle) {
18597 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18598 if (V2.isUndef())
18599 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18600 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18601
18602 // Use dedicated unpack instructions for masks that match their pattern.
18603 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18604 return V;
18605 }
18606
18607 // Try to use shift instructions.
18608 if (SDValue Shift =
18609 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18610 DAG, /*BitwiseOnly*/ false))
18611 return Shift;
18612
18613 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18614 if (SDValue Rotate =
18615 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18616 return Rotate;
18617
18618 // If we have VLX support, we can use VALIGN or EXPAND.
18619 if (Subtarget.hasVLX()) {
18620 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18621 Subtarget, DAG))
18622 return Rotate;
18623
18624 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18625 DAG, Subtarget))
18626 return V;
18627 }
18628
18629 // Try to use byte rotation instructions.
18630 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18631 Subtarget, DAG))
18632 return Rotate;
18633
18634 // Try to create an in-lane repeating shuffle mask and then shuffle the
18635 // results into the target lanes.
18636 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18637 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18638 return V;
18639
18640 if (V2.isUndef()) {
18641 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18642 // because that should be faster than the variable permute alternatives.
18643 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18644 return V;
18645
18646 // If the shuffle patterns aren't repeated but it's a single input, directly
18647 // generate a cross-lane VPERMD instruction.
18648 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18649 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18650 }
18651
18652 // Assume that a single SHUFPS is faster than an alternative sequence of
18653 // multiple instructions (even if the CPU has a domain penalty).
18654 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18655 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18656 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18657 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18658 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18659 CastV1, CastV2, DAG);
18660 return DAG.getBitcast(MVT::v8i32, ShufPS);
18661 }
18662
18663 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18664 // shuffle.
18665 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18666 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18667 return Result;
18668
18669 // Otherwise fall back on generic blend lowering.
18670 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18671 Subtarget, DAG);
18672}
18673
18674/// Handle lowering of 16-lane 16-bit integer shuffles.
18675///
18676/// This routine is only called when we have AVX2 and thus a reasonable
18677/// instruction set for v16i16 shuffling.
18678static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18679 const APInt &Zeroable, SDValue V1, SDValue V2,
18680 const X86Subtarget &Subtarget,
18681 SelectionDAG &DAG) {
18682 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18683 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18684 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18685 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18686
18687 // Whenever we can lower this as a zext, that instruction is strictly faster
18688 // than any alternative. It also allows us to fold memory operands into the
18689 // shuffle in many cases.
18690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18691 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18692 return ZExt;
18693
18694 // Check for being able to broadcast a single element.
18695 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18696 Subtarget, DAG))
18697 return Broadcast;
18698
18699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18700 Zeroable, Subtarget, DAG))
18701 return Blend;
18702
18703 // Use dedicated unpack instructions for masks that match their pattern.
18704 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18705 return V;
18706
18707 // Use dedicated pack instructions for masks that match their pattern.
18708 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18709 Subtarget))
18710 return V;
18711
18712 // Try to lower using a truncation.
18713 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18714 Subtarget, DAG))
18715 return V;
18716
18717 // Try to use shift instructions.
18718 if (SDValue Shift =
18719 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18720 Subtarget, DAG, /*BitwiseOnly*/ false))
18721 return Shift;
18722
18723 // Try to use byte rotation instructions.
18724 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18725 Subtarget, DAG))
18726 return Rotate;
18727
18728 // Try to create an in-lane repeating shuffle mask and then shuffle the
18729 // results into the target lanes.
18730 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18731 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18732 return V;
18733
18734 if (V2.isUndef()) {
18735 // Try to use bit rotation instructions.
18736 if (SDValue Rotate =
18737 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18738 return Rotate;
18739
18740 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18741 // because that should be faster than the variable permute alternatives.
18742 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18743 return V;
18744
18745 // There are no generalized cross-lane shuffle operations available on i16
18746 // element types.
18747 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18748 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18749 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18750 return V;
18751
18752 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18753 DAG, Subtarget);
18754 }
18755
18756 SmallVector<int, 8> RepeatedMask;
18757 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18758 // As this is a single-input shuffle, the repeated mask should be
18759 // a strictly valid v8i16 mask that we can pass through to the v8i16
18760 // lowering to handle even the v16 case.
18761 return lowerV8I16GeneralSingleInputShuffle(
18762 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18763 }
18764 }
18765
18766 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18767 Zeroable, Subtarget, DAG))
18768 return PSHUFB;
18769
18770 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18771 if (Subtarget.hasBWI())
18772 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18773
18774 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18775 // shuffle.
18776 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18777 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18778 return Result;
18779
18780 // Try to permute the lanes and then use a per-lane permute.
18781 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18782 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18783 return V;
18784
18785 // Try to match an interleave of two v16i16s and lower them as unpck and
18786 // permutes using ymms.
18787 if (!Subtarget.hasAVX512())
18788 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18789 Mask, DAG))
18790 return V;
18791
18792 // Otherwise fall back on generic lowering.
18793 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18794 Subtarget, DAG);
18795}
18796
18797/// Handle lowering of 32-lane 8-bit integer shuffles.
18798///
18799/// This routine is only called when we have AVX2 and thus a reasonable
18800/// instruction set for v32i8 shuffling.
18801static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18802 const APInt &Zeroable, SDValue V1, SDValue V2,
18803 const X86Subtarget &Subtarget,
18804 SelectionDAG &DAG) {
18805 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18806 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18807 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18808 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18809
18810 // Whenever we can lower this as a zext, that instruction is strictly faster
18811 // than any alternative. It also allows us to fold memory operands into the
18812 // shuffle in many cases.
18813 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18814 Zeroable, Subtarget, DAG))
18815 return ZExt;
18816
18817 // Check for being able to broadcast a single element.
18818 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18819 Subtarget, DAG))
18820 return Broadcast;
18821
18822 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18823 Zeroable, Subtarget, DAG))
18824 return Blend;
18825
18826 // Use dedicated unpack instructions for masks that match their pattern.
18827 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18828 return V;
18829
18830 // Use dedicated pack instructions for masks that match their pattern.
18831 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18832 Subtarget))
18833 return V;
18834
18835 // Try to lower using a truncation.
18836 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18837 Subtarget, DAG))
18838 return V;
18839
18840 // Try to use shift instructions.
18841 if (SDValue Shift =
18842 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18843 DAG, /*BitwiseOnly*/ false))
18844 return Shift;
18845
18846 // Try to use byte rotation instructions.
18847 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18848 Subtarget, DAG))
18849 return Rotate;
18850
18851 // Try to use bit rotation instructions.
18852 if (V2.isUndef())
18853 if (SDValue Rotate =
18854 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18855 return Rotate;
18856
18857 // Try to create an in-lane repeating shuffle mask and then shuffle the
18858 // results into the target lanes.
18859 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18860 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18861 return V;
18862
18863 // There are no generalized cross-lane shuffle operations available on i8
18864 // element types.
18865 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18866 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18867 // because that should be faster than the variable permute alternatives.
18868 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18869 return V;
18870
18871 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18872 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18873 return V;
18874
18875 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18876 DAG, Subtarget);
18877 }
18878
18879 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18880 Zeroable, Subtarget, DAG))
18881 return PSHUFB;
18882
18883 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18884 if (Subtarget.hasVBMI())
18885 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18886
18887 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18888 // shuffle.
18889 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18890 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18891 return Result;
18892
18893 // Try to permute the lanes and then use a per-lane permute.
18894 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18895 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18896 return V;
18897
18898 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
18899 // by zeroable elements in the remaining 24 elements. Turn this into two
18900 // vmovqb instructions shuffled together.
18901 if (Subtarget.hasVLX())
18902 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18903 Mask, Zeroable, DAG))
18904 return V;
18905
18906 // Try to match an interleave of two v32i8s and lower them as unpck and
18907 // permutes using ymms.
18908 if (!Subtarget.hasAVX512())
18909 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18910 Mask, DAG))
18911 return V;
18912
18913 // Otherwise fall back on generic lowering.
18914 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18915 Subtarget, DAG);
18916}
18917
18918/// High-level routine to lower various 256-bit x86 vector shuffles.
18919///
18920/// This routine either breaks down the specific type of a 256-bit x86 vector
18921/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18922/// together based on the available instructions.
18923static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18924 SDValue V1, SDValue V2, const APInt &Zeroable,
18925 const X86Subtarget &Subtarget,
18926 SelectionDAG &DAG) {
18927 // If we have a single input to the zero element, insert that into V1 if we
18928 // can do so cheaply.
18929 int NumElts = VT.getVectorNumElements();
18930 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18931
18932 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18933 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18934 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18935 return Insertion;
18936
18937 // Handle special cases where the lower or upper half is UNDEF.
18938 if (SDValue V =
18939 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18940 return V;
18941
18942 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18943 // can check for those subtargets here and avoid much of the subtarget
18944 // querying in the per-vector-type lowering routines. With AVX1 we have
18945 // essentially *zero* ability to manipulate a 256-bit vector with integer
18946 // types. Since we'll use floating point types there eventually, just
18947 // immediately cast everything to a float and operate entirely in that domain.
18948 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18949 int ElementBits = VT.getScalarSizeInBits();
18950 if (ElementBits < 32) {
18951 // No floating point type available, if we can't use the bit operations
18952 // for masking/blending then decompose into 128-bit vectors.
18953 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18954 Subtarget, DAG))
18955 return V;
18956 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18957 return V;
18958 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18959 }
18960
18961 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18962 VT.getVectorNumElements());
18963 V1 = DAG.getBitcast(FpVT, V1);
18964 V2 = DAG.getBitcast(FpVT, V2);
18965 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18966 }
18967
18968 if (VT == MVT::v16f16) {
18969 V1 = DAG.getBitcast(MVT::v16i16, V1);
18970 V2 = DAG.getBitcast(MVT::v16i16, V2);
18971 return DAG.getBitcast(MVT::v16f16,
18972 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18973 }
18974
18975 switch (VT.SimpleTy) {
18976 case MVT::v4f64:
18977 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18978 case MVT::v4i64:
18979 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18980 case MVT::v8f32:
18981 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18982 case MVT::v8i32:
18983 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18984 case MVT::v16i16:
18985 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18986 case MVT::v32i8:
18987 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18988
18989 default:
18990 llvm_unreachable("Not a valid 256-bit x86 vector type!");
18991 }
18992}
18993
18994/// Try to lower a vector shuffle as a 128-bit shuffles.
18995static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18996 const APInt &Zeroable, SDValue V1, SDValue V2,
18997 const X86Subtarget &Subtarget,
18998 SelectionDAG &DAG) {
18999 assert(VT.getScalarSizeInBits() == 64 &&
19000        "Unexpected element type size for 128bit shuffle.");
19001
19002 // Handling a 256-bit vector here would require VLX, and most probably
19003 // lowerV2X128VectorShuffle() is the better solution for that case.
19004 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19005
19006 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19007 SmallVector<int, 4> Widened128Mask;
19008 if (!canWidenShuffleElements(Mask, Widened128Mask))
19009 return SDValue();
19010 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19011
19012 // Try to use an insert into a zero vector.
19013 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19014 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19015 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19016 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19017 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19018 DAG.getIntPtrConstant(0, DL));
19019 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19020 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19021 DAG.getIntPtrConstant(0, DL));
19022 }
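  // (Illustrative note, not in the original source: for a v8i64 shuffle, a mask
  //  like <0, 1, Z, Z, Z, Z, Z, Z> takes this path with NumElts == 2 and keeps
  //  only the low 128 bits of V1, while <0, 1, 2, 3, Z, Z, Z, Z> uses
  //  NumElts == 4 and keeps the low 256 bits; everything above that is zeroed.)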
19023
19024 // Check for patterns which can be matched with a single insert of a 256-bit
19025 // subvector.
19026 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19027 if (OnlyUsesV1 ||
19028 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19029 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19030 SDValue SubVec =
19031 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19032 DAG.getIntPtrConstant(0, DL));
19033 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19034 DAG.getIntPtrConstant(4, DL));
19035 }
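  // (Illustrative note, not in the original source: <0, 1, 2, 3, 8, 9, 10, 11>
  //  concatenates the low 256-bit halves of V1 and V2, so inserting V2's low
  //  half at element 4 of V1 is sufficient; <0, 1, 2, 3, 0, 1, 2, 3> instead
  //  duplicates V1's low half into both halves.)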
19036
19037 // See if this is an insertion of the lower 128-bits of V2 into V1.
19038 bool IsInsert = true;
19039 int V2Index = -1;
19040 for (int i = 0; i < 4; ++i) {
19041 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19042 if (Widened128Mask[i] < 0)
19043 continue;
19044
19045 // Make sure all V1 subvectors are in place.
19046 if (Widened128Mask[i] < 4) {
19047 if (Widened128Mask[i] != i) {
19048 IsInsert = false;
19049 break;
19050 }
19051 } else {
19052 // Make sure we only have a single V2 index and its the lowest 128-bits.
19053 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19054 IsInsert = false;
19055 break;
19056 }
19057 V2Index = i;
19058 }
19059 }
19060 if (IsInsert && V2Index >= 0) {
19061 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19062 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19063 DAG.getIntPtrConstant(0, DL));
19064 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19065 }
19066
19067 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
19068 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
19069 // possible we at least ensure the lanes stay sequential to help later
19070 // combines.
19071 SmallVector<int, 2> Widened256Mask;
19072 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19073 Widened128Mask.clear();
19074 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19075 }
19076
19077 // Try to lower to vshuf64x2/vshuf32x4.
19078 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19079 unsigned PermMask = 0;
19080 // Ensure elements came from the same Op.
19081 for (int i = 0; i < 4; ++i) {
19082 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19083 if (Widened128Mask[i] < 0)
19084 continue;
19085
19086 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19087 unsigned OpIndex = i / 2;
19088 if (Ops[OpIndex].isUndef())
19089 Ops[OpIndex] = Op;
19090 else if (Ops[OpIndex] != Op)
19091 return SDValue();
19092
19093 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19094 // bits defined by a vshuf64x2 instruction's immediate control byte.
19095 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19096 }
19097
19098 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19099 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19100}
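Editor's note: the PermMask loop at lines 19079-19095 packs each selected 128-bit lane into a two-bit field of the vshuf64x2/vshuf32x4 control byte. The standalone sketch below is an editorial illustration only (buildShuf128Imm is a hypothetical helper, not part of LLVM) that reproduces the same computation for a concrete widened mask; undef lanes are skipped before the modulo, just as in the code above.

#include <array>
#include <cassert>
#include <cstdio>

// Hypothetical standalone re-creation of the PermMask construction above:
// 128-bit lane i contributes (Widened128Mask[i] % 4) at bit position i * 2,
// and undef lanes (-1) leave their two-bit field as zero.
static unsigned buildShuf128Imm(const std::array<int, 4> &Widened128Mask) {
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    if (Widened128Mask[i] < 0)
      continue; // undef lane
    assert(Widened128Mask[i] < 8 && "lane index out of range for two sources");
    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  }
  return PermMask;
}

int main() {
  // Low two lanes from V1 (indices 0,1), high two lanes from V2 (indices 4,5).
  std::array<int, 4> Mask = {0, 1, 4, 5};
  std::printf("imm = 0x%02x\n", buildShuf128Imm(Mask)); // prints imm = 0x44
  return 0;
}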
19101
19102/// Handle lowering of 8-lane 64-bit floating point shuffles.
19103static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19104 const APInt &Zeroable, SDValue V1, SDValue V2,
19105 const X86Subtarget &Subtarget,
19106 SelectionDAG &DAG) {
19107 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19108 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19109 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19110
19111 if (V2.isUndef()) {
19112 // Use low duplicate instructions for masks that match their pattern.
19113 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19114 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19115
19116 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19117 // Non-half-crossing single input shuffles can be lowered with an
19118 // interleaved permutation.
19119 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19120 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19121 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19122 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19123 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19124 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19125 }
19126
19127 SmallVector<int, 4> RepeatedMask;
19128 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19129 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19130 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19131 }
19132
19133 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19134 V2, Subtarget, DAG))
19135 return Shuf128;
19136
19137 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19138 return Unpck;
19139
19140 // Check if the blend happens to exactly fit that of SHUFPD.
19141 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19142 Zeroable, Subtarget, DAG))
19143 return Op;
19144
19145 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19146 DAG, Subtarget))
19147 return V;
19148
19149 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19150 Zeroable, Subtarget, DAG))
19151 return Blend;
19152
19153 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19154}
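Editor's note: the VPERMILPD immediate built at lines 19119-19122 sets bit i exactly when Mask[i] selects the odd element of its 128-bit pair, i.e. index (i | 1). The sketch below is an editorial illustration (buildVPermilPDImm is a hypothetical name, and undef handling is simplified), not LLVM code.

#include <array>
#include <cstdio>

// For a non-lane-crossing single-input v8f64 shuffle, bit i of the VPERMILPD
// immediate is set when element i picks the odd member of its 64-bit pair.
static unsigned buildVPermilPDImm(const std::array<int, 8> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i)
    Imm |= (Mask[i] == (i | 1) ? 1u : 0u) << i;
  return Imm;
}

int main() {
  // Swap the two elements of every 128-bit pair: {1, 0, 3, 2, 5, 4, 7, 6}.
  std::array<int, 8> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  std::printf("imm = 0x%02x\n", buildVPermilPDImm(Mask)); // prints imm = 0x55
  return 0;
}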
19155
19156/// Handle lowering of 16-lane 32-bit floating point shuffles.
19157static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19158 const APInt &Zeroable, SDValue V1, SDValue V2,
19159 const X86Subtarget &Subtarget,
19160 SelectionDAG &DAG) {
19161 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19162 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19163 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19164
19165 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19166 // options to efficiently lower the shuffle.
19167 SmallVector<int, 4> RepeatedMask;
19168 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19169 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19170
19171 // Use even/odd duplicate instructions for masks that match their pattern.
19172 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19173 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19174 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19175 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19176
19177 if (V2.isUndef())
19178 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19179 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19180
19181 // Use dedicated unpack instructions for masks that match their pattern.
19182 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19183 return V;
19184
19185 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19186 Zeroable, Subtarget, DAG))
19187 return Blend;
19188
19189 // Otherwise, fall back to a SHUFPS sequence.
19190 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19191 }
19192
19193 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19194 Zeroable, Subtarget, DAG))
19195 return Blend;
19196
19197 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19198 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19199 return DAG.getBitcast(MVT::v16f32, ZExt);
19200
19201 // Try to create an in-lane repeating shuffle mask and then shuffle the
19202 // results into the target lanes.
19203 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19204 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19205 return V;
19206
19207 // If we have a single input shuffle with different shuffle patterns in the
19208 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19209 if (V2.isUndef() &&
19210 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19211 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19212 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19213 }
19214
19215 // If we have AVX512F support, we can use VEXPAND.
19216 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19217 V1, V2, DAG, Subtarget))
19218 return V;
19219
19220 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19221}
19222
19223/// Handle lowering of 8-lane 64-bit integer shuffles.
19224static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19225 const APInt &Zeroable, SDValue V1, SDValue V2,
19226 const X86Subtarget &Subtarget,
19227 SelectionDAG &DAG) {
19228 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19229 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19230 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19231
19232 // Try to use shift instructions if fast.
19233 if (Subtarget.preferLowerShuffleAsShift())
19234 if (SDValue Shift =
19235 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19236 Subtarget, DAG, /*BitwiseOnly*/ true))
19237 return Shift;
19238
19239 if (V2.isUndef()) {
19240 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19241 // can use lower latency instructions that will operate on all four
19242 // 128-bit lanes.
19243 SmallVector<int, 2> Repeated128Mask;
19244 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19245 SmallVector<int, 4> PSHUFDMask;
19246 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19247 return DAG.getBitcast(
19248 MVT::v8i64,
19249 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19250 DAG.getBitcast(MVT::v16i32, V1),
19251 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19252 }
19253
19254 SmallVector<int, 4> Repeated256Mask;
19255 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19256 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19257 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19258 }
19259
19260 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19261 V2, Subtarget, DAG))
19262 return Shuf128;
19263
19264 // Try to use shift instructions.
19265 if (SDValue Shift =
19266 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19267 DAG, /*BitwiseOnly*/ false))
19268 return Shift;
19269
19270 // Try to use VALIGN.
19271 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19272 Subtarget, DAG))
19273 return Rotate;
19274
19275 // Try to use PALIGNR.
19276 if (Subtarget.hasBWI())
19277 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19278 Subtarget, DAG))
19279 return Rotate;
19280
19281 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19282 return Unpck;
19283
19284 // If we have AVX512F support, we can use VEXPAND.
19285 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19286 DAG, Subtarget))
19287 return V;
19288
19289 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19290 Zeroable, Subtarget, DAG))
19291 return Blend;
19292
19293 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19294}
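Editor's note: the PSHUFD path at lines 19244-19251 relies on narrowShuffleMaskElts(2, ...) to rewrite a 64-bit-element mask as a 32-bit-element mask. The helper's body is not shown in this listing, so the sketch below only encodes the editor's reading of the call site: each wide index j expands to the pair {2j, 2j+1}, and undef stays undef.

#include <cstdio>
#include <vector>

// Assumed behaviour of narrowShuffleMaskElts with a scale of 2 (editorial
// sketch, not the LLVM implementation).
static std::vector<int> narrowMaskBy2(const std::vector<int> &WideMask) {
  std::vector<int> Narrow;
  for (int M : WideMask) {
    if (M < 0) {
      Narrow.push_back(-1); // undef expands to undef
      Narrow.push_back(-1);
    } else {
      Narrow.push_back(2 * M);
      Narrow.push_back(2 * M + 1);
    }
  }
  return Narrow;
}

int main() {
  // A per-128-bit-lane v2i64 swap {1, 0} becomes the v4i32 mask {2, 3, 0, 1},
  // which getV4X86ShuffleImm8ForMask can then encode as a PSHUFD immediate.
  std::vector<int> Repeated128Mask = {1, 0};
  for (int M : narrowMaskBy2(Repeated128Mask))
    std::printf("%d ", M); // prints: 2 3 0 1
  std::printf("\n");
  return 0;
}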
19295
19296/// Handle lowering of 16-lane 32-bit integer shuffles.
19297static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19298 const APInt &Zeroable, SDValue V1, SDValue V2,
19299 const X86Subtarget &Subtarget,
19300 SelectionDAG &DAG) {
19301 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19302 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19303 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19304
19305 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19306
19307 // Whenever we can lower this as a zext, that instruction is strictly faster
19308 // than any alternative. It also allows us to fold memory operands into the
19309 // shuffle in many cases.
19310 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19311 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19312 return ZExt;
19313
19314 // Try to use shift instructions if fast.
19315 if (Subtarget.preferLowerShuffleAsShift()) {
19316 if (SDValue Shift =
19317 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19318 Subtarget, DAG, /*BitwiseOnly*/ true))
19319 return Shift;
19320 if (NumV2Elements == 0)
19321 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19322 Subtarget, DAG))
19323 return Rotate;
19324 }
19325
19326 // If the shuffle mask is repeated in each 128-bit lane we can use more
19327 // efficient instructions that mirror the shuffles across the four 128-bit
19328 // lanes.
19329 SmallVector<int, 4> RepeatedMask;
19330 bool Is128BitLaneRepeatedShuffle =
19331 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19332 if (Is128BitLaneRepeatedShuffle) {
19333 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19334 if (V2.isUndef())
19335 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19336 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19337
19338 // Use dedicated unpack instructions for masks that match their pattern.
19339 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19340 return V;
19341 }
19342
19343 // Try to use shift instructions.
19344 if (SDValue Shift =
19345 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19346 Subtarget, DAG, /*BitwiseOnly*/ false))
19347 return Shift;
19348
19349 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19350 if (SDValue Rotate =
19351 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19352 return Rotate;
19353
19354 // Try to use VALIGN.
19355 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19356 Subtarget, DAG))
19357 return Rotate;
19358
19359 // Try to use byte rotation instructions.
19360 if (Subtarget.hasBWI())
19361 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19362 Subtarget, DAG))
19363 return Rotate;
19364
19365 // Assume that a single SHUFPS is faster than using a permv shuffle.
19366 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19367 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19368 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19369 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19370 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19371 CastV1, CastV2, DAG);
19372 return DAG.getBitcast(MVT::v16i32, ShufPS);
19373 }
19374
19375 // Try to create an in-lane repeating shuffle mask and then shuffle the
19376 // results into the target lanes.
19377 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19378 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19379 return V;
19380
19381 // If we have AVX512F support, we can use VEXPAND.
19382 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19383 DAG, Subtarget))
19384 return V;
19385
19386 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19387 Zeroable, Subtarget, DAG))
19388 return Blend;
19389
19390 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19391}
19392
19393/// Handle lowering of 32-lane 16-bit integer shuffles.
19394static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19395 const APInt &Zeroable, SDValue V1, SDValue V2,
19396 const X86Subtarget &Subtarget,
19397 SelectionDAG &DAG) {
19398 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19399 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19400 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19401 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19402
19403 // Whenever we can lower this as a zext, that instruction is strictly faster
19404 // than any alternative. It also allows us to fold memory operands into the
19405 // shuffle in many cases.
19406 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19407 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19408 return ZExt;
19409
19410 // Use dedicated unpack instructions for masks that match their pattern.
19411 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19412 return V;
19413
19414 // Use dedicated pack instructions for masks that match their pattern.
19415 if (SDValue V =
19416 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19417 return V;
19418
19419 // Try to use shift instructions.
19420 if (SDValue Shift =
19421 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19422 Subtarget, DAG, /*BitwiseOnly*/ false))
19423 return Shift;
19424
19425 // Try to use byte rotation instructions.
19426 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19427 Subtarget, DAG))
19428 return Rotate;
19429
19430 if (V2.isUndef()) {
19431 // Try to use bit rotation instructions.
19432 if (SDValue Rotate =
19433 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19434 return Rotate;
19435
19436 SmallVector<int, 8> RepeatedMask;
19437 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19438 // As this is a single-input shuffle, the repeated mask should be
19439 // a strictly valid v8i16 mask that we can pass through to the v8i16
19440 // lowering to handle even the v32 case.
19441 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19442 RepeatedMask, Subtarget, DAG);
19443 }
19444 }
19445
19446 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19447 Zeroable, Subtarget, DAG))
19448 return Blend;
19449
19450 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19451 Zeroable, Subtarget, DAG))
19452 return PSHUFB;
19453
19454 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19455}
19456
19457/// Handle lowering of 64-lane 8-bit integer shuffles.
19458static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19459 const APInt &Zeroable, SDValue V1, SDValue V2,
19460 const X86Subtarget &Subtarget,
19461 SelectionDAG &DAG) {
19462 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19463 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19464 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19465 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19466
19467 // Whenever we can lower this as a zext, that instruction is strictly faster
19468 // than any alternative. It also allows us to fold memory operands into the
19469 // shuffle in many cases.
19470 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19471 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19472 return ZExt;
19473
19474 // Use dedicated unpack instructions for masks that match their pattern.
19475 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19476 return V;
19477
19478 // Use dedicated pack instructions for masks that match their pattern.
19479 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19480 Subtarget))
19481 return V;
19482
19483 // Try to use shift instructions.
19484 if (SDValue Shift =
19485 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19486 DAG, /*BitwiseOnly*/ false))
19487 return Shift;
19488
19489 // Try to use byte rotation instructions.
19490 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19491 Subtarget, DAG))
19492 return Rotate;
19493
19494 // Try to use bit rotation instructions.
19495 if (V2.isUndef())
19496 if (SDValue Rotate =
19497 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19498 return Rotate;
19499
19500 // Lower as AND if possible.
19501 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19502 Zeroable, Subtarget, DAG))
19503 return Masked;
19504
19505 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19506 Zeroable, Subtarget, DAG))
19507 return PSHUFB;
19508
19509 // Try to create an in-lane repeating shuffle mask and then shuffle the
19510 // results into the target lanes.
19511 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19512 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19513 return V;
19514
19515 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19516 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19517 return Result;
19518
19519 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19520 Zeroable, Subtarget, DAG))
19521 return Blend;
19522
19523 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19524 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19525 // PALIGNR will be cheaper than the second PSHUFB+OR.
19526 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19527 Mask, Subtarget, DAG))
19528 return V;
19529
19530 // If we can't directly blend but can use PSHUFB, that will be better as it
19531 // can both shuffle and set up the inefficient blend.
19532 bool V1InUse, V2InUse;
19533 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19534 DAG, V1InUse, V2InUse);
19535 }
19536
19537 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19538 // shuffle.
19539 if (!V2.isUndef())
19540 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19541 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19542 return Result;
19543
19544 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19545 if (Subtarget.hasVBMI())
19546 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19547
19548 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19549}
19550
19551/// High-level routine to lower various 512-bit x86 vector shuffles.
19552///
19553/// This routine either breaks down the specific type of a 512-bit x86 vector
19554/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19555/// together based on the available instructions.
19556static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19557 MVT VT, SDValue V1, SDValue V2,
19558 const APInt &Zeroable,
19559 const X86Subtarget &Subtarget,
19560 SelectionDAG &DAG) {
19561 assert(Subtarget.hasAVX512() &&
19562 "Cannot lower 512-bit vectors w/ basic ISA!");
19563
19564 // If we have a single input to the zero element, insert that into V1 if we
19565 // can do so cheaply.
19566 int NumElts = Mask.size();
19567 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19568
19569 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19570 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19571 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19572 return Insertion;
19573
19574 // Handle special cases where the lower or upper half is UNDEF.
19575 if (SDValue V =
19576 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19577 return V;
19578
19579 // Check for being able to broadcast a single element.
19580 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19581 Subtarget, DAG))
19582 return Broadcast;
19583
19584 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19585 // Try using bit ops for masking and blending before falling back to
19586 // splitting.
19587 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19588 Subtarget, DAG))
19589 return V;
19590 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19591 return V;
19592
19593 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19594 }
19595
19596 if (VT == MVT::v32f16) {
19597 V1 = DAG.getBitcast(MVT::v32i16, V1);
19598 V2 = DAG.getBitcast(MVT::v32i16, V2);
19599 return DAG.getBitcast(MVT::v32f16,
19600 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19601 }
19602
19603 // Dispatch to each element type for lowering. If we don't have support for
19604 // specific element type shuffles at 512 bits, immediately split them and
19605 // lower them. Each lowering routine of a given type is allowed to assume that
19606 // the requisite ISA extensions for that element type are available.
19607 switch (VT.SimpleTy) {
19608 case MVT::v8f64:
19609 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19610 case MVT::v16f32:
19611 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19612 case MVT::v8i64:
19613 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19614 case MVT::v16i32:
19615 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19616 case MVT::v32i16:
19617 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19618 case MVT::v64i8:
19619 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19620
19621 default:
19622 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19622)
;
19623 }
19624}
19625
19626static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19627 MVT VT, SDValue V1, SDValue V2,
19628 const X86Subtarget &Subtarget,
19629 SelectionDAG &DAG) {
19630 // Shuffle should be unary.
19631 if (!V2.isUndef())
19632 return SDValue();
19633
19634 int ShiftAmt = -1;
19635 int NumElts = Mask.size();
19636 for (int i = 0; i != NumElts; ++i) {
19637 int M = Mask[i];
19638 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19639 "Unexpected mask index.");
19640 if (M < 0)
19641 continue;
19642
19643 // The first non-undef element determines our shift amount.
19644 if (ShiftAmt < 0) {
19645 ShiftAmt = M - i;
19646 // Need to be shifting right.
19647 if (ShiftAmt <= 0)
19648 return SDValue();
19649 }
19650 // All non-undef elements must shift by the same amount.
19651 if (ShiftAmt != M - i)
19652 return SDValue();
19653 }
19654 assert(ShiftAmt >= 0 && "All undef?");
19655
19656 // Great, we found a shift right.
19657 MVT WideVT = VT;
19658 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19659 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19660 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19661 DAG.getUNDEF(WideVT), V1,
19662 DAG.getIntPtrConstant(0, DL));
19663 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19664 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19665 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19666 DAG.getIntPtrConstant(0, DL));
19667}
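Editor's note: the shift-amount detection loop at lines 19636-19653 accepts a unary mask only when every defined element reads from index i + ShiftAmt for a single positive ShiftAmt. The standalone copy below (matchUnaryShiftRight is a hypothetical name; the all-undef case is reported as -1 instead of asserting) shows which masks qualify.

#include <cstdio>
#include <vector>

// Every defined mask element must equal i + ShiftAmt for one shared,
// strictly positive ShiftAmt; undef elements impose no constraint.
static int matchUnaryShiftRight(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;
      if (ShiftAmt <= 0)
        return -1; // must be shifting right
    }
    if (ShiftAmt != M - i)
      return -1; // inconsistent shift amount
  }
  return ShiftAmt;
}

int main() {
  std::printf("%d\n", matchUnaryShiftRight({3, 4, 5, 6, 7, -1, -1, -1})); // 3
  std::printf("%d\n", matchUnaryShiftRight({3, 4, 6, -1, -1, -1, -1, -1})); // -1
  return 0;
}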
19668
19669// Determine if this shuffle can be implemented with a KSHIFT instruction.
19670// Returns the shift amount if possible or -1 if not. This is a simplified
19671// version of matchShuffleAsShift.
19672static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19673 int MaskOffset, const APInt &Zeroable) {
19674 int Size = Mask.size();
19675
19676 auto CheckZeros = [&](int Shift, bool Left) {
19677 for (int j = 0; j < Shift; ++j)
19678 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19679 return false;
19680
19681 return true;
19682 };
19683
19684 auto MatchShift = [&](int Shift, bool Left) {
19685 unsigned Pos = Left ? Shift : 0;
19686 unsigned Low = Left ? 0 : Shift;
19687 unsigned Len = Size - Shift;
19688 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19689 };
19690
19691 for (int Shift = 1; Shift != Size; ++Shift)
19692 for (bool Left : {true, false})
19693 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19694 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19695 return Shift;
19696 }
19697
19698 return -1;
19699}
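Editor's note: a standalone restatement of match1BitShuffleAsKSHIFT, with the APInt Zeroable modelled as a std::vector<bool> and isSequentialOrUndefInRange inlined. This is an editorial sketch of the logic shown above, not the LLVM code itself.

#include <cstdio>
#include <vector>

// Returns the shift amount and sets Left for a KSHIFTL match (false means
// KSHIFTR), or returns -1 when no single KSHIFT reproduces the mask.
static int matchKShift(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int MaskOffset,
                       bool &Left) {
  int Size = (int)Mask.size();
  auto IsSeqOrUndef = [&](int Pos, int Len, int Low) {
    for (int i = 0; i != Len; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + i)
        return false;
    return true;
  };
  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool L : {true, false}) {
      bool ZerosOk = true;
      for (int j = 0; j != Shift; ++j)
        ZerosOk &= (bool)Zeroable[j + (L ? 0 : Size - Shift)];
      int Pos = L ? Shift : 0, Low = L ? 0 : Shift;
      if (ZerosOk && IsSeqOrUndef(Pos, Size - Shift, Low + MaskOffset)) {
        Left = L;
        return Shift;
      }
    }
  return -1;
}

int main() {
  // Elements move toward the low end by two and the vacated top two lanes
  // are known zero, so this matches a KSHIFTR by 2.
  bool Left = false;
  std::vector<int> Mask = {2, 3, 4, 5, 6, 7, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, false,
                                false, false, true,  true};
  int Amt = matchKShift(Mask, Zeroable, /*MaskOffset=*/0, Left);
  std::printf("%s by %d\n", Left ? "KSHIFTL" : "KSHIFTR", Amt); // KSHIFTR by 2
  return 0;
}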
19700
19701
19702// Lower vXi1 vector shuffles.
19703 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19704 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19705 // vector, shuffle it, and then truncate it back.
19706static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19707 MVT VT, SDValue V1, SDValue V2,
19708 const APInt &Zeroable,
19709 const X86Subtarget &Subtarget,
19710 SelectionDAG &DAG) {
19711 assert(Subtarget.hasAVX512() &&
19712 "Cannot lower 512-bit vectors w/o basic ISA!");
19713
19714 int NumElts = Mask.size();
19715
19716 // Try to recognize shuffles that are just padding a subvector with zeros.
19717 int SubvecElts = 0;
19718 int Src = -1;
19719 for (int i = 0; i != NumElts; ++i) {
19720 if (Mask[i] >= 0) {
19721 // Grab the source from the first valid mask. All subsequent elements need
19722 // to use this same source.
19723 if (Src < 0)
19724 Src = Mask[i] / NumElts;
19725 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19726 break;
19727 }
19728
19729 ++SubvecElts;
19730 }
19731 assert(SubvecElts != NumElts && "Identity shuffle?");
19732
19733 // Clip to a power of 2.
19734 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19735
19736 // Make sure the number of zeroable bits in the top at least covers the bits
19737 // not covered by the subvector.
19738 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19739 assert(Src >= 0 && "Expected a source!");
19740 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19741 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19742 Src == 0 ? V1 : V2,
19743 DAG.getIntPtrConstant(0, DL));
19744 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19745 DAG.getConstant(0, DL, VT),
19746 Extract, DAG.getIntPtrConstant(0, DL));
19747 }
19748
19749 // Try a simple shift right with undef elements. Later we'll try with zeros.
19750 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19751 DAG))
19752 return Shift;
19753
19754 // Try to match KSHIFTs.
19755 unsigned Offset = 0;
19756 for (SDValue V : { V1, V2 }) {
19757 unsigned Opcode;
19758 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19759 if (ShiftAmt >= 0) {
19760 MVT WideVT = VT;
19761 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19762 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19763 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19764 DAG.getUNDEF(WideVT), V,
19765 DAG.getIntPtrConstant(0, DL));
19766 // Widened right shifts need two shifts to ensure we shift in zeroes.
19767 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19768 int WideElts = WideVT.getVectorNumElements();
19769 // Shift left to put the original vector in the MSBs of the new size.
19770 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19771 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19772 // Increase the shift amount to account for the left shift.
19773 ShiftAmt += WideElts - NumElts;
19774 }
19775
19776 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19777 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19778 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19779 DAG.getIntPtrConstant(0, DL));
19780 }
19781 Offset += NumElts; // Increment for next iteration.
19782 }
19783
19784 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19785 // TODO: What other unary shuffles would benefit from this?
19786 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19787 V1->hasOneUse()) {
19788 SDValue Op0 = V1.getOperand(0);
19789 SDValue Op1 = V1.getOperand(1);
19790 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19791 EVT OpVT = Op0.getValueType();
19792 return DAG.getSetCC(
19793 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19794 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19795 }
19796
19797 MVT ExtVT;
19798 switch (VT.SimpleTy) {
19799 default:
19800 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19800)
;
19801 case MVT::v2i1:
19802 ExtVT = MVT::v2i64;
19803 break;
19804 case MVT::v4i1:
19805 ExtVT = MVT::v4i32;
19806 break;
19807 case MVT::v8i1:
19808 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
19809 // shuffle.
19810 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19811 break;
19812 case MVT::v16i1:
19813 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19814 // 256-bit operation available.
19815 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19816 break;
19817 case MVT::v32i1:
19818 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19819 // 256-bit operation available.
19820 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19821 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19822 break;
19823 case MVT::v64i1:
19824 // Fall back to scalarization. FIXME: We can do better if the shuffle
19825 // can be partitioned cleanly.
19826 if (!Subtarget.useBWIRegs())
19827 return SDValue();
19828 ExtVT = MVT::v64i8;
19829 break;
19830 }
19831
19832 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19833 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19834
19835 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19836 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19837 int NumElems = VT.getVectorNumElements();
19838 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19839 (Subtarget.hasDQI() && (NumElems < 32)))
19840 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19841 Shuffle, ISD::SETGT);
19842
19843 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19844}
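Editor's note: the widened KSHIFTR sequence at lines 19766-19777 can be checked with plain integer arithmetic, since a kN mask register behaves like an N-bit integer with element i in bit i. The snippet below works the example of a v8i1 value widened to v16i1 and shifted right by 2: shift left by WideElts - NumElts = 8 first, then right by 8 + 2, so zeroes are shifted into the top elements.

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t V = 0xb5;               // original 8-element mask, element i = bit i
  int ShiftAmt = 2;
  uint16_t Wide = (uint16_t)(V << 8);                // KSHIFTL by WideElts - NumElts
  uint16_t Res = (uint16_t)(Wide >> (8 + ShiftAmt)); // KSHIFTR by the adjusted amount
  std::printf("0x%02x\n", (unsigned)Res);            // prints 0x2d, i.e. 0xb5 >> 2
  return 0;
}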
19845
19846/// Helper function that returns true if the shuffle mask should be
19847/// commuted to improve canonicalization.
19848static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19849 int NumElements = Mask.size();
19850
19851 int NumV1Elements = 0, NumV2Elements = 0;
19852 for (int M : Mask)
19853 if (M < 0)
19854 continue;
19855 else if (M < NumElements)
19856 ++NumV1Elements;
19857 else
19858 ++NumV2Elements;
19859
19860 // Commute the shuffle as needed such that more elements come from V1 than
19861 // V2. This allows us to match the shuffle pattern strictly on how many
19862 // elements come from V1 without handling the symmetric cases.
19863 if (NumV2Elements > NumV1Elements)
19864 return true;
19865
19866 assert(NumV1Elements > 0 && "No V1 indices");
19867
19868 if (NumV2Elements == 0)
19869 return false;
19870
19871 // When the number of V1 and V2 elements are the same, try to minimize the
19872 // number of uses of V2 in the low half of the vector. When that is tied,
19873 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19874 // indices for V2. When those are equal, try to ensure that the number of odd
19875 // indices for V1 is lower than the number of odd indices for V2.
19876 if (NumV1Elements == NumV2Elements) {
19877 int LowV1Elements = 0, LowV2Elements = 0;
19878 for (int M : Mask.slice(0, NumElements / 2))
19879 if (M >= NumElements)
19880 ++LowV2Elements;
19881 else if (M >= 0)
19882 ++LowV1Elements;
19883 if (LowV2Elements > LowV1Elements)
19884 return true;
19885 if (LowV2Elements == LowV1Elements) {
19886 int SumV1Indices = 0, SumV2Indices = 0;
19887 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19888 if (Mask[i] >= NumElements)
19889 SumV2Indices += i;
19890 else if (Mask[i] >= 0)
19891 SumV1Indices += i;
19892 if (SumV2Indices < SumV1Indices)
19893 return true;
19894 if (SumV2Indices == SumV1Indices) {
19895 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19896 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19897 if (Mask[i] >= NumElements)
19898 NumV2OddIndices += i % 2;
19899 else if (Mask[i] >= 0)
19900 NumV1OddIndices += i % 2;
19901 if (NumV2OddIndices < NumV1OddIndices)
19902 return true;
19903 }
19904 }
19905 }
19906
19907 return false;
19908}
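Editor's note: a compact, slightly restructured restatement of the commute heuristic above (shouldCommute is a hypothetical name), together with one worked example: prefer more elements from V1, then fewer V2 elements in the low half, then a smaller index sum for V2, then fewer odd positions used by V2.

#include <cstdio>
#include <vector>

static bool shouldCommute(const std::vector<int> &Mask) {
  int N = (int)Mask.size();
  int NumV1 = 0, NumV2 = 0, LowV1 = 0, LowV2 = 0;
  int SumV1 = 0, SumV2 = 0, OddV1 = 0, OddV2 = 0;
  for (int i = 0; i != N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef
    bool IsV2 = M >= N;
    (IsV2 ? NumV2 : NumV1)++;
    if (i < N / 2)
      (IsV2 ? LowV2 : LowV1)++;
    (IsV2 ? SumV2 : SumV1) += i;
    (IsV2 ? OddV2 : OddV1) += i % 2;
  }
  if (NumV2 != NumV1)
    return NumV2 > NumV1;
  if (LowV2 != LowV1)
    return LowV2 > LowV1;
  if (SumV2 != SumV1)
    return SumV2 < SumV1;
  return OddV2 < OddV1;
}

int main() {
  // {4, 1, 6, 3} on 4 elements: two indices from each source and a tied low
  // half, but V2's index sum (0 + 2) is below V1's (1 + 3), so commute.
  std::printf("%d\n", shouldCommute({4, 1, 6, 3})); // prints 1
  return 0;
}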
19909
19910static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19911 const X86Subtarget &Subtarget) {
19912 if (!Subtarget.hasAVX512())
19913 return false;
19914
19915 MVT VT = V1.getSimpleValueType().getScalarType();
19916 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19917 return false;
19918
19919 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
19920 // are preferable to blendw/blendvb/masked-mov.
19921 if ((VT == MVT::i16 || VT == MVT::i8) &&
19922 V1.getSimpleValueType().getSizeInBits() < 512)
19923 return false;
19924
19925 auto HasMaskOperation = [&](SDValue V) {
19926 // TODO: Currently we only check a limited set of opcodes. We could probably
19927 // extend it to all binary operations by checking TLI.isBinOp().
19928 switch (V->getOpcode()) {
19929 default:
19930 return false;
19931 case ISD::ADD:
19932 case ISD::SUB:
19933 case ISD::AND:
19934 case ISD::XOR:
19935 case ISD::OR:
19936 case ISD::SMAX:
19937 case ISD::SMIN:
19938 case ISD::UMAX:
19939 case ISD::UMIN:
19940 case ISD::ABS:
19941 case ISD::SHL:
19942 case ISD::SRL:
19943 case ISD::SRA:
19944 case ISD::MUL:
19945 break;
19946 }
19947 if (!V->hasOneUse())
19948 return false;
19949
19950 return true;
19951 };
19952
19953 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19954 return true;
19955
19956 return false;
19957}
19958
19959// Forward declaration.
19960static SDValue canonicalizeShuffleMaskWithHorizOp(
19961 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19962 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19963 const X86Subtarget &Subtarget);
19964
19965 /// Top-level lowering for x86 vector shuffles.
19966///
19967/// This handles decomposition, canonicalization, and lowering of all x86
19968/// vector shuffles. Most of the specific lowering strategies are encapsulated
19969/// above in helper routines. The canonicalization attempts to widen shuffles
19970/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19971/// s.t. only one of the two inputs needs to be tested, etc.
19972static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19973 SelectionDAG &DAG) {
19974 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19975 ArrayRef<int> OrigMask = SVOp->getMask();
19976 SDValue V1 = Op.getOperand(0);
19977 SDValue V2 = Op.getOperand(1);
19978 MVT VT = Op.getSimpleValueType();
19979 int NumElements = VT.getVectorNumElements();
19980 SDLoc DL(Op);
19981 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19982
19983 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19984 "Can't lower MMX shuffles");
19985
19986 bool V1IsUndef = V1.isUndef();
19987 bool V2IsUndef = V2.isUndef();
19988 if (V1IsUndef && V2IsUndef)
19989 return DAG.getUNDEF(VT);
19990
19991 // When we create a shuffle node we put the UNDEF node in the second operand,
19992 // but in some cases the first operand may be transformed to UNDEF.
19993 // In this case we should just commute the node.
19994 if (V1IsUndef)
19995 return DAG.getCommutedVectorShuffle(*SVOp);
19996
19997 // Check for non-undef masks pointing at an undef vector and make the masks
19998 // undef as well. This makes it easier to match the shuffle based solely on
19999 // the mask.
20000 if (V2IsUndef &&
20001 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20002 SmallVector<int, 8> NewMask(OrigMask);
20003 for (int &M : NewMask)
20004 if (M >= NumElements)
20005 M = -1;
20006 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20007 }
20008
20009 // Check for illegal shuffle mask element index values.
20010 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20011 (void)MaskUpperLimit;
20012 assert(llvm::all_of(OrigMask,
20013 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20014 "Out of bounds shuffle index");
20015
20016 // We actually see shuffles that are entirely re-arrangements of a set of
20017 // zero inputs. This mostly happens while decomposing complex shuffles into
20018 // simple ones. Directly lower these as a buildvector of zeros.
20019 APInt KnownUndef, KnownZero;
20020 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20021
20022 APInt Zeroable = KnownUndef | KnownZero;
20023 if (Zeroable.isAllOnes())
20024 return getZeroVector(VT, Subtarget, DAG, DL);
20025
20026 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20027
20028 // Try to collapse shuffles into using a vector type with fewer elements but
20029 // wider element types. We cap this to not form integers or floating point
20030 // elements wider than 64 bits. It does not seem beneficial to form i128
20031 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
20032 SmallVector<int, 16> WidenedMask;
20033 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20034 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20035 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20036 // Shuffle mask widening should not interfere with a broadcast opportunity
20037 // by obfuscating the operands with bitcasts.
20038 // TODO: Avoid lowering directly from this top-level function: make this
20039 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20040 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20041 Subtarget, DAG))
20042 return Broadcast;
20043
20044 MVT NewEltVT = VT.isFloatingPoint()
20045 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20046 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20047 int NewNumElts = NumElements / 2;
20048 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20049 // Make sure that the new vector type is legal. For example, v2f64 isn't
20050 // legal on SSE1.
20051 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20052 if (V2IsZero) {
20053 // Modify the new Mask to take all zeros from the all-zero vector.
20054 // Choose indices that are blend-friendly.
20055 bool UsedZeroVector = false;
20056 assert(is_contained(WidenedMask, SM_SentinelZero) &&
20057 "V2's non-undef elements are used?!");
20058 for (int i = 0; i != NewNumElts; ++i)
20059 if (WidenedMask[i] == SM_SentinelZero) {
20060 WidenedMask[i] = i + NewNumElts;
20061 UsedZeroVector = true;
20062 }
20063 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20064 // some elements to be undef.
20065 if (UsedZeroVector)
20066 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20067 }
20068 V1 = DAG.getBitcast(NewVT, V1);
20069 V2 = DAG.getBitcast(NewVT, V2);
20070 return DAG.getBitcast(
20071 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20072 }
20073 }
20074
20075 SmallVector<SDValue> Ops = {V1, V2};
20076 SmallVector<int> Mask(OrigMask);
20077
20078 // Canonicalize the shuffle with any horizontal ops inputs.
20079 // NOTE: This may update Ops and Mask.
20080 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20081 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20082 return DAG.getBitcast(VT, HOp);
20083
20084 V1 = DAG.getBitcast(VT, Ops[0]);
20085 V2 = DAG.getBitcast(VT, Ops[1]);
20086 assert(NumElements == (int)Mask.size() &&
20087 "canonicalizeShuffleMaskWithHorizOp "
20088 "shouldn't alter the shuffle mask size");
20089
20090 // Commute the shuffle if it will improve canonicalization.
20091 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20092 ShuffleVectorSDNode::commuteMask(Mask);
20093 std::swap(V1, V2);
20094 }
20095
20096 // For each vector width, delegate to a specialized lowering routine.
20097 if (VT.is128BitVector())
20098 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20099
20100 if (VT.is256BitVector())
20101 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20102
20103 if (VT.is512BitVector())
20104 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20105
20106 if (Is1BitVector)
20107 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20108
20109 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20109)
;
20110}
20111
20112/// Try to lower a VSELECT instruction to a vector shuffle.
20113static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20114 const X86Subtarget &Subtarget,
20115 SelectionDAG &DAG) {
20116 SDValue Cond = Op.getOperand(0);
20117 SDValue LHS = Op.getOperand(1);
20118 SDValue RHS = Op.getOperand(2);
20119 MVT VT = Op.getSimpleValueType();
20120
20121 // Only non-legal VSELECTs reach this lowering; convert those into generic
20122 // shuffles and re-use the shuffle lowering path for blends.
20123 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20124 SmallVector<int, 32> Mask;
20125 if (createShuffleMaskFromVSELECT(Mask, Cond))
20126 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20127 }
20128
20129 return SDValue();
20130}
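Editor's note: createShuffleMaskFromVSELECT is not shown in this listing, so the sketch below only illustrates the editor's assumption about the conversion: under the usual VSELECT semantics a true condition lane keeps LHS[i] (shuffle index i) and a false lane takes RHS[i] (shuffle index N + i).

#include <cstdio>
#include <vector>

// Assumed VSELECT -> shuffle-mask conversion for a constant condition vector
// (editorial sketch; maskFromConstantCond is a hypothetical helper).
static std::vector<int> maskFromConstantCond(const std::vector<bool> &Cond) {
  int N = (int)Cond.size();
  std::vector<int> Mask(N);
  for (int i = 0; i != N; ++i)
    Mask[i] = Cond[i] ? i : N + i;
  return Mask;
}

int main() {
  // Condition {true, false, false, true} on a v4i32 becomes the mask {0,5,6,3}.
  for (int M : maskFromConstantCond({true, false, false, true}))
    std::printf("%d ", M); // prints: 0 5 6 3
  std::printf("\n");
  return 0;
}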
20131
20132SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20133 SDValue Cond = Op.getOperand(0);
20134 SDValue LHS = Op.getOperand(1);
20135 SDValue RHS = Op.getOperand(2);
20136
20137 SDLoc dl(Op);
20138 MVT VT = Op.getSimpleValueType();
20139 if (isSoftFP16(VT)) {
20140 MVT NVT = VT.changeVectorElementTypeToInteger();
20141 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20142 DAG.getBitcast(NVT, LHS),
20143 DAG.getBitcast(NVT, RHS)));
20144 }
20145
20146 // A vselect where all conditions and data are constants can be optimized into
20147 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20148 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20149 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20150 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20151 return SDValue();
20152
20153 // Try to lower this to a blend-style vector shuffle. This can handle all
20154 // constant condition cases.
20155 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20156 return BlendOp;
20157
20158 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20159 // with patterns on the mask registers on AVX-512.
20160 MVT CondVT = Cond.getSimpleValueType();
20161 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20162 if (CondEltSize == 1)
20163 return Op;
20164
20165 // Variable blends are only legal from SSE4.1 onward.
20166 if (!Subtarget.hasSSE41())
20167 return SDValue();
20168
20169 unsigned EltSize = VT.getScalarSizeInBits();
20170 unsigned NumElts = VT.getVectorNumElements();
20171
20172 // Expand v32i16/v64i8 without BWI.
20173 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20174 return SDValue();
20175
20176 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20177 // into an i1 condition so that we can use the mask-based 512-bit blend
20178 // instructions.
20179 if (VT.getSizeInBits() == 512) {
20180 // Build a mask by testing the condition against zero.
20181 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20182 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20183 DAG.getConstant(0, dl, CondVT),
20184 ISD::SETNE);
20185 // Now return a new VSELECT using the mask.
20186 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20187 }
20188
20189 // SEXT/TRUNC cases where the mask doesn't match the destination size.
20190 if (CondEltSize != EltSize) {
20191 // If we don't have a sign splat, rely on the expansion.
20192 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20193 return SDValue();
20194
20195 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20196 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20197 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20198 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20199 }
20200
20201 // Only some types will be legal on some subtargets. If we can emit a legal
20202 // VSELECT-matching blend, return Op; if we need to expand, return
20203 // a null value.
20204 switch (VT.SimpleTy) {
20205 default:
20206 // Most of the vector types have blends past SSE4.1.
20207 return Op;
20208
20209 case MVT::v32i8:
20210 // The byte blends for AVX vectors were introduced only in AVX2.
20211 if (Subtarget.hasAVX2())
20212 return Op;
20213
20214 return SDValue();
20215
20216 case MVT::v8i16:
20217 case MVT::v16i16: {
20218 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20219 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20220 Cond = DAG.getBitcast(CastVT, Cond);
20221 LHS = DAG.getBitcast(CastVT, LHS);
20222 RHS = DAG.getBitcast(CastVT, RHS);
20223 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20224 return DAG.getBitcast(VT, Select);
20225 }
20226 }
20227}
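
The v8i16/v16i16 case above works because a sign-splatted 16-bit condition lane bitcasts to two byte lanes with the same all-zeros/all-ones pattern, so a vXi8 byte blend makes the same choice for both halves. A small self-contained check of that assumption (hypothetical code, not part of the lowering):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t lane = 0xFFFF;              // "select LHS" in the i16 domain
      uint8_t lo = (uint8_t)(lane & 0xFF); // low byte of the same lane
      uint8_t hi = (uint8_t)(lane >> 8);   // high byte of the same lane
      std::printf("%02x %02x\n", lo, hi);  // both 0xff, so a byte blend also picks LHS
      return 0;
    }
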
20228
20229static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20230 MVT VT = Op.getSimpleValueType();
20231 SDValue Vec = Op.getOperand(0);
20232 SDValue Idx = Op.getOperand(1);
20233 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20234 SDLoc dl(Op);
20235
20236 if (!Vec.getSimpleValueType().is128BitVector())
20237 return SDValue();
20238
20239 if (VT.getSizeInBits() == 8) {
20240 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20241 // we're going to zero extend the register or fold the store.
20242 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20243 !X86::mayFoldIntoStore(Op))
20244 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20245 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20246 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20247
20248 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20249 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20250 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20251 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20252 }
20253
20254 if (VT == MVT::f32) {
20255 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20256 // the result back to FR32 register. It's only worth matching if the
20257 // result has a single use which is a store or a bitcast to i32. And in
20258 // the case of a store, it's not worth it if the index is a constant 0,
20259 // because a MOVSSmr can be used instead, which is smaller and faster.
20260 if (!Op.hasOneUse())
20261 return SDValue();
20262 SDNode *User = *Op.getNode()->use_begin();
20263 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20264 (User->getOpcode() != ISD::BITCAST ||
20265 User->getValueType(0) != MVT::i32))
20266 return SDValue();
20267 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20268 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20269 return DAG.getBitcast(MVT::f32, Extract);
20270 }
20271
20272 if (VT == MVT::i32 || VT == MVT::i64)
20273 return Op;
20274
20275 return SDValue();
20276}
20277
20278/// Extract one bit from mask vector, like v16i1 or v8i1.
20279/// AVX-512 feature.
20280static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20281 const X86Subtarget &Subtarget) {
20282 SDValue Vec = Op.getOperand(0);
20283 SDLoc dl(Vec);
20284 MVT VecVT = Vec.getSimpleValueType();
20285 SDValue Idx = Op.getOperand(1);
20286 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20287 MVT EltVT = Op.getSimpleValueType();
20288
20289 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20290        "Unexpected vector type in ExtractBitFromMaskVector");
20291
20292 // A variable index can't be handled in mask registers,
20293 // so extend the vector to VR512/VR128.
20294 if (!IdxC) {
20295 unsigned NumElts = VecVT.getVectorNumElements();
20296 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20297 // than extending to 128/256-bit.
20298 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20299 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20300 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20301 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20302 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20303 }
20304
20305 unsigned IdxVal = IdxC->getZExtValue();
20306 if (IdxVal == 0) // the operation is legal
20307 return Op;
20308
20309 // Extend to natively supported kshift.
20310 unsigned NumElems = VecVT.getVectorNumElements();
20311 MVT WideVecVT = VecVT;
20312 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20313 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20314 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20315 DAG.getUNDEF(WideVecVT), Vec,
20316 DAG.getIntPtrConstant(0, dl));
20317 }
20318
20319 // Use kshiftr instruction to move to the lower element.
20320 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20321 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20322
20323 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20324 DAG.getIntPtrConstant(0, dl));
20325}
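
A scalar analogue of the KSHIFTR trick used above (illustrative only; extractMaskBit is a hypothetical name): shifting the k-register-like mask right by the index moves the wanted bit into position 0, which is then the one element that is legal to extract directly.

    #include <cstdint>

    bool extractMaskBit(uint16_t KMask, unsigned IdxVal) {
      return ((KMask >> IdxVal) & 1u) != 0; // kshiftr by IdxVal, then read lane 0
    }
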
20326
20327SDValue
20328X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20329 SelectionDAG &DAG) const {
20330 SDLoc dl(Op);
20331 SDValue Vec = Op.getOperand(0);
20332 MVT VecVT = Vec.getSimpleValueType();
20333 SDValue Idx = Op.getOperand(1);
20334 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20335
20336 if (VecVT.getVectorElementType() == MVT::i1)
20337 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20338
20339 if (!IdxC) {
20340 // It's more profitable to go through memory (1 cycle throughput)
20341 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20342 // The IACA tool was used to get the performance estimate
20343 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20344 //
20345 // example : extractelement <16 x i8> %a, i32 %i
20346 //
20347 // Block Throughput: 3.00 Cycles
20348 // Throughput Bottleneck: Port5
20349 //
20350 // | Num Of | Ports pressure in cycles | |
20351 // | Uops | 0 - DV | 5 | 6 | 7 | |
20352 // ---------------------------------------------
20353 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20354 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20355 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20356 // Total Num Of Uops: 4
20357 //
20358 //
20359 // Block Throughput: 1.00 Cycles
20360 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20361 //
20362 // | | Ports pressure in cycles | |
20363 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20364 // ---------------------------------------------------------
20365 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20366 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20367 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20368 // Total Num Of Uops: 4
20369
20370 return SDValue();
20371 }
20372
20373 unsigned IdxVal = IdxC->getZExtValue();
20374
20375 // If this is a 256-bit vector result, first extract the 128-bit vector and
20376 // then extract the element from the 128-bit vector.
20377 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20378 // Get the 128-bit vector.
20379 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20380 MVT EltVT = VecVT.getVectorElementType();
20381
20382 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20383 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20384
20385 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20386 // this can be done with a mask.
20387 IdxVal &= ElemsPerChunk - 1;
20388 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20389 DAG.getIntPtrConstant(IdxVal, dl));
20390 }
20391
20392 assert(VecVT.is128BitVector() && "Unexpected vector length");
20393
20394 MVT VT = Op.getSimpleValueType();
20395
20396 if (VT == MVT::i16) {
20397 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20398 // we're going to zero extend the register or fold the store (SSE41 only).
20399 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20400 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20401 if (Subtarget.hasFP16())
20402 return Op;
20403
20404 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20405 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20406 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20407 }
20408
20409 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20410 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20411 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20412 }
20413
20414 if (Subtarget.hasSSE41())
20415 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20416 return Res;
20417
20418 // TODO: We only extract a single element from v16i8, we can probably afford
20419 // to be more aggressive here before using the default approach of spilling to
20420 // stack.
20421 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20422 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20423 int DWordIdx = IdxVal / 4;
20424 if (DWordIdx == 0) {
20425 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20426 DAG.getBitcast(MVT::v4i32, Vec),
20427 DAG.getIntPtrConstant(DWordIdx, dl));
20428 int ShiftVal = (IdxVal % 4) * 8;
20429 if (ShiftVal != 0)
20430 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20431 DAG.getConstant(ShiftVal, dl, MVT::i8));
20432 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20433 }
20434
20435 int WordIdx = IdxVal / 2;
20436 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20437 DAG.getBitcast(MVT::v8i16, Vec),
20438 DAG.getIntPtrConstant(WordIdx, dl));
20439 int ShiftVal = (IdxVal % 2) * 8;
20440 if (ShiftVal != 0)
20441 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20442 DAG.getConstant(ShiftVal, dl, MVT::i8));
20443 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20444 }
20445
20446 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20447 if (IdxVal == 0)
20448 return Op;
20449
20450 // Shuffle the element to the lowest element, then movss or movsh.
20451 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20452 Mask[0] = static_cast<int>(IdxVal);
20453 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20454 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20455 DAG.getIntPtrConstant(0, dl));
20456 }
20457
20458 if (VT.getSizeInBits() == 64) {
20459 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20460 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20461 // to match extract_elt for f64.
20462 if (IdxVal == 0)
20463 return Op;
20464
20465 // UNPCKHPD the element to the lowest double word, then movsd.
20466 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20467 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20468 int Mask[2] = { 1, -1 };
20469 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20470 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20471 DAG.getIntPtrConstant(0, dl));
20472 }
20473
20474 return SDValue();
20475}
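
The sub-byte path above avoids a stack spill by extracting the containing i32/i16 lane and shifting the wanted byte down. A scalar sketch of that arithmetic (extractByteViaDWord is a hypothetical name, assuming the little-endian lane layout used here):

    #include <cstdint>

    uint8_t extractByteViaDWord(uint32_t DWordLane, unsigned IdxVal) {
      unsigned ShiftVal = (IdxVal % 4) * 8;    // byte position inside the i32 lane
      return (uint8_t)(DWordLane >> ShiftVal); // SRL + TRUNCATE
    }
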
20476
20477/// Insert one bit to mask vector, like v16i1 or v8i1.
20478/// AVX-512 feature.
20479static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20480 const X86Subtarget &Subtarget) {
20481 SDLoc dl(Op);
20482 SDValue Vec = Op.getOperand(0);
20483 SDValue Elt = Op.getOperand(1);
20484 SDValue Idx = Op.getOperand(2);
20485 MVT VecVT = Vec.getSimpleValueType();
20486
20487 if (!isa<ConstantSDNode>(Idx)) {
20488 // Non-constant index. Extend the source and destination,
20489 // insert the element, and then truncate the result.
20490 unsigned NumElts = VecVT.getVectorNumElements();
20491 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20492 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20493 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20494 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20495 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20496 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20497 }
20498
20499 // Copy into a k-register, extract to v1i1 and insert_subvector.
20500 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20501 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20502}
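
For intuition, a scalar analogue of inserting one i1 element into a mask register at a constant index (insertMaskBit is a hypothetical name; the actual lowering above uses SCALAR_TO_VECTOR plus INSERT_SUBVECTOR on k-registers rather than bit twiddling):

    #include <cstdint>

    uint16_t insertMaskBit(uint16_t KMask, bool Bit, unsigned Idx) {
      KMask &= (uint16_t)~(1u << Idx);             // drop the old lane
      KMask |= (uint16_t)((Bit ? 1u : 0u) << Idx); // place the new i1 element
      return KMask;
    }
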
20503
20504SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20505 SelectionDAG &DAG) const {
20506 MVT VT = Op.getSimpleValueType();
20507 MVT EltVT = VT.getVectorElementType();
20508 unsigned NumElts = VT.getVectorNumElements();
20509 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20510
20511 if (EltVT == MVT::i1)
20512 return InsertBitToMaskVector(Op, DAG, Subtarget);
20513
20514 SDLoc dl(Op);
20515 SDValue N0 = Op.getOperand(0);
20516 SDValue N1 = Op.getOperand(1);
20517 SDValue N2 = Op.getOperand(2);
20518 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20519
20520 if (!N2C) {
20521 // For variable insertion indices we're usually better off spilling to the stack,
20522 // but AVX512 can use a variable compare+select by comparing against all
20523 // possible vector indices, and FP insertion has less gpr->simd traffic.
20524 if (!(Subtarget.hasBWI() ||
20525 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20526 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20527 return SDValue();
20528
20529 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20530 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20531 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20532 return SDValue();
20533
20534 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20535 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20536 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20537
20538 SmallVector<SDValue, 16> RawIndices;
20539 for (unsigned I = 0; I != NumElts; ++I)
20540 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20541 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20542
20543 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20544 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20545 ISD::CondCode::SETEQ);
20546 }
20547
20548 if (N2C->getAPIntValue().uge(NumElts))
20549 return SDValue();
20550 uint64_t IdxVal = N2C->getZExtValue();
20551
20552 bool IsZeroElt = X86::isZeroNode(N1);
20553 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20554
20555 if (IsZeroElt || IsAllOnesElt) {
20556 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20557 // We don't deal with i8 0 since it appears to be handled elsewhere.
20558 if (IsAllOnesElt &&
20559 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20560 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20561 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20562 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20563 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20564 CstVectorElts[IdxVal] = OnesCst;
20565 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20566 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20567 }
20568 // See if we can do this more efficiently with a blend shuffle with a
20569 // rematerializable vector.
20570 if (Subtarget.hasSSE41() &&
20571 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20572 SmallVector<int, 8> BlendMask;
20573 for (unsigned i = 0; i != NumElts; ++i)
20574 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20575 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20576 : getOnesVector(VT, DAG, dl);
20577 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20578 }
20579 }
20580
20581 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20582 // into that, and then insert the subvector back into the result.
20583 if (VT.is256BitVector() || VT.is512BitVector()) {
20584 // With a 256-bit vector, we can insert into the zero element efficiently
20585 // using a blend if we have AVX or AVX2 and the right data type.
20586 if (VT.is256BitVector() && IdxVal == 0) {
20587 // TODO: It is worthwhile to cast integer to floating point and back
20588 // and incur a domain crossing penalty if that's what we'll end up
20589 // doing anyway after extracting to a 128-bit vector.
20590 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20591 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20592 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20593 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20594 DAG.getTargetConstant(1, dl, MVT::i8));
20595 }
20596 }
20597
20598 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20599 assert(isPowerOf2_32(NumEltsIn128) &&
20600        "Vectors will always have power-of-two number of elements.");
20601
20602 // If we are not inserting into the low 128-bit vector chunk,
20603 // then prefer the broadcast+blend sequence.
20604 // FIXME: relax the profitability check iff all N1 uses are insertions.
20605 if (IdxVal >= NumEltsIn128 &&
20606 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20607 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20608 X86::mayFoldLoad(N1, Subtarget)))) {
20609 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20610 SmallVector<int, 8> BlendMask;
20611 for (unsigned i = 0; i != NumElts; ++i)
20612 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20613 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20614 }
20615
20616 // Get the desired 128-bit vector chunk.
20617 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20618
20619 // Insert the element into the desired chunk.
20620 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20621 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20622
20623 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20624 DAG.getIntPtrConstant(IdxIn128, dl));
20625
20626 // Insert the changed part back into the bigger vector
20627 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20628 }
20629 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20630
20631 // This will be just movw/movd/movq/movsh/movss/movsd.
20632 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20633 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20634 EltVT == MVT::f16 || EltVT == MVT::i64) {
20635 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20636 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20637 }
20638
20639 // We can't directly insert an i8 or i16 into a vector, so zero extend
20640 // it to i32 first.
20641 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20642 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20643 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20644 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20645 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20646 return DAG.getBitcast(VT, N1);
20647 }
20648 }
20649
20650 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20651 // argument. SSE41 is required for pinsrb.
20652 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20653 unsigned Opc;
20654 if (VT == MVT::v8i16) {
20655 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20656 Opc = X86ISD::PINSRW;
20657 } else {
20658 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20659 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20660 Opc = X86ISD::PINSRB;
20661 }
20662
20663 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20664 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20665 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20666 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20667 }
20668
20669 if (Subtarget.hasSSE41()) {
20670 if (EltVT == MVT::f32) {
20671 // Bits [7:6] of the constant are the source select. This will always be
20672 // zero here. The DAG Combiner may combine an extract_elt index into
20673 // these bits. For example (insert (extract, 3), 2) could be matched by
20674 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20675 // Bits [5:4] of the constant are the destination select. This is the
20676 // value of the incoming immediate.
20677 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20678 // combine either bitwise AND or insert of float 0.0 to set these bits.
20679
20680 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20681 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20682 // If this is an insertion of 32-bits into the low 32-bits of
20683 // a vector, we prefer to generate a blend with immediate rather
20684 // than an insertps. Blends are simpler operations in hardware and so
20685 // will always have equal or better performance than insertps.
20686 // But if optimizing for size and there's a load folding opportunity,
20687 // generate insertps because blendps does not have a 32-bit memory
20688 // operand form.
20689 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20690 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20691 DAG.getTargetConstant(1, dl, MVT::i8));
20692 }
20693 // Create this as a scalar-to-vector.
20694 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20695 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20696 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20697 }
20698
20699 // PINSR* works with constant index.
20700 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20701 return Op;
20702 }
20703
20704 return SDValue();
20705}
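
The variable-index branch above ("inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0") has a simple scalar model, sketched here with a hypothetical insertVariableIdx helper: every lane compares its own position against the splatted index and keeps either the splatted new element or its old value.

    #include <vector>

    std::vector<int> insertVariableIdx(std::vector<int> V, int Elt, unsigned Idx) {
      for (unsigned i = 0; i != V.size(); ++i)
        V[i] = (i == Idx) ? Elt : V[i]; // SplatN2 == {0,1,2,...} ? SplatN1 : N0
      return V;
    }
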
20706
20707static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20708 SelectionDAG &DAG) {
20709 SDLoc dl(Op);
20710 MVT OpVT = Op.getSimpleValueType();
20711
20712 // It's always cheaper to replace a xor+movd with xorps, and doing so simplifies
20713 // further combines.
20714 if (X86::isZeroNode(Op.getOperand(0)))
20715 return getZeroVector(OpVT, Subtarget, DAG, dl);
20716
20717 // If this is a 256-bit vector result, first insert into a 128-bit
20718 // vector and then insert into the 256-bit vector.
20719 if (!OpVT.is128BitVector()) {
20720 // Insert into a 128-bit vector.
20721 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20722 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20723 OpVT.getVectorNumElements() / SizeFactor);
20724
20725 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20726
20727 // Insert the 128-bit vector.
20728 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20729 }
20730 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20731        "Expected an SSE type!");
20732
20733 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20734 // tblgen.
20735 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20736 return Op;
20737
20738 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20739 return DAG.getBitcast(
20740 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20741}
20742
20743// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20744// simple superregister reference or explicit instructions to insert
20745// the upper bits of a vector.
20746static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20747 SelectionDAG &DAG) {
20748 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20749
20750 return insert1BitVector(Op, DAG, Subtarget);
20751}
20752
20753static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20754 SelectionDAG &DAG) {
20755 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20756        "Only vXi1 extract_subvectors need custom lowering");
20757
20758 SDLoc dl(Op);
20759 SDValue Vec = Op.getOperand(0);
20760 uint64_t IdxVal = Op.getConstantOperandVal(1);
20761
20762 if (IdxVal == 0) // the operation is legal
20763 return Op;
20764
20765 MVT VecVT = Vec.getSimpleValueType();
20766 unsigned NumElems = VecVT.getVectorNumElements();
20767
20768 // Extend to natively supported kshift.
20769 MVT WideVecVT = VecVT;
20770 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20771 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20772 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20773 DAG.getUNDEF(WideVecVT), Vec,
20774 DAG.getIntPtrConstant(0, dl));
20775 }
20776
20777 // Shift to the LSB.
20778 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20779 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20780
20781 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20782 DAG.getIntPtrConstant(0, dl));
20783}
20784
20785// Returns the appropriate wrapper opcode for a global reference.
20786unsigned X86TargetLowering::getGlobalWrapperKind(
20787 const GlobalValue *GV, const unsigned char OpFlags) const {
20788 // References to absolute symbols are never PC-relative.
20789 if (GV && GV->isAbsoluteSymbolRef())
20790 return X86ISD::Wrapper;
20791
20792 CodeModel::Model M = getTargetMachine().getCodeModel();
20793 if (Subtarget.isPICStyleRIPRel() &&
20794 (M == CodeModel::Small || M == CodeModel::Kernel))
20795 return X86ISD::WrapperRIP;
20796
20797 // In the medium model, functions can always be referenced RIP-relatively,
20798 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20799 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20800 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20801 return X86ISD::WrapperRIP;
20802
20803 // GOTPCREL references must always use RIP.
20804 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20805 return X86ISD::WrapperRIP;
20806
20807 return X86ISD::Wrapper;
20808}
20809
20810// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20811// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20812 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20813 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20814 // be used to form an addressing mode. These wrapped nodes will be selected
20815// into MOV32ri.
20816SDValue
20817X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20818 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20819
20820 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20821 // global base reg.
20822 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20823
20824 auto PtrVT = getPointerTy(DAG.getDataLayout());
20825 SDValue Result = DAG.getTargetConstantPool(
20826 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20827 SDLoc DL(CP);
20828 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20829 // With PIC, the address is actually $g + Offset.
20830 if (OpFlag) {
20831 Result =
20832 DAG.getNode(ISD::ADD, DL, PtrVT,
20833 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20834 }
20835
20836 return Result;
20837}
20838
20839SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20840 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20841
20842 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20843 // global base reg.
20844 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20845
20846 auto PtrVT = getPointerTy(DAG.getDataLayout());
20847 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20848 SDLoc DL(JT);
20849 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20850
20851 // With PIC, the address is actually $g + Offset.
20852 if (OpFlag)
20853 Result =
20854 DAG.getNode(ISD::ADD, DL, PtrVT,
20855 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20856
20857 return Result;
20858}
20859
20860SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20861 SelectionDAG &DAG) const {
20862 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20863}
20864
20865SDValue
20866X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20867 // Create the TargetBlockAddressAddress node.
20868 unsigned char OpFlags =
20869 Subtarget.classifyBlockAddressReference();
20870 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20871 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20872 SDLoc dl(Op);
20873 auto PtrVT = getPointerTy(DAG.getDataLayout());
20874 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20875 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20876
20877 // With PIC, the address is actually $g + Offset.
20878 if (isGlobalRelativeToPICBase(OpFlags)) {
20879 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20880 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20881 }
20882
20883 return Result;
20884}
20885
20886/// Creates target global address or external symbol nodes for calls or
20887/// other uses.
20888SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20889 bool ForCall) const {
20890 // Unpack the global address or external symbol.
20891 const SDLoc &dl = SDLoc(Op);
20892 const GlobalValue *GV = nullptr;
20893 int64_t Offset = 0;
20894 const char *ExternalSym = nullptr;
20895 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20896 GV = G->getGlobal();
20897 Offset = G->getOffset();
20898 } else {
20899 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20900 ExternalSym = ES->getSymbol();
20901 }
20902
20903 // Calculate some flags for address lowering.
20904 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20905 unsigned char OpFlags;
20906 if (ForCall)
20907 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20908 else
20909 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20910 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20911 bool NeedsLoad = isGlobalStubReference(OpFlags);
20912
20913 CodeModel::Model M = DAG.getTarget().getCodeModel();
20914 auto PtrVT = getPointerTy(DAG.getDataLayout());
20915 SDValue Result;
20916
20917 if (GV) {
20918 // Create a target global address if this is a global. If possible, fold the
20919 // offset into the global address reference. Otherwise, ADD it on later.
20920 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20921 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20922 // relocation will compute to a negative value, which is invalid.
20923 int64_t GlobalOffset = 0;
20924 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20925 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20926 std::swap(GlobalOffset, Offset);
20927 }
20928 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20929 } else {
20930 // If this is not a global address, this must be an external symbol.
20931 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20932 }
20933
20934 // If this is a direct call, avoid the wrapper if we don't need to do any
20935 // loads or adds. This allows SDAG ISel to match direct calls.
20936 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20937 return Result;
20938
20939 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20940
20941 // With PIC, the address is actually $g + Offset.
20942 if (HasPICReg) {
20943 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20944 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20945 }
20946
20947 // For globals that require a load from a stub to get the address, emit the
20948 // load.
20949 if (NeedsLoad)
20950 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20951 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20952
20953 // If there was a non-zero offset that we didn't fold, create an explicit
20954 // addition for it.
20955 if (Offset != 0)
20956 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20957 DAG.getConstant(Offset, dl, PtrVT));
20958
20959 return Result;
20960}
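
A minimal sketch of the offset-folding decision made above (splitGlobalOffset and FitsCodeModel are hypothetical names standing in for the X86::isOffsetSuitableForCodeModel check): non-negative offsets that the code model can encode are folded into the TargetGlobalAddress, and anything else becomes an explicit ADD afterwards.

    #include <cstdint>
    #include <utility>

    std::pair<int64_t, int64_t> splitGlobalOffset(int64_t Offset, bool FitsCodeModel) {
      int64_t Folded = 0;
      if (Offset >= 0 && FitsCodeModel)
        std::swap(Folded, Offset); // fold into the address node
      return {Folded, Offset};     // the second part is added explicitly
    }
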
20961
20962SDValue
20963X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20964 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20965}
20966
20967static SDValue
20968GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20969 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
20970 unsigned char OperandFlags, bool LocalDynamic = false) {
20971 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20972 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20973 SDLoc dl(GA);
20974 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20975 GA->getValueType(0),
20976 GA->getOffset(),
20977 OperandFlags);
20978
20979 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20980 : X86ISD::TLSADDR;
20981
20982 if (InGlue) {
20983 SDValue Ops[] = { Chain, TGA, *InGlue };
20984 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20985 } else {
20986 SDValue Ops[] = { Chain, TGA };
20987 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20988 }
20989
20990 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
20991 MFI.setAdjustsStack(true);
20992 MFI.setHasCalls(true);
20993
20994 SDValue Glue = Chain.getValue(1);
20995 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
20996}
20997
20998// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20999static SDValue
21000LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21001 const EVT PtrVT) {
21002 SDValue InGlue;
21003 SDLoc dl(GA); // ? function entry point might be better
21004 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21005 DAG.getNode(X86ISD::GlobalBaseReg,
21006 SDLoc(), PtrVT), InGlue);
21007 InGlue = Chain.getValue(1);
21008
21009 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21010}
21011
21012// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21013static SDValue
21014LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21015 const EVT PtrVT) {
21016 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21017 X86::RAX, X86II::MO_TLSGD);
21018}
21019
21020// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21021static SDValue
21022LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21023 const EVT PtrVT) {
21024 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21025 X86::EAX, X86II::MO_TLSGD);
21026}
21027
21028static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21029 SelectionDAG &DAG, const EVT PtrVT,
21030 bool Is64Bit, bool Is64BitLP64) {
21031 SDLoc dl(GA);
21032
21033 // Get the start address of the TLS block for this module.
21034 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21035 .getInfo<X86MachineFunctionInfo>();
21036 MFI->incNumLocalDynamicTLSAccesses();
21037
21038 SDValue Base;
21039 if (Is64Bit) {
21040 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21041 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21042 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21043 } else {
21044 SDValue InGlue;
21045 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21046 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21047 InGlue = Chain.getValue(1);
21048 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21049 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21050 }
21051
21052 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21053 // of Base.
21054
21055 // Build x@dtpoff.
21056 unsigned char OperandFlags = X86II::MO_DTPOFF;
21057 unsigned WrapperKind = X86ISD::Wrapper;
21058 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21059 GA->getValueType(0),
21060 GA->getOffset(), OperandFlags);
21061 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21062
21063 // Add x@dtpoff with the base.
21064 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21065}
21066
21067// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21068static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21069 const EVT PtrVT, TLSModel::Model model,
21070 bool is64Bit, bool isPIC) {
21071 SDLoc dl(GA);
21072
21073 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21074 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21075 is64Bit ? 257 : 256));
21076
21077 SDValue ThreadPointer =
21078 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21079 MachinePointerInfo(Ptr));
21080
21081 unsigned char OperandFlags = 0;
21082 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
21083 // initialexec.
21084 unsigned WrapperKind = X86ISD::Wrapper;
21085 if (model == TLSModel::LocalExec) {
21086 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21087 } else if (model == TLSModel::InitialExec) {
21088 if (is64Bit) {
21089 OperandFlags = X86II::MO_GOTTPOFF;
21090 WrapperKind = X86ISD::WrapperRIP;
21091 } else {
21092 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21093 }
21094 } else {
21095 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21095)
;
21096 }
21097
21098 // emit "addl x@ntpoff,%eax" (local exec)
21099 // or "addl x@indntpoff,%eax" (initial exec)
21100 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21101 SDValue TGA =
21102 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21103 GA->getOffset(), OperandFlags);
21104 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21105
21106 if (model == TLSModel::InitialExec) {
21107 if (isPIC && !is64Bit) {
21108 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21109 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21110 Offset);
21111 }
21112
21113 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21114 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21115 }
21116
21117 // The address of the thread local variable is the add of the thread
21118 // pointer with the offset of the variable.
21119 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21120}
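
Conceptually (a simplification, not real codegen), the exec TLS models above compute the address as the thread pointer plus a per-variable offset that the linker or dynamic loader resolves (@tpoff/@ntpoff/@gottpoff):

    #include <cstdint>

    uint64_t tlsExecAddress(uint64_t ThreadPointer, int64_t VarOffset) {
      return ThreadPointer + (uint64_t)VarOffset; // ADD of %fs:0 / %gs:0 and the offset
    }
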
21121
21122SDValue
21123X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21124
21125 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21126
21127 if (DAG.getTarget().useEmulatedTLS())
21128 return LowerToTLSEmulatedModel(GA, DAG);
21129
21130 const GlobalValue *GV = GA->getGlobal();
21131 auto PtrVT = getPointerTy(DAG.getDataLayout());
21132 bool PositionIndependent = isPositionIndependent();
21133
21134 if (Subtarget.isTargetELF()) {
21135 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21136 switch (model) {
21137 case TLSModel::GeneralDynamic:
21138 if (Subtarget.is64Bit()) {
21139 if (Subtarget.isTarget64BitLP64())
21140 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21141 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21142 }
21143 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21144 case TLSModel::LocalDynamic:
21145 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21146 Subtarget.isTarget64BitLP64());
21147 case TLSModel::InitialExec:
21148 case TLSModel::LocalExec:
21149 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21150 PositionIndependent);
21151 }
21152 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21152)
;
21153 }
21154
21155 if (Subtarget.isTargetDarwin()) {
21156 // Darwin only has one model of TLS. Lower to that.
21157 unsigned char OpFlag = 0;
21158 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21159 X86ISD::WrapperRIP : X86ISD::Wrapper;
21160
21161 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21162 // global base reg.
21163 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21164 if (PIC32)
21165 OpFlag = X86II::MO_TLVP_PIC_BASE;
21166 else
21167 OpFlag = X86II::MO_TLVP;
21168 SDLoc DL(Op);
21169 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21170 GA->getValueType(0),
21171 GA->getOffset(), OpFlag);
21172 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21173
21174 // With PIC32, the address is actually $g + Offset.
21175 if (PIC32)
21176 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21177 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21178 Offset);
21179
21180 // Lowering the machine isd will make sure everything is in the right
21181 // location.
21182 SDValue Chain = DAG.getEntryNode();
21183 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21184 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21185 SDValue Args[] = { Chain, Offset };
21186 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21187 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21188
21189 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21190 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21191 MFI.setAdjustsStack(true);
21192
21193 // And our return value (tls address) is in the standard call return value
21194 // location.
21195 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21196 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21197 }
21198
21199 if (Subtarget.isOSWindows()) {
21200 // Just use the implicit TLS architecture
21201 // Need to generate something similar to:
21202 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21203 // ; from TEB
21204 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21205 // mov rcx, qword [rdx+rcx*8]
21206 // mov eax, .tls$:tlsvar
21207 // [rax+rcx] contains the address
21208 // Windows 64bit: gs:0x58
21209 // Windows 32bit: fs:__tls_array
21210
21211 SDLoc dl(GA);
21212 SDValue Chain = DAG.getEntryNode();
21213
21214 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21215 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21216 // use its literal value of 0x2C.
21217 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21218 ? Type::getInt8PtrTy(*DAG.getContext(),
21219 256)
21220 : Type::getInt32PtrTy(*DAG.getContext(),
21221 257));
21222
21223 SDValue TlsArray = Subtarget.is64Bit()
21224 ? DAG.getIntPtrConstant(0x58, dl)
21225 : (Subtarget.isTargetWindowsGNU()
21226 ? DAG.getIntPtrConstant(0x2C, dl)
21227 : DAG.getExternalSymbol("_tls_array", PtrVT));
21228
21229 SDValue ThreadPointer =
21230 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21231
21232 SDValue res;
21233 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21234 res = ThreadPointer;
21235 } else {
21236 // Load the _tls_index variable
21237 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21238 if (Subtarget.is64Bit())
21239 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21240 MachinePointerInfo(), MVT::i32);
21241 else
21242 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21243
21244 const DataLayout &DL = DAG.getDataLayout();
21245 SDValue Scale =
21246 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21247 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21248
21249 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21250 }
21251
21252 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21253
21254 // Get the offset of start of .tls section
21255 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21256 GA->getValueType(0),
21257 GA->getOffset(), X86II::MO_SECREL);
21258 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21259
21260 // The address of the thread local variable is the add of the thread
21261 // pointer with the offset of the variable.
21262 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21263 }
21264
21265 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 21265)
;
21266}
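
The Windows implicit-TLS sequence above boils down to the following pointer arithmetic (winTlsAddress is a hypothetical illustration; TlsArray stands for the pointer loaded from gs:0x58 or fs:__tls_array):

    #include <cstdint>

    char *winTlsAddress(char **TlsArray, uint32_t TlsIndex, uint64_t SecRelOffset) {
      char *ModuleTlsBlock = TlsArray[TlsIndex]; // mov rcx, qword [rdx+rcx*8]
      return ModuleTlsBlock + SecRelOffset;      // [rax+rcx] is the variable's address
    }
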
21267
21268/// Lower SRA_PARTS and friends, which return two i32 values
21269/// and take a 2 x i32 value to shift plus a shift amount.
21270/// TODO: Can this be moved to general expansion code?
21271static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21272 SDValue Lo, Hi;
21273 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21274 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21275}
21276
21277// Try to use a packed vector operation to handle i64 on 32-bit targets when
21278// AVX512DQ is enabled.
21279static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21280 const X86Subtarget &Subtarget) {
21281 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21282         Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21283         Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21284         Op.getOpcode() == ISD::UINT_TO_FP) &&
21285        "Unexpected opcode!");
21286 bool IsStrict = Op->isStrictFPOpcode();
21287 unsigned OpNo = IsStrict ? 1 : 0;
21288 SDValue Src = Op.getOperand(OpNo);
21289 MVT SrcVT = Src.getSimpleValueType();
21290 MVT VT = Op.getSimpleValueType();
21291
21292 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21293 (VT != MVT::f32 && VT != MVT::f64))
21294 return SDValue();
21295
21296 // Pack the i64 into a vector, do the operation and extract.
21297
21298 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21299 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21300 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21301 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21302
21303 SDLoc dl(Op);
21304 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21305 if (IsStrict) {
21306 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21307 {Op.getOperand(0), InVec});
21308 SDValue Chain = CvtVec.getValue(1);
21309 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21310 DAG.getIntPtrConstant(0, dl));
21311 return DAG.getMergeValues({Value, Chain}, dl);
21312 }
21313
21314 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21315
21316 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21317 DAG.getIntPtrConstant(0, dl));
21318}
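
A scalar analogue of the pack-convert-extract pattern above (cvtViaVector is a hypothetical name; the real lowering emits a packed AVX512DQ conversion rather than a loop): the i64 is placed in lane 0 of a wider vector, the whole vector is converted, and lane 0 of the result is read back.

    #include <array>
    #include <cstdint>

    double cvtViaVector(int64_t Src) {
      std::array<int64_t, 4> InVec{Src, 0, 0, 0}; // SCALAR_TO_VECTOR
      std::array<double, 4> CvtVec{};
      for (int i = 0; i != 4; ++i)
        CvtVec[i] = (double)InVec[i];             // packed SINT_TO_FP
      return CvtVec[0];                           // EXTRACT_VECTOR_ELT lane 0
    }
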
21319
21320// Try to use a packed vector operation to handle i64 on 32-bit targets.
21321static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21322 const X86Subtarget &Subtarget) {
21323 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21324         Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21325         Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21326         Op.getOpcode() == ISD::UINT_TO_FP) &&
21327        "Unexpected opcode!");
21328 bool IsStrict = Op->isStrictFPOpcode();
21329 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21330 MVT SrcVT = Src.getSimpleValueType();
21331 MVT VT = Op.getSimpleValueType();
21332
21333 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21334 return SDValue();
21335
21336 // Pack the i64 into a vector, do the operation and extract.
21337
21338 assert(Subtarget.hasFP16() && "Expected FP16");
21339
21340 SDLoc dl(Op);
21341 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21342 if (IsStrict) {
21343 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21344 {Op.getOperand(0), InVec});
21345 SDValue Chain = CvtVec.getValue(1);
21346 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21347 DAG.getIntPtrConstant(0, dl));
21348 return DAG.getMergeValues({Value, Chain}, dl);
21349 }
21350
21351 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21352
21353 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21354 DAG.getIntPtrConstant(0, dl));
21355}
21356
21357static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21358 const X86Subtarget &Subtarget) {
21359 switch (Opcode) {
21360 case ISD::SINT_TO_FP:
21361 // TODO: Handle wider types with AVX/AVX512.
21362 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21363 return false;
21364 // CVTDQ2PS or (V)CVTDQ2PD
21365 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21366
21367 case ISD::UINT_TO_FP:
21368 // TODO: Handle wider types and i64 elements.
21369 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21370 return false;
21371 // VCVTUDQ2PS or VCVTUDQ2PD
21372 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21373
21374 default:
21375 return false;
21376 }
21377}
21378
21379/// Given a scalar cast operation that is extracted from a vector, try to
21380/// vectorize the cast op followed by extraction. This will avoid an expensive
21381/// round-trip between XMM and GPR.
21382static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21383 const X86Subtarget &Subtarget) {
21384 // TODO: This could be enhanced to handle smaller integer types by peeking
21385 // through an extend.
21386 SDValue Extract = Cast.getOperand(0);
21387 MVT DestVT = Cast.getSimpleValueType();
21388 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21389 !isa<ConstantSDNode>(Extract.getOperand(1)))
21390 return SDValue();
21391
21392 // See if we have a 128-bit vector cast op for this type of cast.
21393 SDValue VecOp = Extract.getOperand(0);
21394 MVT FromVT = VecOp.getSimpleValueType();
21395 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21396 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21397 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21398 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21399 return SDValue();
21400
21401 // If we are extracting from a non-zero element, first shuffle the source
21402 // vector to allow extracting from element zero.
21403 SDLoc DL(Cast);
21404 if (!isNullConstant(Extract.getOperand(1))) {
21405 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21406 Mask[0] = Extract.getConstantOperandVal(1);
21407 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21408 }
21409 // If the source vector is wider than 128-bits, extract the low part. Do not
21410 // create an unnecessarily wide vector cast op.
21411 if (FromVT != Vec128VT)
21412 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21413
21414 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21415 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21416 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21417 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21418 DAG.getIntPtrConstant(0, DL));
21419}
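
A minimal C++ sketch of the source-level pattern this transform targets, assuming Clang/GCC vector extensions; the helper name is hypothetical and this is not part of the LLVM sources:

    typedef int v4si __attribute__((vector_size(16)));

    double lane_to_double(v4si v) {
      // Naively, v[2] moves to a GPR and is converted there. The lowering above
      // instead shuffles lane 2 to lane 0, converts the whole 128-bit vector
      // (e.g. cvtdq2pd), and extracts element 0 of the vector result.
      return (double)v[2];
    }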
21420
21421/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21422/// try to vectorize the cast ops. This will avoid an expensive round-trip
21423/// between XMM and GPR.
21424static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21425 const X86Subtarget &Subtarget) {
21426 // TODO: Allow FP_TO_UINT.
21427 SDValue CastToInt = CastToFP.getOperand(0);
21428 MVT VT = CastToFP.getSimpleValueType();
21429 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21430 return SDValue();
21431
21432 MVT IntVT = CastToInt.getSimpleValueType();
21433 SDValue X = CastToInt.getOperand(0);
21434 MVT SrcVT = X.getSimpleValueType();
21435 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21436 return SDValue();
21437
21438 // See if we have 128-bit vector cast instructions for this type of cast.
21439 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21440 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21441 IntVT != MVT::i32)
21442 return SDValue();
21443
21444 unsigned SrcSize = SrcVT.getSizeInBits();
21445 unsigned IntSize = IntVT.getSizeInBits();
21446 unsigned VTSize = VT.getSizeInBits();
21447 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21448 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21449 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21450
21451 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21452 unsigned ToIntOpcode =
21453 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21454 unsigned ToFPOpcode =
21455 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21456
21457 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21458 //
21459 // We are not defining the high elements (for example, zero them) because
21460 // that could nullify any performance advantage that we hoped to gain from
21461 // this vector op hack. We do not expect any adverse effects (like denorm
21462 // penalties) with cast ops.
21463 SDLoc DL(CastToFP);
21464 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21465 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21466 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21467 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21468 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21469}
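
A hedged scalar illustration of the pattern this routine vectorizes (hypothetical helper name): a float-to-int-to-float round trip, which truncates toward zero for in-range inputs and which the transform keeps entirely in XMM via cvttps2dq/cvttpd2dq followed by cvtdq2ps/cvtdq2pd:

    float trunc_via_int(float x) {
      return (float)(int)x;  // sint_to_fp (fp_to_sint x), kept in XMM after the transform
    }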
21470
21471static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21472 const X86Subtarget &Subtarget) {
21473 SDLoc DL(Op);
21474 bool IsStrict = Op->isStrictFPOpcode();
21475 MVT VT = Op->getSimpleValueType(0);
21476 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21477
21478 if (Subtarget.hasDQI()) {
21479 assert(!Subtarget.hasVLX() && "Unexpected features");
21480
21481 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21482 Src.getSimpleValueType() == MVT::v4i64) &&
21483 "Unsupported custom type");
21484
21485 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21486 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21487 "Unexpected VT!");
21488 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21489
21490 // Need to concat with zero vector for strict fp to avoid spurious
21491 // exceptions.
21492 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21493 : DAG.getUNDEF(MVT::v8i64);
21494 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21495 DAG.getIntPtrConstant(0, DL));
21496 SDValue Res, Chain;
21497 if (IsStrict) {
21498 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21499 {Op->getOperand(0), Src});
21500 Chain = Res.getValue(1);
21501 } else {
21502 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21503 }
21504
21505 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21506 DAG.getIntPtrConstant(0, DL));
21507
21508 if (IsStrict)
21509 return DAG.getMergeValues({Res, Chain}, DL);
21510 return Res;
21511 }
21512
21513 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21514 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21515 if (VT != MVT::v4f32 || IsSigned)
21516 return SDValue();
21517
21518 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21519 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21520 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21521 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21522 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21523 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21524 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21525 SmallVector<SDValue, 4> SignCvts(4);
21526 SmallVector<SDValue, 4> Chains(4);
21527 for (int i = 0; i != 4; ++i) {
21528 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21529 DAG.getIntPtrConstant(i, DL));
21530 if (IsStrict) {
21531 SignCvts[i] =
21532 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21533 {Op.getOperand(0), Elt});
21534 Chains[i] = SignCvts[i].getValue(1);
21535 } else {
21536 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21537 }
21538 }
21539 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21540
21541 SDValue Slow, Chain;
21542 if (IsStrict) {
21543 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21544 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21545 {Chain, SignCvt, SignCvt});
21546 Chain = Slow.getValue(1);
21547 } else {
21548 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21549 }
21550
21551 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21552 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21553
21554 if (IsStrict)
21555 return DAG.getMergeValues({Cvt, Chain}, DL);
21556
21557 return Cvt;
21558}
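
A minimal scalar model of the unsigned v4i64-to-v4f32 path above, assuming round-to-nearest and C++ with <cstdint>; it illustrates the halving trick, not the DAG code itself:

    #include <cstdint>

    float u64_to_f32_halving(uint64_t x) {
      if ((int64_t)x >= 0)
        return (float)(int64_t)x;          // value fits in the signed range
      uint64_t half = (x >> 1) | (x & 1);  // Sign = (Src >> 1) | (Src & 1), low bit kept sticky
      float f = (float)(int64_t)half;      // signed conversion of the halved value
      return f + f;                        // the FADD of SignCvt with itself doubles it back
    }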
21559
21560static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21561 bool IsStrict = Op->isStrictFPOpcode();
21562 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21563 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21564 MVT VT = Op.getSimpleValueType();
21565 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21566 SDLoc dl(Op);
21567
21568 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21569 if (IsStrict)
21570 return DAG.getNode(
21571 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21572 {Chain,
21573 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21574 Rnd});
21575 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21576 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21577}
21578
21579static bool isLegalConversion(MVT VT, bool IsSigned,
21580 const X86Subtarget &Subtarget) {
21581 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21582 return true;
21583 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21584 return true;
21585 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21586 return true;
21587 if (Subtarget.useAVX512Regs()) {
21588 if (VT == MVT::v16i32)
21589 return true;
21590 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21591 return true;
21592 }
21593 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21594 (VT == MVT::v2i64 || VT == MVT::v4i64))
21595 return true;
21596 return false;
21597}
21598
21599SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21600 SelectionDAG &DAG) const {
21601 bool IsStrict = Op->isStrictFPOpcode();
21602 unsigned OpNo = IsStrict ? 1 : 0;
21603 SDValue Src = Op.getOperand(OpNo);
21604 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21605 MVT SrcVT = Src.getSimpleValueType();
21606 MVT VT = Op.getSimpleValueType();
21607 SDLoc dl(Op);
21608
21609 if (isSoftFP16(VT))
21610 return promoteXINT_TO_FP(Op, DAG);
21611 else if (isLegalConversion(SrcVT, true, Subtarget))
21612 return Op;
21613
21614 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21615 return LowerWin64_INT128_TO_FP(Op, DAG);
21616
21617 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21618 return Extract;
21619
21620 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21621 return R;
21622
21623 if (SrcVT.isVector()) {
21624 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21625 // Note: Since v2f64 is a legal type. We don't need to zero extend the
21626 // source for strict FP.
21627 if (IsStrict)
21628 return DAG.getNode(
21629 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21630 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21631 DAG.getUNDEF(SrcVT))});
21632 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21633 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21634 DAG.getUNDEF(SrcVT)));
21635 }
21636 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21637 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21638
21639 return SDValue();
21640 }
21641
21642 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21643 "Unknown SINT_TO_FP to lower!");
21644
21645 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21646
21647 // These are really Legal; return the operand so the caller accepts it as
21648 // Legal.
21649 if (SrcVT == MVT::i32 && UseSSEReg)
21650 return Op;
21651 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21652 return Op;
21653
21654 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21655 return V;
21656 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21657 return V;
21658
21659 // SSE doesn't have an i16 conversion so we need to promote.
21660 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21661 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21662 if (IsStrict)
21663 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21664 {Chain, Ext});
21665
21666 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21667 }
21668
21669 if (VT == MVT::f128 || !Subtarget.hasX87())
21670 return SDValue();
21671
21672 SDValue ValueToStore = Src;
21673 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21674 // Bitcasting to f64 here allows us to do a single 64-bit store from
21675 // an SSE register, avoiding the store forwarding penalty that would come
21676 // with two 32-bit stores.
21677 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21678
21679 unsigned Size = SrcVT.getStoreSize();
21680 Align Alignment(Size);
21681 MachineFunction &MF = DAG.getMachineFunction();
21682 auto PtrVT = getPointerTy(MF.getDataLayout());
21683 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21684 MachinePointerInfo MPI =
21685 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21686 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21687 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21688 std::pair<SDValue, SDValue> Tmp =
21689 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21690
21691 if (IsStrict)
21692 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21693
21694 return Tmp.first;
21695}
21696
21697std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21698 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21699 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21700 // Build the FILD
21701 SDVTList Tys;
21702 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21703 if (useSSE)
21704 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21705 else
21706 Tys = DAG.getVTList(DstVT, MVT::Other);
21707
21708 SDValue FILDOps[] = {Chain, Pointer};
21709 SDValue Result =
21710 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21711 Alignment, MachineMemOperand::MOLoad);
21712 Chain = Result.getValue(1);
21713
21714 if (useSSE) {
21715 MachineFunction &MF = DAG.getMachineFunction();
21716 unsigned SSFISize = DstVT.getStoreSize();
21717 int SSFI =
21718 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21719 auto PtrVT = getPointerTy(MF.getDataLayout());
21720 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21721 Tys = DAG.getVTList(MVT::Other);
21722 SDValue FSTOps[] = {Chain, Result, StackSlot};
21723 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21724 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21725 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21726
21727 Chain =
21728 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21729 Result = DAG.getLoad(
21730 DstVT, DL, Chain, StackSlot,
21731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21732 Chain = Result.getValue(1);
21733 }
21734
21735 return { Result, Chain };
21736}
21737
21738/// Horizontal vector math instructions may be slower than normal math with
21739/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21740/// implementation, and likely shuffle complexity of the alternate sequence.
21741static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21742 const X86Subtarget &Subtarget) {
21743 bool IsOptimizingSize = DAG.shouldOptForSize();
21744 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21745 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21746}
21747
21748/// 64-bit unsigned integer to double expansion.
21749static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21750 const X86Subtarget &Subtarget) {
21751 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21752 // when converting 0 while rounding toward negative infinity. The caller will
21753 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
21754 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21755 // This algorithm is not obvious. Here it is what we're trying to output:
21756 /*
21757 movq %rax, %xmm0
21758 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21759 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21760 #ifdef __SSE3__
21761 haddpd %xmm0, %xmm0
21762 #else
21763 pshufd $0x4e, %xmm0, %xmm1
21764 addpd %xmm1, %xmm0
21765 #endif
21766 */
21767
21768 SDLoc dl(Op);
21769 LLVMContext *Context = DAG.getContext();
21770
21771 // Build some magic constants.
21772 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21773 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21774 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21775 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21776
21777 SmallVector<Constant*,2> CV1;
21778 CV1.push_back(
21779 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21780 APInt(64, 0x4330000000000000ULL))));
21781 CV1.push_back(
21782 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21783 APInt(64, 0x4530000000000000ULL))));
21784 Constant *C1 = ConstantVector::get(CV1);
21785 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21786
21787 // Load the 64-bit value into an XMM register.
21788 SDValue XR1 =
21789 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21790 SDValue CLod0 = DAG.getLoad(
21791 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21792 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21793 SDValue Unpck1 =
21794 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21795
21796 SDValue CLod1 = DAG.getLoad(
21797 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21798 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21799 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21800 // TODO: Are there any fast-math-flags to propagate here?
21801 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21802 SDValue Result;
21803
21804 if (Subtarget.hasSSE3() &&
21805 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21806 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21807 } else {
21808 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21809 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21810 }
21811 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21812 DAG.getIntPtrConstant(0, dl));
21813 return Result;
21814}
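
A scalar sketch of the magic-constant trick described in the assembly comment above, assuming C++20 std::bit_cast; it models the punpckldq/subpd/haddpd sequence, with a single rounding in the final add:

    #include <bit>
    #include <cstdint>

    double u64_to_f64_magic(uint64_t x) {
      // 0x4330... is 2^52; OR-ing the low 32 bits into its mantissa is exact.
      double lo = std::bit_cast<double>(0x4330000000000000ULL | (x & 0xffffffffULL));
      // 0x4530... is 2^84; OR-ing the high 32 bits gives exactly 2^84 + hi * 2^32.
      double hi = std::bit_cast<double>(0x4530000000000000ULL | (x >> 32));
      // Subtract the two constants (the subpd) and sum the halves (the haddpd).
      return (lo - 0x1.0p52) + (hi - 0x1.0p84);
    }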
21815
21816/// 32-bit unsigned integer to float expansion.
21817static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21818 const X86Subtarget &Subtarget) {
21819 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21820 SDLoc dl(Op);
21821 // FP constant to bias correct the final result.
21822 SDValue Bias = DAG.getConstantFP(
21823 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21824
21825 // Load the 32-bit value into an XMM register.
21826 SDValue Load =
21827 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21828
21829 // Zero out the upper parts of the register.
21830 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21831
21832 // Or the load with the bias.
21833 SDValue Or = DAG.getNode(
21834 ISD::OR, dl, MVT::v2i64,
21835 DAG.getBitcast(MVT::v2i64, Load),
21836 DAG.getBitcast(MVT::v2i64,
21837 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21838 Or =
21839 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21840 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21841
21842 if (Op.getNode()->isStrictFPOpcode()) {
21843 // Subtract the bias.
21844 // TODO: Are there any fast-math-flags to propagate here?
21845 SDValue Chain = Op.getOperand(0);
21846 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21847 {Chain, Or, Bias});
21848
21849 if (Op.getValueType() == Sub.getValueType())
21850 return Sub;
21851
21852 // Handle final rounding.
21853 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21854 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21855
21856 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21857 }
21858
21859 // Subtract the bias.
21860 // TODO: Are there any fast-math-flags to propagate here?
21861 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21862
21863 // Handle final rounding.
21864 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21865}
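
A scalar model of the bias trick used above, assuming C++20 std::bit_cast; not the SelectionDAG code itself:

    #include <bit>
    #include <cstdint>

    double u32_to_f64_bias(uint32_t x) {
      // OR x into the mantissa of 2^52: the double is then exactly 2^52 + x.
      double biased = std::bit_cast<double>(0x4330000000000000ULL | (uint64_t)x);
      return biased - 0x1.0p52;  // exact subtraction leaves just the integer value
    }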
21866
21867static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21868 const X86Subtarget &Subtarget,
21869 const SDLoc &DL) {
21870 if (Op.getSimpleValueType() != MVT::v2f64)
21871 return SDValue();
21872
21873 bool IsStrict = Op->isStrictFPOpcode();
21874
21875 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21876 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21877
21878 if (Subtarget.hasAVX512()) {
21879 if (!Subtarget.hasVLX()) {
21880 // Let generic type legalization widen this.
21881 if (!IsStrict)
21882 return SDValue();
21883 // Otherwise pad the integer input with 0s and widen the operation.
21884 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21885 DAG.getConstant(0, DL, MVT::v2i32));
21886 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21887 {Op.getOperand(0), N0});
21888 SDValue Chain = Res.getValue(1);
21889 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21890 DAG.getIntPtrConstant(0, DL));
21891 return DAG.getMergeValues({Res, Chain}, DL);
21892 }
21893
21894 // Legalize to v4i32 type.
21895 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21896 DAG.getUNDEF(MVT::v2i32));
21897 if (IsStrict)
21898 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21899 {Op.getOperand(0), N0});
21900 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21901 }
21902
21903 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21904 // This gives us the floating point equivalent of 2^52 + the i32 integer
21905 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21906 // point leaving just our i32 integers in double format.
21907 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21908 SDValue VBias = DAG.getConstantFP(
21909 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21910 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21911 DAG.getBitcast(MVT::v2i64, VBias));
21912 Or = DAG.getBitcast(MVT::v2f64, Or);
21913
21914 if (IsStrict)
21915 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21916 {Op.getOperand(0), Or, VBias});
21917 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21918}
21919
21920static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21921 const X86Subtarget &Subtarget) {
21922 SDLoc DL(Op);
21923 bool IsStrict = Op->isStrictFPOpcode();
21924 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21925 MVT VecIntVT = V.getSimpleValueType();
21926 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21927 "Unsupported custom type");
21928
21929 if (Subtarget.hasAVX512()) {
21930 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21931 assert(!Subtarget.hasVLX() && "Unexpected features");
21932 MVT VT = Op->getSimpleValueType(0);
21933
21934 // v8i32->v8f64 is legal with AVX512 so just return it.
21935 if (VT == MVT::v8f64)
21936 return Op;
21937
21938 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21939 "Unexpected VT!");
21940 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21941 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21942 // Need to concat with zero vector for strict fp to avoid spurious
21943 // exceptions.
21944 SDValue Tmp =
21945 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21946 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21947 DAG.getIntPtrConstant(0, DL));
21948 SDValue Res, Chain;
21949 if (IsStrict) {
21950 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21951 {Op->getOperand(0), V});
21952 Chain = Res.getValue(1);
21953 } else {
21954 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21955 }
21956
21957 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21958 DAG.getIntPtrConstant(0, DL));
21959
21960 if (IsStrict)
21961 return DAG.getMergeValues({Res, Chain}, DL);
21962 return Res;
21963 }
21964
21965 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21966 Op->getSimpleValueType(0) == MVT::v4f64) {
21967 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21968 Constant *Bias = ConstantFP::get(
21969 *DAG.getContext(),
21970 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21971 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21972 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21973 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21974 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21975 SDValue VBias = DAG.getMemIntrinsicNode(
21976 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21977 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21978 MachineMemOperand::MOLoad);
21979
21980 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21981 DAG.getBitcast(MVT::v4i64, VBias));
21982 Or = DAG.getBitcast(MVT::v4f64, Or);
21983
21984 if (IsStrict)
21985 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21986 {Op.getOperand(0), Or, VBias});
21987 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21988 }
21989
21990 // The algorithm is the following:
21991 // #ifdef __SSE4_1__
21992 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21993 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21994 // (uint4) 0x53000000, 0xaa);
21995 // #else
21996 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21997 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21998 // #endif
21999 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22000 // return (float4) lo + fhi;
22001
22002 bool Is128 = VecIntVT == MVT::v4i32;
22003 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22004 // If we convert to something else than the supported type, e.g., to v4f64,
22005 // abort early.
22006 if (VecFloatVT != Op->getSimpleValueType(0))
22007 return SDValue();
22008
22009 // In the #idef/#else code, we have in common:
22010 // - The vector of constants:
22011 // -- 0x4b000000
22012 // -- 0x53000000
22013 // - A shift:
22014 // -- v >> 16
22015
22016 // Create the splat vector for 0x4b000000.
22017 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22018 // Create the splat vector for 0x53000000.
22019 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22020
22021 // Create the right shift.
22022 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22023 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22024
22025 SDValue Low, High;
22026 if (Subtarget.hasSSE41()) {
22027 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22028 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22029 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22030 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22031 // Low will be bitcasted right away, so do not bother bitcasting back to its
22032 // original type.
22033 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22034 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22035 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22036 // (uint4) 0x53000000, 0xaa);
22037 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22038 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22039 // High will be bitcasted right away, so do not bother bitcasting back to
22040 // its original type.
22041 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22042 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22043 } else {
22044 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22045 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22046 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22047 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22048
22049 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22050 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22051 }
22052
22053 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22054 SDValue VecCstFSub = DAG.getConstantFP(
22055 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22056
22057 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22058 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22059 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22060 // enabled. See PR24512.
22061 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22062 // TODO: Are there any fast-math-flags to propagate here?
22063 // (float4) lo;
22064 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22065 // return (float4) lo + fhi;
22066 if (IsStrict) {
22067 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22068 {Op.getOperand(0), HighBitcast, VecCstFSub});
22069 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22070 {FHigh.getValue(1), LowBitcast, FHigh});
22071 }
22072
22073 SDValue FHigh =
22074 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22075 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22076}
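
A scalar model of the lo/hi split in the algorithm comment above, assuming C++20 std::bit_cast; only the final add can round:

    #include <bit>
    #include <cstdint>

    float u32_to_f32_split(uint32_t v) {
      float lo  = std::bit_cast<float>(uint32_t(0x4b000000u | (v & 0xffffu))); // 2^23 + low 16 bits
      float hi  = std::bit_cast<float>(uint32_t(0x53000000u | (v >> 16)));     // 2^39 + high 16 bits * 2^16
      float fhi = hi - std::bit_cast<float>(uint32_t(0x53000080u));            // subtract 0x1.0p39f + 0x1.0p23f
      return lo + fhi;  // the 2^23 baked into lo cancels against fhi, leaving (float)v
    }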
22077
22078static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22079 const X86Subtarget &Subtarget) {
22080 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22081 SDValue N0 = Op.getOperand(OpNo);
22082 MVT SrcVT = N0.getSimpleValueType();
22083 SDLoc dl(Op);
22084
22085 switch (SrcVT.SimpleTy) {
22086 default:
22087 llvm_unreachable("Custom UINT_TO_FP is not supported!");
22088 case MVT::v2i32:
22089 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22090 case MVT::v4i32:
22091 case MVT::v8i32:
22092 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22093 case MVT::v2i64:
22094 case MVT::v4i64:
22095 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22096 }
22097}
22098
22099SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22100 SelectionDAG &DAG) const {
22101 bool IsStrict = Op->isStrictFPOpcode();
22102 unsigned OpNo = IsStrict ? 1 : 0;
22103 SDValue Src = Op.getOperand(OpNo);
22104 SDLoc dl(Op);
22105 auto PtrVT = getPointerTy(DAG.getDataLayout());
22106 MVT SrcVT = Src.getSimpleValueType();
22107 MVT DstVT = Op->getSimpleValueType(0);
22108 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22109
22110 // Bail out when we don't have native conversion instructions.
22111 if (DstVT == MVT::f128)
22112 return SDValue();
22113
22114 if (isSoftFP16(DstVT))
22115 return promoteXINT_TO_FP(Op, DAG);
22116 else if (isLegalConversion(SrcVT, false, Subtarget))
22117 return Op;
22118
22119 if (DstVT.isVector())
22120 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22121
22122 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22123 return LowerWin64_INT128_TO_FP(Op, DAG);
22124
22125 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22126 return Extract;
22127
22128 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22129 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22130 // Conversions from unsigned i32 to f32/f64 are legal,
22131 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22132 return Op;
22133 }
22134
22135 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22136 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22137 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22138 if (IsStrict)
22139 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22140 {Chain, Src});
22141 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22142 }
22143
22144 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22145 return V;
22146 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22147 return V;
22148
22149 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22150 // infinity. It produces -0.0, so disable under strictfp.
22151 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22152 !IsStrict)
22153 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22154 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22155 // negative infinity. So disable under strictfp. Using FILD instead.
22156 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22157 !IsStrict)
22158 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22159 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22160 (DstVT == MVT::f32 || DstVT == MVT::f64))
22161 return SDValue();
22162
22163 // Make a 64-bit buffer, and use it to build an FILD.
22164 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22165 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22166 Align SlotAlign(8);
22167 MachinePointerInfo MPI =
22168 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22169 if (SrcVT == MVT::i32) {
22170 SDValue OffsetSlot =
22171 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22172 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22173 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22174 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22175 std::pair<SDValue, SDValue> Tmp =
22176 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22177 if (IsStrict)
22178 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22179
22180 return Tmp.first;
22181 }
22182
22183 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22184 SDValue ValueToStore = Src;
22185 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22186 // Bitcasting to f64 here allows us to do a single 64-bit store from
22187 // an SSE register, avoiding the store forwarding penalty that would come
22188 // with two 32-bit stores.
22189 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22190 }
22191 SDValue Store =
22192 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22193 // For i64 source, we need to add the appropriate power of 2 if the input
22194 // was negative. We must be careful to do the computation in x87 extended
22195 // precision, not in SSE.
22196 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22197 SDValue Ops[] = { Store, StackSlot };
22198 SDValue Fild =
22199 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22200 SlotAlign, MachineMemOperand::MOLoad);
22201 Chain = Fild.getValue(1);
22202
22203
22204 // Check whether the sign bit is set.
22205 SDValue SignSet = DAG.getSetCC(
22206 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22207 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22208
22209 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22210 APInt FF(64, 0x5F80000000000000ULL);
22211 SDValue FudgePtr = DAG.getConstantPool(
22212 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22213 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22214
22215 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22216 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22217 SDValue Four = DAG.getIntPtrConstant(4, dl);
22218 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22219 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22220
22221 // Load the value out, extending it from f32 to f80.
22222 SDValue Fudge = DAG.getExtLoad(
22223 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22224 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22225 CPAlignment);
22226 Chain = Fudge.getValue(1);
22227 // Extend everything to 80 bits to force it to be done on x87.
22228 // TODO: Are there any fast-math-flags to propagate here?
22229 if (IsStrict) {
22230 unsigned Opc = ISD::STRICT_FADD;
22231 // Windows needs the precision control changed to 80bits around this add.
22232 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22233 Opc = X86ISD::STRICT_FP80_ADD;
22234
22235 SDValue Add =
22236 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22237 // STRICT_FP_ROUND can't handle equal types.
22238 if (DstVT == MVT::f80)
22239 return Add;
22240 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22241 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22242 }
22243 unsigned Opc = ISD::FADD;
22244 // Windows needs the precision control changed to 80bits around this add.
22245 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22246 Opc = X86ISD::FP80_ADD;
22247
22248 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22249 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22250 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22251}
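
A scalar model of the FILD-plus-fudge path above, assuming an x86 target where long double is the 80-bit x87 format; the final FP_ROUND to the destination type is omitted:

    #include <cstdint>

    long double u64_to_f80_fild(uint64_t x) {
      long double v = (long double)(int64_t)x;  // FILD always reads the 64-bit slot as signed
      if ((int64_t)x < 0)
        v += 0x1.0p64L;  // the 0x5F800000 fudge constant is 2^64 as a float
      return v;
    }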
22252
22253// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22254// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22255// just return an SDValue().
22256// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22257// to i16, i32 or i64, and we lower it to a legal sequence and return the
22258// result.
22259SDValue
22260X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22261 bool IsSigned, SDValue &Chain) const {
22262 bool IsStrict = Op->isStrictFPOpcode();
22263 SDLoc DL(Op);
22264
22265 EVT DstTy = Op.getValueType();
22266 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22267 EVT TheVT = Value.getValueType();
22268 auto PtrVT = getPointerTy(DAG.getDataLayout());
22269
22270 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22271 // f16 must be promoted before using the lowering in this routine.
22272 // fp128 does not use this lowering.
22273 return SDValue();
22274 }
22275
22276 // If using FIST to compute an unsigned i64, we'll need some fixup
22277 // to handle values above the maximum signed i64. A FIST is always
22278 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22279 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22280
22281 // FIXME: This does not generate an invalid exception if the input does not
22282 // fit in i32. PR44019
22283 if (!IsSigned && DstTy != MVT::i64) {
22284 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22285 // The low 32 bits of the fist result will have the correct uint32 result.
22286 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22287 DstTy = MVT::i64;
22288 }
22289
22290 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22291 DstTy.getSimpleVT() >= MVT::i16 &&
22292 "Unknown FP_TO_INT to lower!");
22293
22294 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22295 // stack slot.
22296 MachineFunction &MF = DAG.getMachineFunction();
22297 unsigned MemSize = DstTy.getStoreSize();
22298 int SSFI =
22299 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22300 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22301
22302 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22303
22304 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22305
22306 if (UnsignedFixup) {
22307 //
22308 // Conversion to unsigned i64 is implemented with a select,
22309 // depending on whether the source value fits in the range
22310 // of a signed i64. Let Thresh be the FP equivalent of
22311 // 0x8000000000000000ULL.
22312 //
22313 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22314 // FltOfs = (Value >= Thresh) ? Thresh : 0;
22315 // FistSrc = (Value - FltOfs);
22316 // Fist-to-mem64 FistSrc
22317 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22318 // to XOR'ing the high 32 bits with Adjust.
22319 //
22320 // Being a power of 2, Thresh is exactly representable in all FP formats.
22321 // For X87 we'd like to use the smallest FP type for this constant, but
22322 // for DAG type consistency we have to match the FP operand type.
22323
22324 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22325 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22326 bool LosesInfo = false;
22327 if (TheVT == MVT::f64)
22328 // The rounding mode is irrelevant as the conversion should be exact.
22329 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22330 &LosesInfo);
22331 else if (TheVT == MVT::f80)
22332 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22333 APFloat::rmNearestTiesToEven, &LosesInfo);
22334
22335 assert(Status == APFloat::opOK && !LosesInfo &&
22336 "FP conversion should have been exact");
22337
22338 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22339
22340 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22341 *DAG.getContext(), TheVT);
22342 SDValue Cmp;
22343 if (IsStrict) {
22344 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22345 /*IsSignaling*/ true);
22346 Chain = Cmp.getValue(1);
22347 } else {
22348 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22349 }
22350
22351 // Our preferred lowering of
22352 //
22353 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22354 //
22355 // is
22356 //
22357 // (Value >= Thresh) << 63
22358 //
22359 // but since we can get here after LegalOperations, DAGCombine might do the
22360 // wrong thing if we create a select. So, directly create the preferred
22361 // version.
22362 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22363 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22364 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22365
22366 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22367 DAG.getConstantFP(0.0, DL, TheVT));
22368
22369 if (IsStrict) {
22370 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22371 { Chain, Value, FltOfs });
22372 Chain = Value.getValue(1);
22373 } else
22374 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22375 }
22376
22377 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22378
22379 // FIXME This causes a redundant load/store if the SSE-class value is already
22380 // in memory, such as if it is on the callstack.
22381 if (isScalarFPTypeInSSEReg(TheVT)) {
22382 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22383 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22384 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22385 SDValue Ops[] = { Chain, StackSlot };
22386
22387 unsigned FLDSize = TheVT.getStoreSize();
22388 assert(FLDSize <= MemSize && "Stack slot not big enough");
22389 MachineMemOperand *MMO = MF.getMachineMemOperand(
22390 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22391 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22392 Chain = Value.getValue(1);
22393 }
22394
22395 // Build the FP_TO_INT*_IN_MEM
22396 MachineMemOperand *MMO = MF.getMachineMemOperand(
22397 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22398 SDValue Ops[] = { Chain, Value, StackSlot };
22399 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22400 DAG.getVTList(MVT::Other),
22401 Ops, DstTy, MMO);
22402
22403 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22404 Chain = Res.getValue(1);
22405
22406 // If we need an unsigned fixup, XOR the result with adjust.
22407 if (UnsignedFixup)
22408 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22409
22410 return Res;
22411}
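
A scalar model of the unsigned-i64 fixup above, assuming a finite input below 2^64; a signed cast stands in for the FISTP64-through-memory sequence:

    #include <cstdint>

    uint64_t f64_to_u64_fist(double value) {
      const double thresh = 0x1.0p63;                         // Thresh = 2^63, exact in f32/f64/f80
      uint64_t adjust = value >= thresh ? (1ULL << 63) : 0;   // (Value >= Thresh) << 63
      double fltofs   = value >= thresh ? thresh : 0.0;       // FltOfs
      int64_t fist    = (int64_t)(value - fltofs);            // FistSrc; the cast stands in for FISTP64
      return (uint64_t)fist ^ adjust;                         // XOR the high bit back in
    }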
22412
22413static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22414 const X86Subtarget &Subtarget) {
22415 MVT VT = Op.getSimpleValueType();
22416 SDValue In = Op.getOperand(0);
22417 MVT InVT = In.getSimpleValueType();
22418 SDLoc dl(Op);
22419 unsigned Opc = Op.getOpcode();
22420
22421 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22422 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22423 "Unexpected extension opcode");
22424 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22425 "Expected same number of elements");
22426 assert((VT.getVectorElementType() == MVT::i16 ||
22427 VT.getVectorElementType() == MVT::i32 ||
22428 VT.getVectorElementType() == MVT::i64) &&
22429 "Unexpected element type");
22430 assert((InVT.getVectorElementType() == MVT::i8 ||
22431         InVT.getVectorElementType() == MVT::i16 ||
22432         InVT.getVectorElementType() == MVT::i32) &&
22433        "Unexpected element type");
22434
22435 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22436
22437 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22438 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22439 return splitVectorIntUnary(Op, DAG);
22440 }
22441
22442 if (Subtarget.hasInt256())
22443 return Op;
22444
22445 // Optimize vectors in AVX mode:
22446 //
22447 // v8i16 -> v8i32
22448 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22449 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22450 // Concat upper and lower parts.
22451 //
22452 // v4i32 -> v4i64
22453 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22454 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22455 // Concat upper and lower parts.
22456 //
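// Illustrative sketch of the ZERO_EXTEND case for v8i16 <a0..a7> (example
// lane values and intrinsic names are illustrative only, not from this file):
//   vpmovzxwd on the low half yields  <a0,a1,a2,a3>          as zero-extended dwords,
//   vpunpckhwd with a zero vector gives <a4,0,a5,0,a6,0,a7,0> as words, which
//   reinterpreted as v4i32 is exactly  <a4,a5,a6,a7>          zero-extended,
//   and the final concat (typically vinsertf128) produces the full v8i32. Roughly:
//     __m128i lo = _mm_cvtepu16_epi32(in);                    // SSE4.1 pmovzxwd
//     __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());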
22457 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22458 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22459
22460 // Short-circuit if we can determine that each 128-bit half is the same value.
22461 // Otherwise, this is difficult to match and optimize.
22462 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22463 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22464 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22465
22466 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22467 SDValue Undef = DAG.getUNDEF(InVT);
22468 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22469 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22470 OpHi = DAG.getBitcast(HalfVT, OpHi);
22471
22472 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22473}
22474
22475// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22476static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22477 const SDLoc &dl, SelectionDAG &DAG) {
22478 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22479 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22480 DAG.getIntPtrConstant(0, dl));
22481 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22482 DAG.getIntPtrConstant(8, dl));
22483 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22484 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22485 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22486 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22487}
22488
22489static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22490 const X86Subtarget &Subtarget,
22491 SelectionDAG &DAG) {
22492 MVT VT = Op->getSimpleValueType(0);
22493 SDValue In = Op->getOperand(0);
22494 MVT InVT = In.getSimpleValueType();
22495 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22496 SDLoc DL(Op);
22497 unsigned NumElts = VT.getVectorNumElements();
22498
22499 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22500 // avoids a constant pool load.
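// Illustration (assumed example, zero-extending v8i1 to v8i16): sign_extend
// turns each i1 lane into 0x0000 or 0xFFFF, and the logical shift right by
// VT.getScalarSizeInBits()-1 == 15 then leaves 0 or 1 in every lane, so no
// constant-pool "ones" vector is needed.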
22501 if (VT.getVectorElementType() != MVT::i8) {
22502 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22503 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22504 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22505 }
22506
22507 // Extend VT if BWI is not supported.
22508 MVT ExtVT = VT;
22509 if (!Subtarget.hasBWI()) {
22510 // If v16i32 is to be avoided, we'll need to split and concatenate.
22511 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22512 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22513
22514 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22515 }
22516
22517 // Widen to 512-bits if VLX is not supported.
22518 MVT WideVT = ExtVT;
22519 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22520 NumElts *= 512 / ExtVT.getSizeInBits();
22521 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22522 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22523 In, DAG.getIntPtrConstant(0, DL));
22524 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22525 NumElts);
22526 }
22527
22528 SDValue One = DAG.getConstant(1, DL, WideVT);
22529 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22530
22531 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22532
22533 // Truncate if we had to extend above.
22534 if (VT != ExtVT) {
22535 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22536 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22537 }
22538
22539 // Extract back to 128/256-bit if we widened.
22540 if (WideVT != VT)
22541 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22542 DAG.getIntPtrConstant(0, DL));
22543
22544 return SelectedVal;
22545}
22546
22547static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22548 SelectionDAG &DAG) {
22549 SDValue In = Op.getOperand(0);
22550 MVT SVT = In.getSimpleValueType();
22551
22552 if (SVT.getVectorElementType() == MVT::i1)
22553 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22554
22555 assert(Subtarget.hasAVX() && "Expected AVX support");
22556 return LowerAVXExtend(Op, DAG, Subtarget);
22557}
22558
22559/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22560/// It makes use of the fact that vectors with enough leading sign/zero bits
22561/// prevent the PACKSS/PACKUS from saturating the results.
22562/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22563/// within each 128-bit lane.
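/// e.g. (illustrative values) PACKSSDW saturates each i32 lane to
/// [-32768, 32767], so packing <70000, -5, 3, 32768> would give
/// <32767, -5, 3, 32767> rather than the bit-truncation <4464, -5, 3, -32768>;
/// only when every lane already fits in i16 (at least 17 sign bits) does the
/// saturating pack equal the truncation. A rough intrinsic-level equivalent of
/// the exact case (names illustrative, not used below):
///   __m128i a = _mm_setr_epi32(1, -2, 32767, -32768); // all fit in i16
///   __m128i t = _mm_packs_epi32(a, a);                // == bit-truncation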
22564static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22565 const SDLoc &DL, SelectionDAG &DAG,
22566 const X86Subtarget &Subtarget) {
22567 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22568        "Unexpected PACK opcode");
22569 assert(DstVT.isVector() && "VT not a vector?");
22570
22571 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22572 if (!Subtarget.hasSSE2())
22573 return SDValue();
22574
22575 EVT SrcVT = In.getValueType();
22576
22577 // No truncation required, we might get here due to recursive calls.
22578 if (SrcVT == DstVT)
22579 return In;
22580
22581 // We only support vector truncation to 64bits or greater from a
22582 // 128bits or greater source.
22583 unsigned DstSizeInBits = DstVT.getSizeInBits();
22584 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22585 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22586 return SDValue();
22587
22588 unsigned NumElems = SrcVT.getVectorNumElements();
22589 if (!isPowerOf2_32(NumElems))
22590 return SDValue();
22591
22592 LLVMContext &Ctx = *DAG.getContext();
22593 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22594 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22595
22596 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22597
22598 // Pack to the largest type possible:
22599 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22600 EVT InVT = MVT::i16, OutVT = MVT::i8;
22601 if (SrcVT.getScalarSizeInBits() > 16 &&
22602 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22603 InVT = MVT::i32;
22604 OutVT = MVT::i16;
22605 }
22606
22607 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22608 if (SrcVT.is128BitVector()) {
22609 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22610 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22611 In = DAG.getBitcast(InVT, In);
22612 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22613 Res = extractSubVector(Res, 0, DAG, DL, 64);
22614 return DAG.getBitcast(DstVT, Res);
22615 }
22616
22617 // Split lower/upper subvectors.
22618 SDValue Lo, Hi;
22619 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22620
22621 unsigned SubSizeInBits = SrcSizeInBits / 2;
22622 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22623 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22624
22625 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22626 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22627 Lo = DAG.getBitcast(InVT, Lo);
22628 Hi = DAG.getBitcast(InVT, Hi);
22629 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22630 return DAG.getBitcast(DstVT, Res);
22631 }
22632
22633 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22634 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22635 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22636 Lo = DAG.getBitcast(InVT, Lo);
22637 Hi = DAG.getBitcast(InVT, Hi);
22638 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22639
22640 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22641 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22642 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22643 SmallVector<int, 64> Mask;
22644 int Scale = 64 / OutVT.getScalarSizeInBits();
22645 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22646 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22647
22648 if (DstVT.is256BitVector())
22649 return DAG.getBitcast(DstVT, Res);
22650
22651 // If 512bit -> 128bit truncate another stage.
22652 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22653 Res = DAG.getBitcast(PackedVT, Res);
22654 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22655 }
22656
22657 // Recursively pack lower/upper subvectors, concat result and pack again.
22658 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22659 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22660 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22661 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22662
22663 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22664 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22665 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22666}
22667
22668static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22669 const X86Subtarget &Subtarget) {
22670
22671 SDLoc DL(Op);
22672 MVT VT = Op.getSimpleValueType();
22673 SDValue In = Op.getOperand(0);
22674 MVT InVT = In.getSimpleValueType();
22675
22676 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22677
22678 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22679 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22680 if (InVT.getScalarSizeInBits() <= 16) {
22681 if (Subtarget.hasBWI()) {
22682 // legal, will go to VPMOVB2M, VPMOVW2M
22683 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22684 // We need to shift to get the lsb into sign position.
22685 // Shifting packed bytes is not supported natively, so bitcast to words.
22686 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22687 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22688 DAG.getBitcast(ExtVT, In),
22689 DAG.getConstant(ShiftInx, DL, ExtVT));
22690 In = DAG.getBitcast(InVT, In);
22691 }
22692 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22693 In, ISD::SETGT);
22694 }
22695 // Use TESTD/Q, extended vector to packed dword/qword.
22696 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22697        "Unexpected vector type.");
22698 unsigned NumElts = InVT.getVectorNumElements();
22699 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22700 // We need to change to a wider element type that we have support for.
22701 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22702 // For 16 element vectors we extend to v16i32 unless we are explicitly
22703 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22704 // we need to split into two 8 element vectors which we can extend to v8i32,
22705 // truncate and concat the results. There's an additional complication if
22706 // the original type is v16i8. In that case we can't split the v16i8
22707 // directly, so we need to shuffle high elements to low and use
22708 // sign_extend_vector_inreg.
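// Illustration of the v16i8 case (example lanes only): for In = <b0..b15>,
// the shuffle mask {8..15, -1 x 8} moves b8..b15 into the low half, and
// SIGN_EXTEND_VECTOR_INREG to v8i32 extends only the low 8 bytes of its
// operand, so Lo becomes <sext b0..b7> and Hi becomes <sext b8..b15>.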
22709 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22710 SDValue Lo, Hi;
22711 if (InVT == MVT::v16i8) {
22712 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22713 Hi = DAG.getVectorShuffle(
22714 InVT, DL, In, In,
22715 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22716 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22717 } else {
22718 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22719 Lo = extract128BitVector(In, 0, DAG, DL);
22720 Hi = extract128BitVector(In, 8, DAG, DL);
22721 }
22722 // We're split now, just emit two truncates and a concat. The two
22723 // truncates will trigger legalization to come back to this function.
22724 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22725 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22726 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22727 }
22728 // We either have 8 elements or we're allowed to use 512-bit vectors.
22729 // If we have VLX, we want to use the narrowest vector that can get the
22730 // job done so we use vXi32.
22731 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22732 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22733 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22734 InVT = ExtVT;
22735 ShiftInx = InVT.getScalarSizeInBits() - 1;
22736 }
22737
22738 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22739 // We need to shift to get the lsb into sign position.
22740 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22741 DAG.getConstant(ShiftInx, DL, InVT));
22742 }
22743 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22744 if (Subtarget.hasDQI())
22745 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22746 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22747}
22748
22749SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22750 SDLoc DL(Op);
22751 MVT VT = Op.getSimpleValueType();
22752 SDValue In = Op.getOperand(0);
22753 MVT InVT = In.getSimpleValueType();
22754 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22755
22756 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22757        "Invalid TRUNCATE operation");
22758
22759 // If we're called by the type legalizer, handle a few cases.
22760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22761 if (!TLI.isTypeLegal(InVT)) {
22762 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22763 VT.is128BitVector()) {
22764 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22765        "Unexpected subtarget!");
22766 // The default behavior is to truncate one step, concatenate, and then
22767 // truncate the remainder. We'd rather produce two 64-bit results and
22768 // concatenate those.
22769 SDValue Lo, Hi;
22770 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22771
22772 EVT LoVT, HiVT;
22773 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22774
22775 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22776 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22777 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22778 }
22779
22780 // Otherwise let default legalization handle it.
22781 return SDValue();
22782 }
22783
22784 if (VT.getVectorElementType() == MVT::i1)
22785 return LowerTruncateVecI1(Op, DAG, Subtarget);
22786
22787 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22788 if (Subtarget.hasAVX512()) {
22789 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22790 assert(VT == MVT::v32i8 && "Unexpected VT!");
22791 return splitVectorIntUnary(Op, DAG);
22792 }
22793
22794 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
22795 // and then truncate that. But we should only do that if we haven't been
22796 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22797 // handled by isel patterns.
22798 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22799 Subtarget.canExtendTo512DQ())
22800 return Op;
22801 }
22802
22803 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22804 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22805
22806 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22807 // that extend all the way to the packed/truncated value.
22808 // Pre-SSE41 we can only use PACKUSWB.
22809 KnownBits Known = DAG.computeKnownBits(In);
22810 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22811 if (SDValue V =
22812 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22813 return V;
22814
22815 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22816 // extend all the way to the packed/truncated value.
22817 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22818 if (SDValue V =
22819 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22820 return V;
22821
22822 // Handle truncation of V256 to V128 using shuffles.
22823 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22824
22825 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22826 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22827 if (Subtarget.hasInt256()) {
22828 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22829 In = DAG.getBitcast(MVT::v8i32, In);
22830 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22831 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22832 DAG.getIntPtrConstant(0, DL));
22833 }
22834
22835 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22836 DAG.getIntPtrConstant(0, DL));
22837 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22838 DAG.getIntPtrConstant(2, DL));
22839 static const int ShufMask[] = {0, 2, 4, 6};
22840 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22841 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22842 }
22843
22844 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22845 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22846 if (Subtarget.hasInt256()) {
22847 // The PSHUFB mask:
22848 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22849 -1, -1, -1, -1, -1, -1, -1, -1,
22850 16, 17, 20, 21, 24, 25, 28, 29,
22851 -1, -1, -1, -1, -1, -1, -1, -1 };
22852 In = DAG.getBitcast(MVT::v32i8, In);
22853 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22854 In = DAG.getBitcast(MVT::v4i64, In);
22855
22856 static const int ShufMask2[] = {0, 2, -1, -1};
22857 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22858 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22859 DAG.getIntPtrConstant(0, DL));
22860 return DAG.getBitcast(MVT::v8i16, In);
22861 }
22862
22863 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22864 DAG.getIntPtrConstant(0, DL));
22865 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22866 DAG.getIntPtrConstant(4, DL));
22867
22868 // The PSHUFB mask:
22869 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22870
22871 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22872 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22873
22874 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22875 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22876
22877 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22878 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22879
22880 // The MOVLHPS Mask:
22881 static const int ShufMask2[] = {0, 1, 4, 5};
22882 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22883 return DAG.getBitcast(MVT::v8i16, res);
22884 }
22885
22886 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22887 // Use an AND to zero the upper bits for PACKUS.
22888 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22889
22890 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22891 DAG.getIntPtrConstant(0, DL));
22892 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22893 DAG.getIntPtrConstant(8, DL));
22894 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22895 }
22896
22897 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22897)
;
22898}
22899
22900// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22901// behaves on out of range inputs to generate optimized conversions.
22902static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22903 SelectionDAG &DAG,
22904 const X86Subtarget &Subtarget) {
22905 MVT SrcVT = Src.getSimpleValueType();
22906 unsigned DstBits = VT.getScalarSizeInBits();
22907 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22908
22909 // Calculate the converted result for values in the range 0 to
22910 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22911 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22912 SDValue Big =
22913 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22914 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22915 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22916
22917 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22918 // and only if the value was out of range. So we can use that
22919 // as our indicator that we should use "Big" instead of "Small".
22920 //
22921 // Use "Small" if "IsOverflown" has all bits cleared
22922 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22923
22924 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22925 // use the slightly slower blendv select instead.
22926 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22927 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22928 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22929 }
22930
22931 SDValue IsOverflown =
22932 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22933 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22934 return DAG.getNode(ISD::OR, dl, VT, Small,
22935 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22936}
22937
22938SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22939 bool IsStrict = Op->isStrictFPOpcode();
22940 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22941 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22942 MVT VT = Op->getSimpleValueType(0);
22943 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22944 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22945 MVT SrcVT = Src.getSimpleValueType();
22946 SDLoc dl(Op);
22947
22948 SDValue Res;
22949 if (isSoftFP16(SrcVT)) {
22950 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22951 if (IsStrict)
22952 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22953 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22954 {NVT, MVT::Other}, {Chain, Src})});
22955 return DAG.getNode(Op.getOpcode(), dl, VT,
22956 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22957 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22958 return Op;
22959 }
22960
22961 if (VT.isVector()) {
22962 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22963 MVT ResVT = MVT::v4i32;
22964 MVT TruncVT = MVT::v4i1;
22965 unsigned Opc;
22966 if (IsStrict)
22967 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22968 else
22969 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22970
22971 if (!IsSigned && !Subtarget.hasVLX()) {
22972 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22973 // Widen to 512-bits.
22974 ResVT = MVT::v8i32;
22975 TruncVT = MVT::v8i1;
22976 Opc = Op.getOpcode();
22977 // Need to concat with zero vector for strict fp to avoid spurious
22978 // exceptions.
22979 // TODO: Should we just do this for non-strict as well?
22980 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22981 : DAG.getUNDEF(MVT::v8f64);
22982 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22983 DAG.getIntPtrConstant(0, dl));
22984 }
22985 if (IsStrict) {
22986 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22987 Chain = Res.getValue(1);
22988 } else {
22989 Res = DAG.getNode(Opc, dl, ResVT, Src);
22990 }
22991
22992 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22993 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22994 DAG.getIntPtrConstant(0, dl));
22995 if (IsStrict)
22996 return DAG.getMergeValues({Res, Chain}, dl);
22997 return Res;
22998 }
22999
23000 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23001 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23002 return Op;
23003
23004 MVT ResVT = VT;
23005 MVT EleVT = VT.getVectorElementType();
23006 if (EleVT != MVT::i64)
23007 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23008
23009 if (SrcVT != MVT::v8f16) {
23010 SDValue Tmp =
23011 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23012 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23013 Ops[0] = Src;
23014 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23015 }
23016
23017 if (IsStrict) {
23018 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23019 : X86ISD::STRICT_CVTTP2UI,
23020 dl, {ResVT, MVT::Other}, {Chain, Src});
23021 Chain = Res.getValue(1);
23022 } else {
23023 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23024 ResVT, Src);
23025 }
23026
23027 // TODO: Need to add exception check code for strict FP.
23028 if (EleVT.getSizeInBits() < 16) {
23029 ResVT = MVT::getVectorVT(EleVT, 8);
23030 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23031 }
23032
23033 if (ResVT != VT)
23034 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23035 DAG.getIntPtrConstant(0, dl));
23036
23037 if (IsStrict)
23038 return DAG.getMergeValues({Res, Chain}, dl);
23039 return Res;
23040 }
23041
23042 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23043 if (VT.getVectorElementType() == MVT::i16) {
23044 assert((SrcVT.getVectorElementType() == MVT::f32 ||
23045         SrcVT.getVectorElementType() == MVT::f64) &&
23046        "Expected f32/f64 vector!");
23047 MVT NVT = VT.changeVectorElementType(MVT::i32);
23048 if (IsStrict) {
23049 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23050 : ISD::STRICT_FP_TO_UINT,
23051 dl, {NVT, MVT::Other}, {Chain, Src});
23052 Chain = Res.getValue(1);
23053 } else {
23054 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23055 NVT, Src);
23056 }
23057
23058 // TODO: Need to add exception check code for strict FP.
23059 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23060
23061 if (IsStrict)
23062 return DAG.getMergeValues({Res, Chain}, dl);
23063 return Res;
23064 }
23065
23066 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23067 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23068 assert(!IsSigned && "Expected unsigned conversion!");
23069 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23070 return Op;
23071 }
23072
23073 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23074 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23075 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23076 Subtarget.useAVX512Regs()) {
23077 assert(!IsSigned && "Expected unsigned conversion!");
23078 assert(!Subtarget.hasVLX() && "Unexpected features!");
23079 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23080 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23081 // Need to concat with zero vector for strict fp to avoid spurious
23082 // exceptions.
23083 // TODO: Should we just do this for non-strict as well?
23084 SDValue Tmp =
23085 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23086 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23087 DAG.getIntPtrConstant(0, dl));
23088
23089 if (IsStrict) {
23090 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23091 {Chain, Src});
23092 Chain = Res.getValue(1);
23093 } else {
23094 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23095 }
23096
23097 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23098 DAG.getIntPtrConstant(0, dl));
23099
23100 if (IsStrict)
23101 return DAG.getMergeValues({Res, Chain}, dl);
23102 return Res;
23103 }
23104
23105 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23106 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23107 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23108 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23109 assert(!Subtarget.hasVLX() && "Unexpected features!");
23110 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23111 // Need to concat with zero vector for strict fp to avoid spurious
23112 // exceptions.
23113 // TODO: Should we just do this for non-strict as well?
23114 SDValue Tmp =
23115 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23116 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23117 DAG.getIntPtrConstant(0, dl));
23118
23119 if (IsStrict) {
23120 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23121 {Chain, Src});
23122 Chain = Res.getValue(1);
23123 } else {
23124 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23125 }
23126
23127 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23128 DAG.getIntPtrConstant(0, dl));
23129
23130 if (IsStrict)
23131 return DAG.getMergeValues({Res, Chain}, dl);
23132 return Res;
23133 }
23134
23135 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23136 if (!Subtarget.hasVLX()) {
23137 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
23138 // legalizer and then widened again by vector op legalization.
23139 if (!IsStrict)
23140 return SDValue();
23141
23142 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23143 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23144 {Src, Zero, Zero, Zero});
23145 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23146 {Chain, Tmp});
23147 SDValue Chain = Tmp.getValue(1);
23148 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23149 DAG.getIntPtrConstant(0, dl));
23150 return DAG.getMergeValues({Tmp, Chain}, dl);
23151 }
23152
23153 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23154 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23155 DAG.getUNDEF(MVT::v2f32));
23156 if (IsStrict) {
23157 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23158 : X86ISD::STRICT_CVTTP2UI;
23159 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23160 }
23161 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23162 return DAG.getNode(Opc, dl, VT, Tmp);
23163 }
23164
23165 // Generate optimized instructions for pre AVX512 unsigned conversions from
23166 // vXf32 to vXi32.
23167 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23168 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23169 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23170 assert(!IsSigned && "Expected unsigned conversion!");
23171 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23172 }
23173
23174 return SDValue();
23175 }
23176
23177 assert(!VT.isVector());
23178
23179 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23180
23181 if (!IsSigned && UseSSEReg) {
23182 // Conversions from f32/f64 with AVX512 should be legal.
23183 if (Subtarget.hasAVX512())
23184 return Op;
23185
23186 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23187 // behaves on out of range inputs to generate optimized conversions.
23188 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23189 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23190 unsigned DstBits = VT.getScalarSizeInBits();
23191 APInt UIntLimit = APInt::getSignMask(DstBits);
23192 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23193 DAG.getConstant(UIntLimit, dl, VT));
23194 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23195
23196 // Calculate the converted result for values in the range:
23197 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23198 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23199 SDValue Small =
23200 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23201 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23202 SDValue Big = DAG.getNode(
23203 X86ISD::CVTTS2SI, dl, VT,
23204 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23205 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23206
23207 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23208 // and only if the value was out of range. So we can use that
23209 // as our indicator that we should use "Big" instead of "Small".
23210 //
23211 // Use "Small" if "IsOverflown" has all bits cleared
23212 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23213 SDValue IsOverflown = DAG.getNode(
23214 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23215 return DAG.getNode(ISD::OR, dl, VT, Small,
23216 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23217 }
23218
23219 // Use default expansion for i64.
23220 if (VT == MVT::i64)
23221 return SDValue();
23222
23223 assert(VT == MVT::i32 && "Unexpected VT!");
23224
23225 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23226 // FIXME: This does not generate an invalid exception if the input does not
23227 // fit in i32. PR44019
23228 if (Subtarget.is64Bit()) {
23229 if (IsStrict) {
23230 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23231 {Chain, Src});
23232 Chain = Res.getValue(1);
23233 } else
23234 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23235
23236 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23237 if (IsStrict)
23238 return DAG.getMergeValues({Res, Chain}, dl);
23239 return Res;
23240 }
23241
23242 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23243 // use fisttp which will be handled later.
23244 if (!Subtarget.hasSSE3())
23245 return SDValue();
23246 }
23247
23248 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23249 // FIXME: This does not generate an invalid exception if the input does not
23250 // fit in i16. PR44019
23251 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23252 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23253 if (IsStrict) {
23254 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23255 {Chain, Src});
23256 Chain = Res.getValue(1);
23257 } else
23258 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23259
23260 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23261 if (IsStrict)
23262 return DAG.getMergeValues({Res, Chain}, dl);
23263 return Res;
23264 }
23265
23266 // If this is a FP_TO_SINT using SSEReg we're done.
23267 if (UseSSEReg && IsSigned)
23268 return Op;
23269
23270 // fp128 needs to use a libcall.
23271 if (SrcVT == MVT::f128) {
23272 RTLIB::Libcall LC;
23273 if (IsSigned)
23274 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23275 else
23276 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23277
23278 MakeLibCallOptions CallOptions;
23279 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23280 SDLoc(Op), Chain);
23281
23282 if (IsStrict)
23283 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23284
23285 return Tmp.first;
23286 }
23287
23288 // Fall back to X87.
23289 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23290 if (IsStrict)
23291 return DAG.getMergeValues({V, Chain}, dl);
23292 return V;
23293 }
23294
23295 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23295)
;
23296}
23297
23298SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23299 SelectionDAG &DAG) const {
23300 SDValue Src = Op.getOperand(0);
23301 MVT SrcVT = Src.getSimpleValueType();
23302
23303 if (SrcVT == MVT::f16)
23304 return SDValue();
23305
23306 // If the source is in an SSE register, the node is Legal.
23307 if (isScalarFPTypeInSSEReg(SrcVT))
23308 return Op;
23309
23310 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23311}
23312
23313SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23314 SelectionDAG &DAG) const {
23315 EVT DstVT = N->getValueType(0);
23316 SDValue Src = N->getOperand(0);
23317 EVT SrcVT = Src.getValueType();
23318
23319 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23320 // f16 must be promoted before using the lowering in this routine.
23321 // fp128 does not use this lowering.
23322 return SDValue();
23323 }
23324
23325 SDLoc DL(N);
23326 SDValue Chain = DAG.getEntryNode();
23327
23328 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23329
23330 // If we're converting from SSE, the stack slot needs to hold both types.
23331 // Otherwise it only needs to hold the DstVT.
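// For instance (illustrative): lowering llvm.lrint.i64.f64 with the input in
// an SSE register stores the f64 to the slot, reloads it into the x87 stack
// with FLD, and FIST then writes the i64 result back to the same slot, so the
// slot must be large enough for both the f64 source and the i64 result.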
23332 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23333 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23334 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23335 MachinePointerInfo MPI =
23336 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23337
23338 if (UseSSE) {
23339 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23340 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23341 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23342 SDValue Ops[] = { Chain, StackPtr };
23343
23344 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23345 /*Align*/ std::nullopt,
23346 MachineMemOperand::MOLoad);
23347 Chain = Src.getValue(1);
23348 }
23349
23350 SDValue StoreOps[] = { Chain, Src, StackPtr };
23351 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23352 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23353 MachineMemOperand::MOStore);
23354
23355 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23356}
23357
23358SDValue
23359X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23360 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23361 // but making use of X86 specifics to produce better instruction sequences.
23362 SDNode *Node = Op.getNode();
23363 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23364 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23365 SDLoc dl(SDValue(Node, 0));
23366 SDValue Src = Node->getOperand(0);
23367
23368 // There are three types involved here: SrcVT is the source floating point
23369 // type, DstVT is the type of the result, and TmpVT is the result of the
23370 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23371 // DstVT).
23372 EVT SrcVT = Src.getValueType();
23373 EVT DstVT = Node->getValueType(0);
23374 EVT TmpVT = DstVT;
23375
23376 // This code is only for floats and doubles. Fall back to generic code for
23377 // anything else.
23378 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23379 return SDValue();
23380
23381 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23382 unsigned SatWidth = SatVT.getScalarSizeInBits();
23383 unsigned DstWidth = DstVT.getScalarSizeInBits();
23384 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23385 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23386        "Expected saturation width smaller than result width");
23387
23388 // Promote result of FP_TO_*INT to at least 32 bits.
23389 if (TmpWidth < 32) {
23390 TmpVT = MVT::i32;
23391 TmpWidth = 32;
23392 }
23393
23394 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23395 // us to use a native signed conversion instead.
23396 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23397 TmpVT = MVT::i64;
23398 TmpWidth = 64;
23399 }
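// For example (illustrative): a saturating f32 -> u32 conversion on x86-64 can
// run the native signed cvttss2si into a 64-bit temporary, since every value
// in [0, 2^32-1] is representable as a non-negative i64; the clamp and
// truncate below then produce the final u32 result without needing an
// unsigned conversion instruction.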
23400
23401 // If the saturation width is smaller than the size of the temporary result,
23402 // we can always use signed conversion, which is native.
23403 if (SatWidth < TmpWidth)
23404 FpToIntOpcode = ISD::FP_TO_SINT;
23405
23406 // Determine minimum and maximum integer values and their corresponding
23407 // floating-point values.
23408 APInt MinInt, MaxInt;
23409 if (IsSigned) {
23410 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23411 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23412 } else {
23413 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23414 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23415 }
23416
23417 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23418 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23419
23420 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23421 MinInt, IsSigned, APFloat::rmTowardZero);
23422 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23423 MaxInt, IsSigned, APFloat::rmTowardZero);
23424 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23425 && !(MaxStatus & APFloat::opStatus::opInexact);
23426
23427 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23428 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23429
23430 // If the integer bounds are exactly representable as floats, emit a
23431 // min+max+fptoi sequence. Otherwise use comparisons and selects.
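// Example (illustrative): for llvm.fptosi.sat.i8.f32 the bounds -128.0f and
// 127.0f are exact in f32, so the lowering is roughly
//   maxss(-128.0f, src) -> minss(127.0f, ...) -> cvttss2si -> trunc to i8,
// and a NaN source propagates through the clamps, converts to 0x80000000,
// and truncates to 0, matching the required saturating semantics.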
23432 if (AreExactFloatBounds) {
23433 if (DstVT != TmpVT) {
23434 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23435 SDValue MinClamped = DAG.getNode(
23436 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23437 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23438 SDValue BothClamped = DAG.getNode(
23439 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23440 // Convert clamped value to integer.
23441 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23442
23443 // NaN will become INDVAL, with the top bit set and the rest zero.
23444 // Truncation will discard the top bit, resulting in zero.
23445 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23446 }
23447
23448 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23449 SDValue MinClamped = DAG.getNode(
23450 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23451 // Clamp by MaxFloat from above. NaN cannot occur.
23452 SDValue BothClamped = DAG.getNode(
23453 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23454 // Convert clamped value to integer.
23455 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23456
23457 if (!IsSigned) {
23458 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23459 // which is zero.
23460 return FpToInt;
23461 }
23462
23463 // Otherwise, select zero if Src is NaN.
23464 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23465 return DAG.getSelectCC(
23466 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23467 }
23468
23469 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23470 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23471
23472 // Result of direct conversion, which may be selected away.
23473 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23474
23475 if (DstVT != TmpVT) {
23476 // NaN will become INDVAL, with the top bit set and the rest zero.
23477 // Truncation will discard the top bit, resulting in zero.
23478 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23479 }
23480
23481 SDValue Select = FpToInt;
23482 // For signed conversions where we saturate to the same size as the
23483 // result type of the fptoi instructions, INDVAL coincides with integer
23484 // minimum, so we don't need to explicitly check it.
23485 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23486 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23487 // MinInt if Src is NaN.
23488 Select = DAG.getSelectCC(
23489 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23490 }
23491
23492 // If Src OGT MaxFloat, select MaxInt.
23493 Select = DAG.getSelectCC(
23494 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23495
23496 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23497 // is already zero. The promoted case was already handled above.
23498 if (!IsSigned || DstVT != TmpVT) {
23499 return Select;
23500 }
23501
23502 // Otherwise, select 0 if Src is NaN.
23503 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23504 return DAG.getSelectCC(
23505 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23506}
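Aside (not part of the analyzed file): a standalone C++ sketch of why the integer bounds above must be converted with round-toward-zero and checked for exactness before the FMIN/FMAX fast path is taken. Values assume an ordinary IEEE-754 float.

#include <cstdint>
#include <cstdio>

int main() {
  // INT32_MAX (2147483647) is not exactly representable as a float. On a
  // round-to-nearest host it converts to 2147483648.0f, so a naive clamp at
  // that value would let inputs just above INT32_MAX through.
  float NearestMax    = static_cast<float>(INT32_MAX); // typically 2147483648.0f
  float TowardZeroMax = 2147483520.0f;                 // largest float <= INT32_MAX
  std::printf("nearest: %.1f  toward-zero: %.1f\n", NearestMax, TowardZeroMax);
  // Because MaxInt does not convert exactly (opInexact in the code above),
  // AreExactFloatBounds is false for f32 -> i32 and the lowering falls back to
  // the compare+select sequence instead of the min+max+fptoi clamp.
  return 0;
}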
23507
23508SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23509 bool IsStrict = Op->isStrictFPOpcode();
23510
23511 SDLoc DL(Op);
23512 MVT VT = Op.getSimpleValueType();
23513 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23514 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23515 MVT SVT = In.getSimpleValueType();
23516
23517 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23518 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23519 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23520 !Subtarget.getTargetTriple().isOSDarwin()))
23521 return SDValue();
23522
23523 if (SVT == MVT::f16) {
23524 if (Subtarget.hasFP16())
23525 return Op;
23526
23527 if (VT != MVT::f32) {
23528 if (IsStrict)
23529 return DAG.getNode(
23530 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23531 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23532 {MVT::f32, MVT::Other}, {Chain, In})});
23533
23534 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23535 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23536 }
23537
23538 if (!Subtarget.hasF16C()) {
23539 if (!Subtarget.getTargetTriple().isOSDarwin())
23540 return SDValue();
23541
23542       assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23543
23544 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23545 TargetLowering::CallLoweringInfo CLI(DAG);
23546 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23547
23548 In = DAG.getBitcast(MVT::i16, In);
23549 TargetLowering::ArgListTy Args;
23550 TargetLowering::ArgListEntry Entry;
23551 Entry.Node = In;
23552 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23553 Entry.IsSExt = false;
23554 Entry.IsZExt = true;
23555 Args.push_back(Entry);
23556
23557 SDValue Callee = DAG.getExternalSymbol(
23558 getLibcallName(RTLIB::FPEXT_F16_F32),
23559 getPointerTy(DAG.getDataLayout()));
23560 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23561 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23562 std::move(Args));
23563
23564 SDValue Res;
23565 std::tie(Res,Chain) = LowerCallTo(CLI);
23566 if (IsStrict)
23567 Res = DAG.getMergeValues({Res, Chain}, DL);
23568
23569 return Res;
23570 }
23571
23572 In = DAG.getBitcast(MVT::i16, In);
23573 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23574 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23575 DAG.getIntPtrConstant(0, DL));
23576 SDValue Res;
23577 if (IsStrict) {
23578 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23579 {Chain, In});
23580 Chain = Res.getValue(1);
23581 } else {
23582 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23583 DAG.getTargetConstant(4, DL, MVT::i32));
23584 }
23585 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23586 DAG.getIntPtrConstant(0, DL));
23587 if (IsStrict)
23588 return DAG.getMergeValues({Res, Chain}, DL);
23589 return Res;
23590 }
23591
23592 if (!SVT.isVector())
23593 return Op;
23594
23595 if (SVT.getVectorElementType() == MVT::f16) {
23596     assert(Subtarget.hasF16C() && "Unexpected features!");
23597 if (SVT == MVT::v2f16)
23598 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23599 DAG.getUNDEF(MVT::v2f16));
23600 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23601 DAG.getUNDEF(MVT::v4f16));
23602 if (IsStrict)
23603 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23604 {Op->getOperand(0), Res});
23605 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23606 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23607 return Op;
23608 }
23609
23610   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23611
23612 SDValue Res =
23613 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23614 if (IsStrict)
23615 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23616 {Op->getOperand(0), Res});
23617 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23618}
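Aside (illustrative only, not from the listing): the scalar f16 -> f32 sequence above, written with F16C intrinsics. The helper name is made up and the code assumes compilation with -mf16c.

#include <immintrin.h>
#include <cstdint>

float HalfToFloatF16C(uint16_t Half) {
  __m128i V = _mm_cvtsi32_si128(Half); // Half in lane 0, remaining lanes zero
  __m128 F = _mm_cvtph_ps(V);          // VCVTPH2PS: low 4 x f16 -> 4 x f32
  return _mm_cvtss_f32(F);             // extract the lane-0 result
}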
23619
23620SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23621 bool IsStrict = Op->isStrictFPOpcode();
23622
23623 SDLoc DL(Op);
23624 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23625 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23626 MVT VT = Op.getSimpleValueType();
23627 MVT SVT = In.getSimpleValueType();
23628
23629 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23630 return SDValue();
23631
23632 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23633 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23634 if (!Subtarget.getTargetTriple().isOSDarwin())
23635 return SDValue();
23636
23637 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23638 TargetLowering::CallLoweringInfo CLI(DAG);
23639 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23640
23641 TargetLowering::ArgListTy Args;
23642 TargetLowering::ArgListEntry Entry;
23643 Entry.Node = In;
23644 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23645 Entry.IsSExt = false;
23646 Entry.IsZExt = true;
23647 Args.push_back(Entry);
23648
23649 SDValue Callee = DAG.getExternalSymbol(
23650 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23651 : RTLIB::FPROUND_F32_F16),
23652 getPointerTy(DAG.getDataLayout()));
23653 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23654 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23655 std::move(Args));
23656
23657 SDValue Res;
23658 std::tie(Res, Chain) = LowerCallTo(CLI);
23659
23660 Res = DAG.getBitcast(MVT::f16, Res);
23661
23662 if (IsStrict)
23663 Res = DAG.getMergeValues({Res, Chain}, DL);
23664
23665 return Res;
23666 }
23667
23668 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23669 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23670 return SDValue();
23671
23672 if (VT.isVector())
23673 return Op;
23674
23675 SDValue Res;
23676 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23677 MVT::i32);
23678 if (IsStrict) {
23679 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23680 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23681 DAG.getIntPtrConstant(0, DL));
23682 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23683 {Chain, Res, Rnd});
23684 Chain = Res.getValue(1);
23685 } else {
23686 // FIXME: Should we use zeros for upper elements for non-strict?
23687 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23688 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23689 }
23690
23691 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23692 DAG.getIntPtrConstant(0, DL));
23693 Res = DAG.getBitcast(MVT::f16, Res);
23694
23695 if (IsStrict)
23696 return DAG.getMergeValues({Res, Chain}, DL);
23697
23698 return Res;
23699 }
23700
23701 return Op;
23702}
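Aside (illustrative only): the reverse direction used by LowerFP_ROUND, as F16C intrinsics; _MM_FROUND_CUR_DIRECTION is the same rounding immediate (4) that the lowering passes as a target constant. The helper name is hypothetical; assumes -mf16c.

#include <immintrin.h>
#include <cstdint>

uint16_t FloatToHalfF16C(float F) {
  __m128 V  = _mm_set_ss(F);                              // F in lane 0
  __m128i H = _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION);  // VCVTPS2PH
  return static_cast<uint16_t>(_mm_extract_epi16(H, 0));  // low 16-bit lane
}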
23703
23704static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23705 bool IsStrict = Op->isStrictFPOpcode();
23706 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23707   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23708          "Unexpected VT!");
23709
23710 SDLoc dl(Op);
23711 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23712 DAG.getConstant(0, dl, MVT::v8i16), Src,
23713 DAG.getIntPtrConstant(0, dl));
23714
23715 SDValue Chain;
23716 if (IsStrict) {
23717 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23718 {Op.getOperand(0), Res});
23719 Chain = Res.getValue(1);
23720 } else {
23721 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23722 }
23723
23724 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23725 DAG.getIntPtrConstant(0, dl));
23726
23727 if (IsStrict)
23728 return DAG.getMergeValues({Res, Chain}, dl);
23729
23730 return Res;
23731}
23732
23733static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23734 bool IsStrict = Op->isStrictFPOpcode();
23735 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23736   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23737          "Unexpected VT!");
23738
23739 SDLoc dl(Op);
23740 SDValue Res, Chain;
23741 if (IsStrict) {
23742 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23743 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23744 DAG.getIntPtrConstant(0, dl));
23745 Res = DAG.getNode(
23746 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23747 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23748 Chain = Res.getValue(1);
23749 } else {
23750 // FIXME: Should we use zeros for upper elements for non-strict?
23751 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23752 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23753 DAG.getTargetConstant(4, dl, MVT::i32));
23754 }
23755
23756 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23757 DAG.getIntPtrConstant(0, dl));
23758
23759 if (IsStrict)
23760 return DAG.getMergeValues({Res, Chain}, dl);
23761
23762 return Res;
23763}
23764
23765SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23766 SelectionDAG &DAG) const {
23767 SDLoc DL(Op);
23768 MakeLibCallOptions CallOptions;
23769 RTLIB::Libcall LC =
23770 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23771 SDValue Res =
23772 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23773 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23774 DAG.getBitcast(MVT::i32, Res));
23775}
23776
23777/// Depending on uarch and/or optimizing for size, we might prefer to use a
23778/// vector operation in place of the typical scalar operation.
23779static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23780 const X86Subtarget &Subtarget) {
23781 // If both operands have other uses, this is probably not profitable.
23782 SDValue LHS = Op.getOperand(0);
23783 SDValue RHS = Op.getOperand(1);
23784 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23785 return Op;
23786
23787 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23788 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23789 if (IsFP && !Subtarget.hasSSE3())
23790 return Op;
23791 if (!IsFP && !Subtarget.hasSSSE3())
23792 return Op;
23793
23794 // Extract from a common vector.
23795 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23796 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23797 LHS.getOperand(0) != RHS.getOperand(0) ||
23798 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23799 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23800 !shouldUseHorizontalOp(true, DAG, Subtarget))
23801 return Op;
23802
23803 // Allow commuted 'hadd' ops.
23804 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23805 unsigned HOpcode;
23806 switch (Op.getOpcode()) {
23807 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23808 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23809 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23810 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23811 default:
23812     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23813 }
23814 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23815 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23816 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23817 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23818 std::swap(LExtIndex, RExtIndex);
23819
23820 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23821 return Op;
23822
23823 SDValue X = LHS.getOperand(0);
23824 EVT VecVT = X.getValueType();
23825 unsigned BitWidth = VecVT.getSizeInBits();
23826 unsigned NumLanes = BitWidth / 128;
23827 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23828   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23829          "Not expecting illegal vector widths here");
23830
23831 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23832 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23833 SDLoc DL(Op);
23834 if (BitWidth == 256 || BitWidth == 512) {
23835 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23836 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23837 LExtIndex %= NumEltsPerLane;
23838 }
23839
23840 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23841 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23842 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23843 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23844 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23845 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23846 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23847}
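Aside (illustrative only, assumes SSE3): the rewrite above in intrinsic form -- the scalar add of lanes 0 and 1 becomes one HADDPS with the source used for both operands, followed by a lane-0 extract.

#include <immintrin.h>

float AddLowTwoLanes(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X); // lane 0 of the result is X[0] + X[1]
  return _mm_cvtss_f32(H);      // extractelt (hadd X, X), 0
}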
23848
23849/// Depending on uarch and/or optimizing for size, we might prefer to use a
23850/// vector operation in place of the typical scalar operation.
23851SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23852   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23853          "Only expecting float/double");
23854 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23855}
23856
23857/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23858/// This mode isn't supported in hardware on X86. But as long as we aren't
23859/// compiling with trapping math, we can emulate this with
23860/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23861static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23862 SDValue N0 = Op.getOperand(0);
23863 SDLoc dl(Op);
23864 MVT VT = Op.getSimpleValueType();
23865
23866 // N0 += copysign(nextafter(0.5, 0.0), N0)
23867 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23868 bool Ignored;
23869 APFloat Point5Pred = APFloat(0.5f);
23870 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23871 Point5Pred.next(/*nextDown*/true);
23872
23873 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23874 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23875 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23876
23877 // Truncate the result to remove fraction.
23878 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23879}
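Aside (standalone sketch): the emulation named in the comment above, in plain C++, plus the reason nextafter(0.5, 0.0) is used instead of 0.5 itself.

#include <cmath>
#include <cstdio>

double EmulatedRound(double X) {
  // copysign(nextafter(0.5, 0.0), X) is the value just below 0.5 carrying X's sign.
  double Adder = std::copysign(std::nextafter(0.5, 0.0), X);
  return std::trunc(X + Adder);
}

int main() {
  // With plain 0.5, the largest double below 0.5 (0.49999999999999994) would
  // round up: adding 0.5 to it rounds to 1.0 under round-to-nearest-even, so
  // trunc would give 1 instead of 0. The predecessor of 0.5 avoids that.
  std::printf("%g %g %g\n", EmulatedRound(2.5), EmulatedRound(-2.5),
              EmulatedRound(0.49999999999999994)); // 3 -3 0
  return 0;
}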
23880
23881/// The only differences between FABS and FNEG are the mask and the logic op.
23882/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23883static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23884   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23885          "Wrong opcode for lowering FABS or FNEG.");
23886
23887 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23888
23889 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23890 // into an FNABS. We'll lower the FABS after that if it is still in use.
23891 if (IsFABS)
23892 for (SDNode *User : Op->uses())
23893 if (User->getOpcode() == ISD::FNEG)
23894 return Op;
23895
23896 SDLoc dl(Op);
23897 MVT VT = Op.getSimpleValueType();
23898
23899 bool IsF128 = (VT == MVT::f128);
23900   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23901          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23902          "Unexpected type in LowerFABSorFNEG");
23903
23904 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23905 // decide if we should generate a 16-byte constant mask when we only need 4 or
23906 // 8 bytes for the scalar case.
23907
23908 // There are no scalar bitwise logical SSE/AVX instructions, so we
23909 // generate a 16-byte vector constant and logic op even for the scalar case.
23910 // Using a 16-byte mask allows folding the load of the mask with
23911 // the logic op, so it can save (~4 bytes) on code size.
23912 bool IsFakeVector = !VT.isVector() && !IsF128;
23913 MVT LogicVT = VT;
23914 if (IsFakeVector)
23915 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23916 : (VT == MVT::f32) ? MVT::v4f32
23917 : MVT::v8f16;
23918
23919 unsigned EltBits = VT.getScalarSizeInBits();
23920 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23921 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23922 APInt::getSignMask(EltBits);
23923 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23924 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23925
23926 SDValue Op0 = Op.getOperand(0);
23927 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23928 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23929 IsFNABS ? X86ISD::FOR :
23930 X86ISD::FXOR;
23931 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23932
23933 if (VT.isVector() || IsF128)
23934 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23935
23936 // For the scalar case extend to a 128-bit vector, perform the logic op,
23937 // and extract the scalar result back out.
23938 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23939 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23940 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23941 DAG.getIntPtrConstant(0, dl));
23942}
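Aside (standalone sketch): the scalar bit-level equivalents of the mask-based lowering above. The real lowering applies the same 0x7f.../0x80... masks through a 16-byte FAND/FXOR so the constant load can be folded into the logic op.

#include <cstdint>
#include <cstring>

float FAbsBits(float X) {            // FABS: clear the sign bit
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;               // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

float FNegBits(float X) {            // FNEG: flip the sign bit
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;               // APInt::getSignMask(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}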
23943
23944static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23945 SDValue Mag = Op.getOperand(0);
23946 SDValue Sign = Op.getOperand(1);
23947 SDLoc dl(Op);
23948
23949 // If the sign operand is smaller, extend it first.
23950 MVT VT = Op.getSimpleValueType();
23951 if (Sign.getSimpleValueType().bitsLT(VT))
23952 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23953
23954 // And if it is bigger, shrink it first.
23955 if (Sign.getSimpleValueType().bitsGT(VT))
23956 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23957 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23958
23959 // At this point the operands and the result should have the same
23960 // type, and that won't be f80 since that is not custom lowered.
23961 bool IsF128 = (VT == MVT::f128);
23962   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23963          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23964          "Unexpected type in LowerFCOPYSIGN");
23965
23966 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23967
23968 // Perform all scalar logic operations as 16-byte vectors because there are no
23969 // scalar FP logic instructions in SSE.
23970 // TODO: This isn't necessary. If we used scalar types, we might avoid some
23971 // unnecessary splats, but we might miss load folding opportunities. Should
23972 // this decision be based on OptimizeForSize?
23973 bool IsFakeVector = !VT.isVector() && !IsF128;
23974 MVT LogicVT = VT;
23975 if (IsFakeVector)
23976 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23977 : (VT == MVT::f32) ? MVT::v4f32
23978 : MVT::v8f16;
23979
23980 // The mask constants are automatically splatted for vector types.
23981 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23982 SDValue SignMask = DAG.getConstantFP(
23983 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
23984 SDValue MagMask = DAG.getConstantFP(
23985 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
23986
23987 // First, clear all bits but the sign bit from the second operand (sign).
23988 if (IsFakeVector)
23989 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
23990 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
23991
23992 // Next, clear the sign bit from the first operand (magnitude).
23993 // TODO: If we had general constant folding for FP logic ops, this check
23994 // wouldn't be necessary.
23995 SDValue MagBits;
23996 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
23997 APFloat APF = Op0CN->getValueAPF();
23998 APF.clearSign();
23999 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24000 } else {
24001 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24002 if (IsFakeVector)
24003 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24004 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24005 }
24006
24007 // OR the magnitude value with the sign bit.
24008 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24009 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24010 DAG.getIntPtrConstant(0, dl));
24011}
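Aside (standalone sketch): the AND/AND/OR dataflow LowerFCOPYSIGN builds, expressed on scalar f32 bits; the lowering performs the same steps on a 16-byte vector.

#include <cstdint>
#include <cstring>

float CopySignBits(float Mag, float Sign) {
  uint32_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  std::memcpy(&SignBits, &Sign, sizeof(SignBits));
  // Keep only the sign bit of Sign, everything but the sign bit of Mag, then OR.
  uint32_t OutBits = (MagBits & 0x7fffffffu) | (SignBits & 0x80000000u);
  float Out;
  std::memcpy(&Out, &OutBits, sizeof(Out));
  return Out;
}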
24012
24013static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24014 SDValue N0 = Op.getOperand(0);
24015 SDLoc dl(Op);
24016 MVT VT = Op.getSimpleValueType();
24017
24018 MVT OpVT = N0.getSimpleValueType();
24019   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24020          "Unexpected type for FGETSIGN");
24021
24022 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24023 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24024 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24025 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24026 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24027 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24028 return Res;
24029}
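Aside (illustrative only, assumes SSE): the MOVMSK-based sign read that LowerFGETSIGN emits, as intrinsics; the helper is hypothetical.

#include <immintrin.h>

int GetSignBit(float X) {
  __m128 V = _mm_set_ss(X);      // X in lane 0, zeros elsewhere
  return _mm_movemask_ps(V) & 1; // MOVMSKPS packs lane sign bits; keep lane 0
}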
24030
24031/// Helper for attempting to create a X86ISD::BT node.
24032static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24033 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24034 // instruction. Since the shift amount is in-range-or-undefined, we know
24035 // that doing a bittest on the i32 value is ok. We extend to i32 because
24036 // the encoding for the i16 version is larger than the i32 version.
24037 // Also promote i16 to i32 for performance / code size reason.
24038 if (Src.getValueType().getScalarSizeInBits() < 32)
24039 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24040
24041 // No legal type found, give up.
24042 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24043 return SDValue();
24044
24045 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24046 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24047 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24048 // known to be zero.
24049 if (Src.getValueType() == MVT::i64 &&
24050 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24051 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24052
24053 // If the operand types disagree, extend the shift amount to match. Since
24054 // BT ignores high bits (like shifts) we can use anyextend.
24055 if (Src.getValueType() != BitNo.getValueType()) {
24056 // Peek through a mask/modulo operation.
24057 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24058 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24059 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24060 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24061 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24062 BitNo.getOperand(0)),
24063 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24064 BitNo.getOperand(1)));
24065 else
24066 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24067 }
24068
24069 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24070}
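Aside (standalone sketch): the BT indexing rule the i64 -> i32 narrowing above relies on. The register forms of BT reduce the bit index modulo the operand width, so the two widths agree exactly when bit 5 of the index is known to be zero.

#include <cstdint>

bool BitTest64(uint64_t Src, uint64_t BitNo) {
  return (Src >> (BitNo & 63)) & 1; // BT r64, r64: index taken modulo 64
}

bool BitTest32(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo & 31)) & 1; // BT r32, r32: index taken modulo 32
}

// If (BitNo & 32) == 0 then BitNo % 64 == BitNo % 32 < 32, so the tested bit
// lives in the low 32 bits of Src and BitTest32(uint32_t(Src), BitNo) gives
// the same answer as BitTest64(Src, BitNo) with a shorter encoding.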
24071
24072/// Helper for creating a X86ISD::SETCC node.
24073static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24074 SelectionDAG &DAG) {
24075 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24076 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24077}
24078
24079/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24080/// recognizable memcmp expansion.
24081static bool isOrXorXorTree(SDValue X, bool Root = true) {
24082 if (X.getOpcode() == ISD::OR)
24083 return isOrXorXorTree(X.getOperand(0), false) &&
24084 isOrXorXorTree(X.getOperand(1), false);
24085 if (Root)
24086 return false;
24087 return X.getOpcode() == ISD::XOR;
24088}
24089
24090/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24091/// expansion.
24092template <typename F>
24093static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24094 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24095 SDValue Op0 = X.getOperand(0);
24096 SDValue Op1 = X.getOperand(1);
24097 if (X.getOpcode() == ISD::OR) {
24098 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24099 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24100 if (VecVT != CmpVT)
24101 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24102 if (HasPT)
24103 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24104 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24105 }
24106 if (X.getOpcode() == ISD::XOR) {
24107 SDValue A = SToV(Op0);
24108 SDValue B = SToV(Op1);
24109 if (VecVT != CmpVT)
24110 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24111 if (HasPT)
24112 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24113 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24114 }
24115   llvm_unreachable("Impossible");
24116}
24117
24118/// Try to map a 128-bit or larger integer comparison to vector instructions
24119/// before type legalization splits it up into chunks.
24120static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24121 ISD::CondCode CC,
24122 const SDLoc &DL,
24123 SelectionDAG &DAG,
24124 const X86Subtarget &Subtarget) {
24125   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24126
24127 // We're looking for an oversized integer equality comparison.
24128 EVT OpVT = X.getValueType();
24129 unsigned OpSize = OpVT.getSizeInBits();
24130 if (!OpVT.isScalarInteger() || OpSize < 128)
24131 return SDValue();
24132
24133 // Ignore a comparison with zero because that gets special treatment in
24134 // EmitTest(). But make an exception for the special case of a pair of
24135 // logically-combined vector-sized operands compared to zero. This pattern may
24136 // be generated by the memcmp expansion pass with oversized integer compares
24137 // (see PR33325).
24138 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24139 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24140 return SDValue();
24141
24142 // Don't perform this combine if constructing the vector will be expensive.
24143 auto IsVectorBitCastCheap = [](SDValue X) {
24144 X = peekThroughBitcasts(X);
24145 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24146 X.getOpcode() == ISD::LOAD;
24147 };
24148 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24149 !IsOrXorXorTreeCCZero)
24150 return SDValue();
24151
24152 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24153 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24154 // Otherwise use PCMPEQ (plus AND) and mask testing.
24155 bool NoImplicitFloatOps =
24156 DAG.getMachineFunction().getFunction().hasFnAttribute(
24157 Attribute::NoImplicitFloat);
24158 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24159 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24160 (OpSize == 256 && Subtarget.hasAVX()) ||
24161 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24162 bool HasPT = Subtarget.hasSSE41();
24163
24164 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
24165 // vector registers are essentially free. (Technically, widening registers
24166 // prevents load folding, but the tradeoff is worth it.)
24167 bool PreferKOT = Subtarget.preferMaskRegisters();
24168 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24169
24170 EVT VecVT = MVT::v16i8;
24171 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24172 if (OpSize == 256) {
24173 VecVT = MVT::v32i8;
24174 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24175 }
24176 EVT CastVT = VecVT;
24177 bool NeedsAVX512FCast = false;
24178 if (OpSize == 512 || NeedZExt) {
24179 if (Subtarget.hasBWI()) {
24180 VecVT = MVT::v64i8;
24181 CmpVT = MVT::v64i1;
24182 if (OpSize == 512)
24183 CastVT = VecVT;
24184 } else {
24185 VecVT = MVT::v16i32;
24186 CmpVT = MVT::v16i1;
24187 CastVT = OpSize == 512 ? VecVT
24188 : OpSize == 256 ? MVT::v8i32
24189 : MVT::v4i32;
24190 NeedsAVX512FCast = true;
24191 }
24192 }
24193
24194 auto ScalarToVector = [&](SDValue X) -> SDValue {
24195 bool TmpZext = false;
24196 EVT TmpCastVT = CastVT;
24197 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24198 SDValue OrigX = X.getOperand(0);
24199 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24200 if (OrigSize < OpSize) {
24201 if (OrigSize == 128) {
24202 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24203 X = OrigX;
24204 TmpZext = true;
24205 } else if (OrigSize == 256) {
24206 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24207 X = OrigX;
24208 TmpZext = true;
24209 }
24210 }
24211 }
24212 X = DAG.getBitcast(TmpCastVT, X);
24213 if (!NeedZExt && !TmpZext)
24214 return X;
24215 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24216 DAG.getConstant(0, DL, VecVT), X,
24217 DAG.getVectorIdxConstant(0, DL));
24218 };
24219
24220 SDValue Cmp;
24221 if (IsOrXorXorTreeCCZero) {
24222 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24223 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24224 // Use 2 vector equality compares and 'and' the results before doing a
24225 // MOVMSK.
24226 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24227 } else {
24228 SDValue VecX = ScalarToVector(X);
24229 SDValue VecY = ScalarToVector(Y);
24230 if (VecVT != CmpVT) {
24231 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24232 } else if (HasPT) {
24233 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24234 } else {
24235 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24236 }
24237 }
24238 // AVX512 should emit a setcc that will lower to kortest.
24239 if (VecVT != CmpVT) {
24240 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24241 : CmpVT == MVT::v32i1 ? MVT::i32
24242 : MVT::i16;
24243 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24244 DAG.getConstant(0, DL, KRegVT), CC);
24245 }
24246 if (HasPT) {
24247 SDValue BCCmp =
24248 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24249 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24250 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24251 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24252 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24253 }
24254 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24255 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24256 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24257     assert(Cmp.getValueType() == MVT::v16i8 &&
24258            "Non 128-bit vector on pre-SSE41 target");
24259 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24260 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24261 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24262 }
24263
24264 return SDValue();
24265}
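Aside (illustrative only, assumes SSE2): the pre-SSE4.1 fallback shape named in the comments above -- a 16-byte equality compare via PCMPEQB and PMOVMSKB tested against 0xFFFF. The helper is hypothetical.

#include <immintrin.h>

bool Equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // 0xFF in each byte lane that matches
  return _mm_movemask_epi8(Eq) == 0xFFFF; // all 16 bytes equal
}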
24266
24267/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24268/// style scalarized (associative) reduction patterns. Partial reductions
24269/// are supported when the pointer SrcMask is non-null.
24270/// TODO - move this to SelectionDAG?
24271static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24272 SmallVectorImpl<SDValue> &SrcOps,
24273 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24274 SmallVector<SDValue, 8> Opnds;
24275 DenseMap<SDValue, APInt> SrcOpMap;
24276 EVT VT = MVT::Other;
24277
24278 // Recognize a special case where a vector is casted into wide integer to
24279 // test all 0s.
24280   assert(Op.getOpcode() == unsigned(BinOp) &&
24281          "Unexpected bit reduction opcode");
24282 Opnds.push_back(Op.getOperand(0));
24283 Opnds.push_back(Op.getOperand(1));
24284
24285 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24286 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24287 // BFS traverse all BinOp operands.
24288 if (I->getOpcode() == unsigned(BinOp)) {
24289 Opnds.push_back(I->getOperand(0));
24290 Opnds.push_back(I->getOperand(1));
24291 // Re-evaluate the number of nodes to be traversed.
24292 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24293 continue;
24294 }
24295
24296 // Quit if a non-EXTRACT_VECTOR_ELT
24297 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24298 return false;
24299
24300 // Quit if without a constant index.
24301 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24302 if (!Idx)
24303 return false;
24304
24305 SDValue Src = I->getOperand(0);
24306 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24307 if (M == SrcOpMap.end()) {
24308 VT = Src.getValueType();
24309 // Quit if not the same type.
24310 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24311 return false;
24312 unsigned NumElts = VT.getVectorNumElements();
24313 APInt EltCount = APInt::getZero(NumElts);
24314 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24315 SrcOps.push_back(Src);
24316 }
24317
24318 // Quit if element already used.
24319 unsigned CIdx = Idx->getZExtValue();
24320 if (M->second[CIdx])
24321 return false;
24322 M->second.setBit(CIdx);
24323 }
24324
24325 if (SrcMask) {
24326 // Collect the source partial masks.
24327 for (SDValue &SrcOp : SrcOps)
24328 SrcMask->push_back(SrcOpMap[SrcOp]);
24329 } else {
24330 // Quit if not all elements are used.
24331 for (const auto &I : SrcOpMap)
24332 if (!I.second.isAllOnes())
24333 return false;
24334 }
24335
24336 return true;
24337}
24338
24339// Helper function for comparing all bits of two vectors.
24340static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24341 ISD::CondCode CC, const APInt &OriginalMask,
24342 const X86Subtarget &Subtarget,
24343 SelectionDAG &DAG, X86::CondCode &X86CC) {
24344 EVT VT = LHS.getValueType();
24345 unsigned ScalarSize = VT.getScalarSizeInBits();
24346 if (OriginalMask.getBitWidth() != ScalarSize) {
24347     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24348 return SDValue();
24349 }
24350
24351   // Quit if not convertible to a legal scalar or 128/256-bit vector.
24352 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24353 return SDValue();
24354
24355 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24356 if (VT.isFloatingPoint())
24357 return SDValue();
24358
24359   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24360 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24361
24362 APInt Mask = OriginalMask;
24363
24364 auto MaskBits = [&](SDValue Src) {
24365 if (Mask.isAllOnes())
24366 return Src;
24367 EVT SrcVT = Src.getValueType();
24368 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24369 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24370 };
24371
24372 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24373 if (VT.getSizeInBits() < 128) {
24374 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24375 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24376 if (IntVT != MVT::i64)
24377 return SDValue();
24378 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24379 MVT::i32, MVT::i32);
24380 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24381 MVT::i32, MVT::i32);
24382 SDValue Lo =
24383 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24384 SDValue Hi =
24385 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24386 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24387 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24388 DAG.getConstant(0, DL, MVT::i32));
24389 }
24390 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24391 DAG.getBitcast(IntVT, MaskBits(LHS)),
24392 DAG.getBitcast(IntVT, MaskBits(RHS)));
24393 }
24394
24395 // Without PTEST, a masked v2i64 or-reduction is not faster than
24396 // scalarization.
24397 bool UseKORTEST = Subtarget.useAVX512Regs();
24398 bool UsePTEST = Subtarget.hasSSE41();
24399 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24400 return SDValue();
24401
24402 // Split down to 128/256/512-bit vector.
24403 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24404
24405 // If the input vector has vector elements wider than the target test size,
24406 // then cast to <X x i64> so it will safely split.
24407 if (ScalarSize > TestSize) {
24408 if (!Mask.isAllOnes())
24409 return SDValue();
24410 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24411 LHS = DAG.getBitcast(VT, LHS);
24412 RHS = DAG.getBitcast(VT, RHS);
24413 Mask = APInt::getAllOnes(64);
24414 }
24415
24416 if (VT.getSizeInBits() > TestSize) {
24417 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24418 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24419 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24420 while (VT.getSizeInBits() > TestSize) {
24421 auto Split = DAG.SplitVector(LHS, DL);
24422 VT = Split.first.getValueType();
24423 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24424 }
24425 RHS = DAG.getAllOnesConstant(DL, VT);
24426 } else if (!UsePTEST && !KnownRHS.isZero()) {
24427 // MOVMSK Special Case:
24428 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24429 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24430 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24431 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24432 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24433 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24434 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24435 V = DAG.getSExtOrTrunc(V, DL, VT);
24436 while (VT.getSizeInBits() > TestSize) {
24437 auto Split = DAG.SplitVector(V, DL);
24438 VT = Split.first.getValueType();
24439 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24440 }
24441 V = DAG.getNOT(DL, V, VT);
24442 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24443 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24444 DAG.getConstant(0, DL, MVT::i32));
24445 } else {
24446 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24447 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24448 while (VT.getSizeInBits() > TestSize) {
24449 auto Split = DAG.SplitVector(V, DL);
24450 VT = Split.first.getValueType();
24451 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24452 }
24453 LHS = V;
24454 RHS = DAG.getConstant(0, DL, VT);
24455 }
24456 }
24457
24458 if (UseKORTEST && VT.is512BitVector()) {
24459 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24460 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24461 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24462 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24463 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24464 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24465 }
24466
24467 if (UsePTEST) {
24468 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24469 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24470 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24471 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24472 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24473 }
24474
24475   assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24476 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24477 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24478 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24479 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24480 V = DAG.getNOT(DL, V, MaskVT);
24481 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24482 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24483 DAG.getConstant(0, DL, MVT::i32));
24484}
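Aside (illustrative only, assumes SSE4.1): the XOR + PTEST shape preferred above when UsePTEST is true -- the operands are equal exactly when their XOR tests as all zeros.

#include <immintrin.h>

bool Equal128PTEST(__m128i A, __m128i B) {
  __m128i Diff = _mm_xor_si128(A, B);
  return _mm_testz_si128(Diff, Diff) != 0; // ZF set <=> Diff is all-zero
}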
24485
24486// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
24487// to CMP(MOVMSK(PCMPEQB(X,Y))).
24488static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24489 ISD::CondCode CC, const SDLoc &DL,
24490 const X86Subtarget &Subtarget,
24491 SelectionDAG &DAG,
24492 X86::CondCode &X86CC) {
24493   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24494
24495 bool CmpNull = isNullConstant(RHS);
24496 bool CmpAllOnes = isAllOnesConstant(RHS);
24497 if (!CmpNull && !CmpAllOnes)
24498 return SDValue();
24499
24500 SDValue Op = LHS;
24501 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24502 return SDValue();
24503
24504 // Check whether we're masking/truncating an OR-reduction result, in which
24505 // case track the masked bits.
24506 // TODO: Add CmpAllOnes support.
24507 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24508 if (CmpNull) {
24509 switch (Op.getOpcode()) {
24510 case ISD::TRUNCATE: {
24511 SDValue Src = Op.getOperand(0);
24512 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24513 Op.getScalarValueSizeInBits());
24514 Op = Src;
24515 break;
24516 }
24517 case ISD::AND: {
24518 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24519 Mask = Cst->getAPIntValue();
24520 Op = Op.getOperand(0);
24521 }
24522 break;
24523 }
24524 }
24525 }
24526
24527 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24528
24529 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24530 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24531 SmallVector<SDValue, 8> VecIns;
24532 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24533 EVT VT = VecIns[0].getValueType();
24534     assert(llvm::all_of(VecIns,
24535                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
24536            "Reduction source vector mismatch");
24537
24538 // Quit if not splittable to scalar/128/256/512-bit vector.
24539 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24540 return SDValue();
24541
24542 // If more than one full vector is evaluated, AND/OR them first before
24543 // PTEST.
24544 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24545 Slot += 2, e += 1) {
24546 // Each iteration will AND/OR 2 nodes and append the result until there is
24547 // only 1 node left, i.e. the final value of all vectors.
24548 SDValue LHS = VecIns[Slot];
24549 SDValue RHS = VecIns[Slot + 1];
24550 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24551 }
24552
24553 return LowerVectorAllEqual(DL, VecIns.back(),
24554 CmpNull ? DAG.getConstant(0, DL, VT)
24555 : DAG.getAllOnesConstant(DL, VT),
24556 CC, Mask, Subtarget, DAG, X86CC);
24557 }
24558
24559 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24560 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24561 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24562 ISD::NodeType BinOp;
24563 if (SDValue Match =
24564 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24565 EVT MatchVT = Match.getValueType();
24566 return LowerVectorAllEqual(DL, Match,
24567 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24568 : DAG.getAllOnesConstant(DL, MatchVT),
24569 CC, Mask, Subtarget, DAG, X86CC);
24570 }
24571 }
24572
24573 if (Mask.isAllOnes()) {
24574     assert(!Op.getValueType().isVector() &&
24575            "Illegal vector type for reduction pattern");
24576 SDValue Src = peekThroughBitcasts(Op);
24577 if (Src.getValueType().isFixedLengthVector() &&
24578 Src.getValueType().getScalarType() == MVT::i1) {
24579 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24580 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24581 if (Src.getOpcode() == ISD::SETCC) {
24582 SDValue LHS = Src.getOperand(0);
24583 SDValue RHS = Src.getOperand(1);
24584 EVT LHSVT = LHS.getValueType();
24585 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24586 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24587 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24588 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24589 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24590 X86CC);
24591 }
24592 }
24593 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24594 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24595 // Peek through truncation, mask the LSB and compare against zero/LSB.
24596 if (Src.getOpcode() == ISD::TRUNCATE) {
24597 SDValue Inner = Src.getOperand(0);
24598 EVT InnerVT = Inner.getValueType();
24599 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24600 unsigned BW = InnerVT.getScalarSizeInBits();
24601 APInt SrcMask = APInt(BW, 1);
24602 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24603 return LowerVectorAllEqual(DL, Inner,
24604 DAG.getConstant(Cmp, DL, InnerVT), CC,
24605 SrcMask, Subtarget, DAG, X86CC);
24606 }
24607 }
24608 }
24609 }
24610
24611 return SDValue();
24612}
24613
24614/// Return true if \c Op has a use that doesn't just read flags.
24615static bool hasNonFlagsUse(SDValue Op) {
24616 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24617 ++UI) {
24618 SDNode *User = *UI;
24619 unsigned UOpNo = UI.getOperandNo();
24620 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24621       // Look past the truncate.
24622 UOpNo = User->use_begin().getOperandNo();
24623 User = *User->use_begin();
24624 }
24625
24626 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24627 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24628 return true;
24629 }
24630 return false;
24631}
24632
24633// Transform to an x86-specific ALU node with flags if there is a chance of
24634// using an RMW op or only the flags are used. Otherwise, leave
24635// the node alone and emit a 'cmp' or 'test' instruction.
24636static bool isProfitableToUseFlagOp(SDValue Op) {
24637 for (SDNode *U : Op->uses())
24638 if (U->getOpcode() != ISD::CopyToReg &&
24639 U->getOpcode() != ISD::SETCC &&
24640 U->getOpcode() != ISD::STORE)
24641 return false;
24642
24643 return true;
24644}
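// Illustrative sketch, not part of the LLVM source: the heuristic above keeps
// the flag-producing form only when every user is a store, a register copy or
// a flag read. A typical C-level shape it targets is shown below, where the
// flags of the subtraction can feed the zero test directly (often folding
// into a sub-and-branch) instead of requiring a separate cmp/test:
static bool decrementAndTest(int &Counter, int Step) {
  Counter -= Step;     // value is stored back (read-modify-write candidate)...
  return Counter == 0; // ...and otherwise only the flags of the sub are read
}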
24645
24646/// Emit nodes that will be selected as "test Op0,Op0", or something
24647/// equivalent.
24648static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24649 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24650 // CF and OF aren't always set the way we want. Determine which
24651 // of these we need.
24652 bool NeedCF = false;
24653 bool NeedOF = false;
24654 switch (X86CC) {
24655 default: break;
24656 case X86::COND_A: case X86::COND_AE:
24657 case X86::COND_B: case X86::COND_BE:
24658 NeedCF = true;
24659 break;
24660 case X86::COND_G: case X86::COND_GE:
24661 case X86::COND_L: case X86::COND_LE:
24662 case X86::COND_O: case X86::COND_NO: {
24663 // Check if we really need to set the
24664 // Overflow flag. If NoSignedWrap is present
24665 // that is not actually needed.
24666 switch (Op->getOpcode()) {
24667 case ISD::ADD:
24668 case ISD::SUB:
24669 case ISD::MUL:
24670 case ISD::SHL:
24671 if (Op.getNode()->getFlags().hasNoSignedWrap())
24672 break;
24673 [[fallthrough]];
24674 default:
24675 NeedOF = true;
24676 break;
24677 }
24678 break;
24679 }
24680 }
24681 // See if we can use the EFLAGS value from the operand instead of
24682 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24683 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24684 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24685 // Emit a CMP with 0, which is the TEST pattern.
24686 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24687 DAG.getConstant(0, dl, Op.getValueType()));
24688 }
24689 unsigned Opcode = 0;
24690 unsigned NumOperands = 0;
24691
24692 SDValue ArithOp = Op;
24693
24694 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24695 // which may be the result of a CAST. We use the variable 'Op', which is the
24696 // non-casted variable when we check for possible users.
24697 switch (ArithOp.getOpcode()) {
24698 case ISD::AND:
24699 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24700 // because a TEST instruction will be better.
24701 if (!hasNonFlagsUse(Op))
24702 break;
24703
24704 [[fallthrough]];
24705 case ISD::ADD:
24706 case ISD::SUB:
24707 case ISD::OR:
24708 case ISD::XOR:
24709 if (!isProfitableToUseFlagOp(Op))
24710 break;
24711
24712 // Otherwise use a regular EFLAGS-setting instruction.
24713 switch (ArithOp.getOpcode()) {
24714     default: llvm_unreachable("unexpected operator!");
24715 case ISD::ADD: Opcode = X86ISD::ADD; break;
24716 case ISD::SUB: Opcode = X86ISD::SUB; break;
24717 case ISD::XOR: Opcode = X86ISD::XOR; break;
24718 case ISD::AND: Opcode = X86ISD::AND; break;
24719 case ISD::OR: Opcode = X86ISD::OR; break;
24720 }
24721
24722 NumOperands = 2;
24723 break;
24724 case X86ISD::ADD:
24725 case X86ISD::SUB:
24726 case X86ISD::OR:
24727 case X86ISD::XOR:
24728 case X86ISD::AND:
24729 return SDValue(Op.getNode(), 1);
24730 case ISD::SSUBO:
24731 case ISD::USUBO: {
24732     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24733 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24734 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24735 Op->getOperand(1)).getValue(1);
24736 }
24737 default:
24738 break;
24739 }
24740
24741 if (Opcode == 0) {
24742 // Emit a CMP with 0, which is the TEST pattern.
24743 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24744 DAG.getConstant(0, dl, Op.getValueType()));
24745 }
24746 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24747 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24748
24749 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24750 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24751 return SDValue(New.getNode(), 1);
24752}
24753
24754/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24755/// equivalent.
24756static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24757 const SDLoc &dl, SelectionDAG &DAG,
24758 const X86Subtarget &Subtarget) {
24759 if (isNullConstant(Op1))
24760 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24761
24762 EVT CmpVT = Op0.getValueType();
24763
24764   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24765           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24766
24767 // Only promote the compare up to I32 if it is a 16 bit operation
24768 // with an immediate. 16 bit immediates are to be avoided.
24769 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24770 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24771 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24772 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24773 // Don't do this if the immediate can fit in 8-bits.
24774 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24775 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24776 unsigned ExtendOp =
24777 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24778 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24779 // For equality comparisons try to use SIGN_EXTEND if the input was
24780 // truncate from something with enough sign bits.
24781 if (Op0.getOpcode() == ISD::TRUNCATE) {
24782 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24783 ExtendOp = ISD::SIGN_EXTEND;
24784 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24785 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24786 ExtendOp = ISD::SIGN_EXTEND;
24787 }
24788 }
24789
24790 CmpVT = MVT::i32;
24791 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24792 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24793 }
24794 }
24795
24796 // Try to shrink i64 compares if the input has enough zero bits.
24797 // FIXME: Do this for non-constant compares for constant on LHS?
24798 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24799 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24800 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24801 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24802 CmpVT = MVT::i32;
24803 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24804 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24805 }
24806
24807 // 0-x == y --> x+y == 0
24808 // 0-x != y --> x+y != 0
24809 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24810 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24811 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24812 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24813 return Add.getValue(1);
24814 }
24815
24816 // x == 0-y --> x+y == 0
24817 // x != 0-y --> x+y != 0
24818 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24819 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24820 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24821 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24822 return Add.getValue(1);
24823 }
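// Illustrative sketch, not part of the LLVM source: for EQ/NE the two folds
// above are exact in wrapping two's-complement arithmetic, as a quick scalar
// check makes visible (this hypothetical helper returns true for every X, Y):
#include <cstdint>

static bool negCompareFoldHolds(uint32_t X, uint32_t Y) {
  bool Original = (0u - X) == Y; // 0-x == y
  bool Folded = (X + Y) == 0u;   // x+y == 0
  return Original == Folded;
}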
24824
24825 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24826 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24827 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24828 return Sub.getValue(1);
24829}
24830
24831/// Check if replacement of SQRT with RSQRT should be disabled.
24832bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24833 EVT VT = Op.getValueType();
24834
24835 // We don't need to replace SQRT with RSQRT for half type.
24836 if (VT.getScalarType() == MVT::f16)
24837 return true;
24838
24839 // We never want to use both SQRT and RSQRT instructions for the same input.
24840 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24841 return false;
24842
24843 if (VT.isVector())
24844 return Subtarget.hasFastVectorFSQRT();
24845 return Subtarget.hasFastScalarFSQRT();
24846}
24847
24848/// The minimum architected relative accuracy is 2^-12. We need one
24849/// Newton-Raphson step to have a good float result (24 bits of precision).
24850SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24851 SelectionDAG &DAG, int Enabled,
24852 int &RefinementSteps,
24853 bool &UseOneConstNR,
24854 bool Reciprocal) const {
24855 SDLoc DL(Op);
24856 EVT VT = Op.getValueType();
24857
24858 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24859 // It is likely not profitable to do this for f64 because a double-precision
24860 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24861 // instructions: convert to single, rsqrtss, convert back to double, refine
24862 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24863 // along with FMA, this could be a throughput win.
24864 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24865 // after legalize types.
24866 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24867 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24868 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24869 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24870 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24871 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24872 RefinementSteps = 1;
24873
24874 UseOneConstNR = false;
24875 // There is no FSQRT for 512-bits, but there is RSQRT14.
24876 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24877 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24878 if (RefinementSteps == 0 && !Reciprocal)
24879 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24880 return Estimate;
24881 }
24882
24883 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24884 Subtarget.hasFP16()) {
24885     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24886 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24887 RefinementSteps = 0;
24888
24889 if (VT == MVT::f16) {
24890 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24891 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24892 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24893 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24895 }
24896
24897 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24898 }
24899 return SDValue();
24900}
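// Illustrative sketch, not part of the LLVM source: the hardware estimate
// returned above is only guaranteed to about 12 bits; the surrounding
// target-independent expansion refines it. One step of the standard
// Newton-Raphson iteration for 1/sqrt(A) roughly doubles the correct bits:
static float refineRsqrtOnce(float A, float Est /* ~1/sqrt(A), ~12-bit */) {
  // e' = e * (1.5 - 0.5 * A * e * e); a 2^-12 relative error becomes ~2^-24.
  return Est * (1.5f - 0.5f * A * Est * Est);
}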
24901
24902/// The minimum architected relative accuracy is 2^-12. We need one
24903/// Newton-Raphson step to have a good float result (24 bits of precision).
24904SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24905 int Enabled,
24906 int &RefinementSteps) const {
24907 SDLoc DL(Op);
24908 EVT VT = Op.getValueType();
24909
24910 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24911 // It is likely not profitable to do this for f64 because a double-precision
24912 // reciprocal estimate with refinement on x86 prior to FMA requires
24913 // 15 instructions: convert to single, rcpss, convert back to double, refine
24914 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24915 // along with FMA, this could be a throughput win.
24916
24917 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24918 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24919 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24920 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24921 // Enable estimate codegen with 1 refinement step for vector division.
24922 // Scalar division estimates are disabled because they break too much
24923 // real-world code. These defaults are intended to match GCC behavior.
24924 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24925 return SDValue();
24926
24927 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24928 RefinementSteps = 1;
24929
24930 // There is no FSQRT for 512-bits, but there is RCP14.
24931 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24932 return DAG.getNode(Opcode, DL, VT, Op);
24933 }
24934
24935 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24936 Subtarget.hasFP16()) {
24937 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24938 RefinementSteps = 0;
24939
24940 if (VT == MVT::f16) {
24941 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24942 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24943 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24944 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24946 }
24947
24948 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24949 }
24950 return SDValue();
24951}
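// Illustrative sketch, not part of the LLVM source: the same refinement idea
// applies to the reciprocal estimate; one standard Newton-Raphson step is:
static float refineRecipOnce(float A, float Est /* ~1/A, ~12-bit */) {
  // e' = e * (2 - A * e); the relative error is roughly squared per step.
  return Est * (2.0f - A * Est);
}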
24952
24953/// If we have at least two divisions that use the same divisor, convert to
24954/// multiplication by a reciprocal. This may need to be adjusted for a given
24955/// CPU if a division's cost is not at least twice the cost of a multiplication.
24956/// This is because we still need one division to calculate the reciprocal and
24957/// then we need two multiplies by that reciprocal as replacements for the
24958/// original divisions.
24959unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24960 return 2;
24961}
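// Illustrative sketch, not part of the LLVM source: the threshold of 2 is the
// break-even point described above. With two divisions sharing a divisor, one
// divide plus two multiplies replaces two divides (this rewrite is only done
// under reciprocal-allowed/fast-math semantics):
static float sumOfQuotients(float A, float B, float D) {
  float Recip = 1.0f / D;       // one division...
  return A * Recip + B * Recip; // ...and two multiplies instead of A/D + B/D
}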
24962
24963SDValue
24964X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24965 SelectionDAG &DAG,
24966 SmallVectorImpl<SDNode *> &Created) const {
24967 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24968 if (isIntDivCheap(N->getValueType(0), Attr))
24969 return SDValue(N,0); // Lower SDIV as SDIV
24970
24971   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
24972          "Unexpected divisor!");
24973
24974 // Only perform this transform if CMOV is supported otherwise the select
24975 // below will become a branch.
24976 if (!Subtarget.canUseCMOV())
24977 return SDValue();
24978
24979 // fold (sdiv X, pow2)
24980 EVT VT = N->getValueType(0);
24981 // FIXME: Support i8.
24982 if (VT != MVT::i16 && VT != MVT::i32 &&
24983 !(Subtarget.is64Bit() && VT == MVT::i64))
24984 return SDValue();
24985
24986 unsigned Lg2 = Divisor.countr_zero();
24987
24988 // If the divisor is 2 or -2, the default expansion is better.
24989 if (Lg2 == 1)
24990 return SDValue();
24991
24992 SDLoc DL(N);
24993 SDValue N0 = N->getOperand(0);
24994 SDValue Zero = DAG.getConstant(0, DL, VT);
24995 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
24996 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
24997
24998 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
24999 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25000 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25001 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25002
25003 Created.push_back(Cmp.getNode());
25004 Created.push_back(Add.getNode());
25005 Created.push_back(CMov.getNode());
25006
25007 // Divide by pow2.
25008 SDValue SRA =
25009 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25010
25011 // If we're dividing by a positive value, we're done. Otherwise, we must
25012 // negate the result.
25013 if (Divisor.isNonNegative())
25014 return SRA;
25015
25016 Created.push_back(SRA.getNode());
25017 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25018}
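// Illustrative sketch, not part of the LLVM source: the select/shift sequence
// built above gives round-toward-zero signed division, which a plain
// arithmetic shift does not provide for negative inputs. Scalar equivalent
// for a divisor of 4 (Lg2 = 2):
static int sdivBy4(int X) {
  int Biased = (X < 0) ? X + 3 : X; // add (Pow2 - 1) only when X is negative
  return Biased >> 2;               // X = -7: (-7 + 3) >> 2 == -1 == -7 / 4,
                                    // whereas -7 >> 2 alone would give -2
}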
25019
25020/// Result of 'and' is compared against zero. Change to a BT node if possible.
25021/// Returns the BT node and the condition code needed to use it.
25022static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25023 SelectionDAG &DAG, X86::CondCode &X86CC) {
25024   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25025 SDValue Op0 = And.getOperand(0);
25026 SDValue Op1 = And.getOperand(1);
25027 if (Op0.getOpcode() == ISD::TRUNCATE)
25028 Op0 = Op0.getOperand(0);
25029 if (Op1.getOpcode() == ISD::TRUNCATE)
25030 Op1 = Op1.getOperand(0);
25031
25032 SDValue Src, BitNo;
25033 if (Op1.getOpcode() == ISD::SHL)
25034 std::swap(Op0, Op1);
25035 if (Op0.getOpcode() == ISD::SHL) {
25036 if (isOneConstant(Op0.getOperand(0))) {
25037 // If we looked past a truncate, check that it's only truncating away
25038 // known zeros.
25039 unsigned BitWidth = Op0.getValueSizeInBits();
25040 unsigned AndBitWidth = And.getValueSizeInBits();
25041 if (BitWidth > AndBitWidth) {
25042 KnownBits Known = DAG.computeKnownBits(Op0);
25043 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25044 return SDValue();
25045 }
25046 Src = Op1;
25047 BitNo = Op0.getOperand(1);
25048 }
25049 } else if (Op1.getOpcode() == ISD::Constant) {
25050 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25051 uint64_t AndRHSVal = AndRHS->getZExtValue();
25052 SDValue AndLHS = Op0;
25053
25054 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25055 Src = AndLHS.getOperand(0);
25056 BitNo = AndLHS.getOperand(1);
25057 } else {
25058 // Use BT if the immediate can't be encoded in a TEST instruction or we
25059       // are optimizing for size and the immediate won't fit in a byte.
25060 bool OptForSize = DAG.shouldOptForSize();
25061 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25062 isPowerOf2_64(AndRHSVal)) {
25063 Src = AndLHS;
25064 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25065 Src.getValueType());
25066 }
25067 }
25068 }
25069
25070 // No patterns found, give up.
25071 if (!Src.getNode())
25072 return SDValue();
25073
25074 // Remove any bit flip.
25075 if (isBitwiseNot(Src)) {
25076 Src = Src.getOperand(0);
25077 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25078 }
25079
25080 // Attempt to create the X86ISD::BT node.
25081 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25082 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25083 return BT;
25084 }
25085
25086 return SDValue();
25087}
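// Illustrative sketch, not part of the LLVM source: both shapes recognised
// above test a single bit, which is exactly what BT computes (the selected
// bit lands in CF, read back via SETB/SETAE). At the C level:
#include <cstdint>

static bool bitIsSetShl(uint64_t X, unsigned N) {
  return (X & (1ull << N)) != 0; // the shl-of-1 form
}
static bool bitIsSetSrl(uint64_t X, unsigned N) {
  return ((X >> N) & 1) != 0;    // the srl-and-1 form; same result
}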
25088
25089// Check if pre-AVX condcode can be performed by a single FCMP op.
25090static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25091 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25092}
25093
25094/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25095/// CMPs.
25096static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25097 SDValue &Op1, bool &IsAlwaysSignaling) {
25098 unsigned SSECC;
25099 bool Swap = false;
25100
25101 // SSE Condition code mapping:
25102 // 0 - EQ
25103 // 1 - LT
25104 // 2 - LE
25105 // 3 - UNORD
25106 // 4 - NEQ
25107 // 5 - NLT
25108 // 6 - NLE
25109 // 7 - ORD
25110 switch (SetCCOpcode) {
25111   default: llvm_unreachable("Unexpected SETCC condition");
25112 case ISD::SETOEQ:
25113 case ISD::SETEQ: SSECC = 0; break;
25114 case ISD::SETOGT:
25115 case ISD::SETGT: Swap = true; [[fallthrough]];
25116 case ISD::SETLT:
25117 case ISD::SETOLT: SSECC = 1; break;
25118 case ISD::SETOGE:
25119 case ISD::SETGE: Swap = true; [[fallthrough]];
25120 case ISD::SETLE:
25121 case ISD::SETOLE: SSECC = 2; break;
25122 case ISD::SETUO: SSECC = 3; break;
25123 case ISD::SETUNE:
25124 case ISD::SETNE: SSECC = 4; break;
25125 case ISD::SETULE: Swap = true; [[fallthrough]];
25126 case ISD::SETUGE: SSECC = 5; break;
25127 case ISD::SETULT: Swap = true; [[fallthrough]];
25128 case ISD::SETUGT: SSECC = 6; break;
25129 case ISD::SETO: SSECC = 7; break;
25130 case ISD::SETUEQ: SSECC = 8; break;
25131 case ISD::SETONE: SSECC = 12; break;
25132 }
25133 if (Swap)
25134 std::swap(Op0, Op1);
25135
25136 switch (SetCCOpcode) {
25137 default:
25138 IsAlwaysSignaling = true;
25139 break;
25140 case ISD::SETEQ:
25141 case ISD::SETOEQ:
25142 case ISD::SETUEQ:
25143 case ISD::SETNE:
25144 case ISD::SETONE:
25145 case ISD::SETUNE:
25146 case ISD::SETO:
25147 case ISD::SETUO:
25148 IsAlwaysSignaling = false;
25149 break;
25150 }
25151
25152 return SSECC;
25153}
25154
25155/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25156/// concatenate the result back.
25157static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25158 ISD::CondCode Cond, SelectionDAG &DAG,
25159 const SDLoc &dl) {
25160   assert(VT.isInteger() && VT == LHS.getValueType() &&
25161          VT == RHS.getValueType() && "Unsupported VTs!");
25162
25163 SDValue CC = DAG.getCondCode(Cond);
25164
25165 // Extract the LHS Lo/Hi vectors
25166 SDValue LHS1, LHS2;
25167 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25168
25169 // Extract the RHS Lo/Hi vectors
25170 SDValue RHS1, RHS2;
25171 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25172
25173 // Issue the operation on the smaller types and concatenate the result back
25174 EVT LoVT, HiVT;
25175 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25176 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25177 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25178 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25179}
25180
25181static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25182
25183 SDValue Op0 = Op.getOperand(0);
25184 SDValue Op1 = Op.getOperand(1);
25185 SDValue CC = Op.getOperand(2);
25186 MVT VT = Op.getSimpleValueType();
25187 SDLoc dl(Op);
25188
25189   assert(VT.getVectorElementType() == MVT::i1 &&
25190          "Cannot set masked compare for this operation");
25191
25192 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25193
25194 // Prefer SETGT over SETLT.
25195 if (SetCCOpcode == ISD::SETLT) {
25196 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25197 std::swap(Op0, Op1);
25198 }
25199
25200 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25201}
25202
25203/// Given a buildvector constant, return a new vector constant with each element
25204/// incremented or decremented. If incrementing or decrementing would result in
25205/// unsigned overflow or underflow or this is not a simple vector constant,
25206/// return an empty value.
25207static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25208 bool NSW) {
25209 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25210 if (!BV || !V.getValueType().isSimple())
25211 return SDValue();
25212
25213 MVT VT = V.getSimpleValueType();
25214 MVT EltVT = VT.getVectorElementType();
25215 unsigned NumElts = VT.getVectorNumElements();
25216 SmallVector<SDValue, 8> NewVecC;
25217 SDLoc DL(V);
25218 for (unsigned i = 0; i < NumElts; ++i) {
25219 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25220 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25221 return SDValue();
25222
25223 // Avoid overflow/underflow.
25224 const APInt &EltC = Elt->getAPIntValue();
25225 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25226 return SDValue();
25227 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25228 (!IsInc && EltC.isMinSignedValue())))
25229 return SDValue();
25230
25231 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25232 }
25233
25234 return DAG.getBuildVector(VT, DL, NewVecC);
25235}
25236
25237/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25238/// Op0 u<= Op1:
25239/// t = psubus Op0, Op1
25240/// pcmpeq t, <0..0>
25241static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25242 ISD::CondCode Cond, const SDLoc &dl,
25243 const X86Subtarget &Subtarget,
25244 SelectionDAG &DAG) {
25245 if (!Subtarget.hasSSE2())
25246 return SDValue();
25247
25248 MVT VET = VT.getVectorElementType();
25249 if (VET != MVT::i8 && VET != MVT::i16)
25250 return SDValue();
25251
25252 switch (Cond) {
25253 default:
25254 return SDValue();
25255 case ISD::SETULT: {
25256 // If the comparison is against a constant we can turn this into a
25257 // setule. With psubus, setule does not require a swap. This is
25258 // beneficial because the constant in the register is no longer
25259     // clobbered as the destination, so it can be hoisted out of a loop.
25260 // Only do this pre-AVX since vpcmp* is no longer destructive.
25261 if (Subtarget.hasAVX())
25262 return SDValue();
25263 SDValue ULEOp1 =
25264 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25265 if (!ULEOp1)
25266 return SDValue();
25267 Op1 = ULEOp1;
25268 break;
25269 }
25270 case ISD::SETUGT: {
25271 // If the comparison is against a constant, we can turn this into a setuge.
25272 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25273 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25274 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25275 SDValue UGEOp1 =
25276 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25277 if (!UGEOp1)
25278 return SDValue();
25279 Op1 = Op0;
25280 Op0 = UGEOp1;
25281 break;
25282 }
25283 // Psubus is better than flip-sign because it requires no inversion.
25284 case ISD::SETUGE:
25285 std::swap(Op0, Op1);
25286 break;
25287 case ISD::SETULE:
25288 break;
25289 }
25290
25291 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25292 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25293 DAG.getConstant(0, dl, VT));
25294}
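// Illustrative sketch, not part of the LLVM source: the rewrite above relies
// on saturating subtraction hitting zero exactly when there is nothing left
// to subtract. Per scalar element:
#include <cstdint>

static bool uleViaUsubsat(uint8_t A, uint8_t B) {
  uint8_t Sat = A > B ? uint8_t(A - B) : uint8_t(0); // usubsat(A, B)
  return Sat == 0;                                   // equals (A u<= B)
}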
25295
25296static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25297 SelectionDAG &DAG) {
25298 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25299 Op.getOpcode() == ISD::STRICT_FSETCCS;
25300 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25301 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25302 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25303 MVT VT = Op->getSimpleValueType(0);
25304 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25305 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25306 SDLoc dl(Op);
25307
25308 if (isFP) {
25309 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25310     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25311 if (isSoftFP16(EltVT, Subtarget))
25312 return SDValue();
25313
25314 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25315 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25316
25317 // If we have a strict compare with a vXi1 result and the input is 128/256
25318 // bits we can't use a masked compare unless we have VLX. If we use a wider
25319 // compare like we do for non-strict, we might trigger spurious exceptions
25320     // from the upper elements. Instead emit an AVX compare and convert to mask.
25321 unsigned Opc;
25322 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25323 (!IsStrict || Subtarget.hasVLX() ||
25324 Op0.getSimpleValueType().is512BitVector())) {
25325#ifndef NDEBUG
25326 unsigned Num = VT.getVectorNumElements();
25327     assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25328#endif
25329 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25330 } else {
25331 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25332 // The SSE/AVX packed FP comparison nodes are defined with a
25333 // floating-point vector result that matches the operand type. This allows
25334 // them to work with an SSE1 target (integer vector types are not legal).
25335 VT = Op0.getSimpleValueType();
25336 }
25337
25338 SDValue Cmp;
25339 bool IsAlwaysSignaling;
25340 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25341 if (!Subtarget.hasAVX()) {
25342 // TODO: We could use following steps to handle a quiet compare with
25343 // signaling encodings.
25344 // 1. Get ordered masks from a quiet ISD::SETO
25345 // 2. Use the masks to mask potential unordered elements in operand A, B
25346 // 3. Get the compare results of masked A, B
25347 // 4. Calculating final result using the mask and result from 3
25348 // But currently, we just fall back to scalar operations.
25349 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25350 return SDValue();
25351
25352 // Insert an extra signaling instruction to raise exception.
25353 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25354 SDValue SignalCmp = DAG.getNode(
25355 Opc, dl, {VT, MVT::Other},
25356 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25357 // FIXME: It seems we need to update the flags of all new strict nodes.
25358 // Otherwise, mayRaiseFPException in MI will return false due to
25359 // NoFPExcept = false by default. However, I didn't find it in other
25360 // patches.
25361 SignalCmp->setFlags(Op->getFlags());
25362 Chain = SignalCmp.getValue(1);
25363 }
25364
25365 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25366 // emit two comparisons and a logic op to tie them together.
25367 if (!cheapX86FSETCC_SSE(Cond)) {
25368 // LLVM predicate is SETUEQ or SETONE.
25369 unsigned CC0, CC1;
25370 unsigned CombineOpc;
25371 if (Cond == ISD::SETUEQ) {
25372 CC0 = 3; // UNORD
25373 CC1 = 0; // EQ
25374 CombineOpc = X86ISD::FOR;
25375 } else {
25376         assert(Cond == ISD::SETONE);
25377 CC0 = 7; // ORD
25378 CC1 = 4; // NEQ
25379 CombineOpc = X86ISD::FAND;
25380 }
25381
25382 SDValue Cmp0, Cmp1;
25383 if (IsStrict) {
25384 Cmp0 = DAG.getNode(
25385 Opc, dl, {VT, MVT::Other},
25386 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25387 Cmp1 = DAG.getNode(
25388 Opc, dl, {VT, MVT::Other},
25389 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25390 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25391 Cmp1.getValue(1));
25392 } else {
25393 Cmp0 = DAG.getNode(
25394 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25395 Cmp1 = DAG.getNode(
25396 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25397 }
25398 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25399 } else {
25400 if (IsStrict) {
25401 Cmp = DAG.getNode(
25402 Opc, dl, {VT, MVT::Other},
25403 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25404 Chain = Cmp.getValue(1);
25405 } else
25406 Cmp = DAG.getNode(
25407 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25408 }
25409 } else {
25410 // Handle all other FP comparisons here.
25411 if (IsStrict) {
25412 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25413 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25414 Cmp = DAG.getNode(
25415 Opc, dl, {VT, MVT::Other},
25416 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25417 Chain = Cmp.getValue(1);
25418 } else
25419 Cmp = DAG.getNode(
25420 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25421 }
25422
25423 if (VT.getFixedSizeInBits() >
25424 Op.getSimpleValueType().getFixedSizeInBits()) {
25425 // We emitted a compare with an XMM/YMM result. Finish converting to a
25426 // mask register using a vptestm.
25427 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25428 Cmp = DAG.getBitcast(CastVT, Cmp);
25429 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25430 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25431 } else {
25432 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25433 // the result type of SETCC. The bitcast is expected to be optimized
25434 // away during combining/isel.
25435 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25436 }
25437
25438 if (IsStrict)
25439 return DAG.getMergeValues({Cmp, Chain}, dl);
25440
25441 return Cmp;
25442 }
25443
25444   assert(!IsStrict && "Strict SETCC only handles FP operands.");
25445
25446 MVT VTOp0 = Op0.getSimpleValueType();
25447 (void)VTOp0;
25448   assert(VTOp0 == Op1.getSimpleValueType() &&
25449          "Expected operands with same type!");
25450   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25451          "Invalid number of packed elements for source and destination!");
25452
25453 // The non-AVX512 code below works under the assumption that source and
25454 // destination types are the same.
25455   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25456          "Value types for source and destination must be the same!");
25457
25458 // The result is boolean, but operands are int/float
25459 if (VT.getVectorElementType() == MVT::i1) {
25460 // In AVX-512 architecture setcc returns mask with i1 elements,
25461 // But there is no compare instruction for i8 and i16 elements in KNL.
25462     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25463            "Unexpected operand type");
25464 return LowerIntVSETCC_AVX512(Op, DAG);
25465 }
25466
25467 // Lower using XOP integer comparisons.
25468 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25469 // Translate compare code to XOP PCOM compare mode.
25470 unsigned CmpMode = 0;
25471 switch (Cond) {
25472     default: llvm_unreachable("Unexpected SETCC condition");
25473 case ISD::SETULT:
25474 case ISD::SETLT: CmpMode = 0x00; break;
25475 case ISD::SETULE:
25476 case ISD::SETLE: CmpMode = 0x01; break;
25477 case ISD::SETUGT:
25478 case ISD::SETGT: CmpMode = 0x02; break;
25479 case ISD::SETUGE:
25480 case ISD::SETGE: CmpMode = 0x03; break;
25481 case ISD::SETEQ: CmpMode = 0x04; break;
25482 case ISD::SETNE: CmpMode = 0x05; break;
25483 }
25484
25485 // Are we comparing unsigned or signed integers?
25486 unsigned Opc =
25487 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25488
25489 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25490 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25491 }
25492
25493 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25494 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25495 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25496 SDValue BC0 = peekThroughBitcasts(Op0);
25497 if (BC0.getOpcode() == ISD::AND) {
25498 APInt UndefElts;
25499 SmallVector<APInt, 64> EltBits;
25500 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25501 VT.getScalarSizeInBits(), UndefElts,
25502 EltBits, false, false)) {
25503 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25504 Cond = ISD::SETEQ;
25505 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25506 }
25507 }
25508 }
25509 }
25510
25511 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25512 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25513 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25514 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25515 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25516 unsigned BitWidth = VT.getScalarSizeInBits();
25517 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25518
25519 SDValue Result = Op0.getOperand(0);
25520 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25521 DAG.getConstant(ShiftAmt, dl, VT));
25522 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25523 DAG.getConstant(BitWidth - 1, dl, VT));
25524 return Result;
25525 }
25526 }
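// Illustrative sketch, not part of the LLVM source: for a vector setcc the
// "true" result is an all-ones lane, so the shift pair above just broadcasts
// the tested bit across the element. Per i32 lane with C = 8 (log2(C) = 3,
// so ShiftAmt = 32 - 3 - 1 = 28):
#include <cstdint>

static int32_t cmpEqAndMask8(int32_t X) {
  // Equivalent to ((X & 8) == 8) ? -1 : 0.
  return (int32_t)((uint32_t)X << 28) >> 31;
}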
25527
25528 // Break 256-bit integer vector compare into smaller ones.
25529 if (VT.is256BitVector() && !Subtarget.hasInt256())
25530 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25531
25532 // Break 512-bit integer vector compare into smaller ones.
25533 // TODO: Try harder to use VPCMPx + VPMOV2x?
25534 if (VT.is512BitVector())
25535 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25536
25537 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25538 // not-of-PCMPEQ:
25539 // X != INT_MIN --> X >s INT_MIN
25540 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25541 // +X != 0 --> +X >s 0
25542 APInt ConstValue;
25543 if (Cond == ISD::SETNE &&
25544 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25545 if (ConstValue.isMinSignedValue())
25546 Cond = ISD::SETGT;
25547 else if (ConstValue.isMaxSignedValue())
25548 Cond = ISD::SETLT;
25549 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25550 Cond = ISD::SETGT;
25551 }
25552
25553 // If both operands are known non-negative, then an unsigned compare is the
25554 // same as a signed compare and there's no need to flip signbits.
25555 // TODO: We could check for more general simplifications here since we're
25556 // computing known bits.
25557 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25558 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25559
25560 // Special case: Use min/max operations for unsigned compares.
25561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25562 if (ISD::isUnsignedIntSetCC(Cond) &&
25563 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25564 TLI.isOperationLegal(ISD::UMIN, VT)) {
25565 // If we have a constant operand, increment/decrement it and change the
25566 // condition to avoid an invert.
25567 if (Cond == ISD::SETUGT) {
25568 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25569 if (SDValue UGTOp1 =
25570 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25571 Op1 = UGTOp1;
25572 Cond = ISD::SETUGE;
25573 }
25574 }
25575 if (Cond == ISD::SETULT) {
25576 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25577 if (SDValue ULTOp1 =
25578 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25579 Op1 = ULTOp1;
25580 Cond = ISD::SETULE;
25581 }
25582 }
25583 bool Invert = false;
25584 unsigned Opc;
25585 switch (Cond) {
25586     default: llvm_unreachable("Unexpected condition code");
25587 case ISD::SETUGT: Invert = true; [[fallthrough]];
25588 case ISD::SETULE: Opc = ISD::UMIN; break;
25589 case ISD::SETULT: Invert = true; [[fallthrough]];
25590 case ISD::SETUGE: Opc = ISD::UMAX; break;
25591 }
25592
25593 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25594 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25595
25596 // If the logical-not of the result is required, perform that now.
25597 if (Invert)
25598 Result = DAG.getNOT(dl, Result, VT);
25599
25600 return Result;
25601 }
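// Illustrative sketch, not part of the LLVM source: the min/max rewrite above
// uses the identities X u<= Y <=> umin(X, Y) == X and X u>= Y <=> umax(X, Y)
// == X, with an extra invert for the strict u< / u> forms. Per lane:
#include <algorithm>
#include <cstdint>

static bool uleViaUmin(uint32_t X, uint32_t Y) {
  return std::min(X, Y) == X; // equals (X u<= Y)
}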
25602
25603 // Try to use SUBUS and PCMPEQ.
25604 if (FlipSigns)
25605 if (SDValue V =
25606 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25607 return V;
25608
25609 // We are handling one of the integer comparisons here. Since SSE only has
25610 // GT and EQ comparisons for integer, swapping operands and multiple
25611 // operations may be required for some comparisons.
25612 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25613 : X86ISD::PCMPGT;
25614 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25615 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25616 bool Invert = Cond == ISD::SETNE ||
25617 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25618
25619 if (Swap)
25620 std::swap(Op0, Op1);
25621
25622 // Check that the operation in question is available (most are plain SSE2,
25623 // but PCMPGTQ and PCMPEQQ have different requirements).
25624 if (VT == MVT::v2i64) {
25625 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25626       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25627
25628 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25629 // the odd elements over the even elements.
25630 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25631 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25632 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25633
25634 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25635 static const int MaskHi[] = { 1, 1, 3, 3 };
25636 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25637
25638 return DAG.getBitcast(VT, Result);
25639 }
25640
25641 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25642 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25643 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25644
25645 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25646 static const int MaskHi[] = { 1, 1, 3, 3 };
25647 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25648
25649 return DAG.getBitcast(VT, Result);
25650 }
25651
25652 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25653 // bits of the inputs before performing those operations. The lower
25654 // compare is always unsigned.
25655 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25656 : 0x0000000080000000ULL,
25657 dl, MVT::v2i64);
25658
25659 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25660 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25661
25662 // Cast everything to the right type.
25663 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25664 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25665
25666 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25667 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25668 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25669
25670 // Create masks for only the low parts/high parts of the 64 bit integers.
25671 static const int MaskHi[] = { 1, 1, 3, 3 };
25672 static const int MaskLo[] = { 0, 0, 2, 2 };
25673 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25674 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25675 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25676
25677 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25678 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25679
25680 if (Invert)
25681 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25682
25683 return DAG.getBitcast(VT, Result);
25684 }
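// Illustrative sketch, not part of the LLVM source: the scalar shape of the
// PCMPGTQ emulation above for a signed 64-bit compare. The high dwords are
// compared signed; the low dwords must be compared unsigned, and XORing them
// with 0x80000000 (the 0x0000000080000000 constant above) turns that unsigned
// compare into the signed one PCMPGT provides:
#include <cstdint>

static bool sgt64ViaDwords(int64_t A, int64_t B) {
  int32_t AHi = (int32_t)(A >> 32), BHi = (int32_t)(B >> 32);
  int32_t ALo = (int32_t)((uint32_t)A ^ 0x80000000u); // flip low-dword sign
  int32_t BLo = (int32_t)((uint32_t)B ^ 0x80000000u);
  return (AHi > BHi) || (AHi == BHi && ALo > BLo);    // == (A > B)
}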
25685
25686 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25687 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25688 // pcmpeqd + pshufd + pand.
25689       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25690
25691 // First cast everything to the right type.
25692 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25693 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25694
25695 // Do the compare.
25696 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25697
25698 // Make sure the lower and upper halves are both all-ones.
25699 static const int Mask[] = { 1, 0, 3, 2 };
25700 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25701 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25702
25703 if (Invert)
25704 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25705
25706 return DAG.getBitcast(VT, Result);
25707 }
25708 }
25709
25710 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25711 // bits of the inputs before performing those operations.
25712 if (FlipSigns) {
25713 MVT EltVT = VT.getVectorElementType();
25714 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25715 VT);
25716 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25717 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25718 }
25719
25720 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25721
25722 // If the logical-not of the result is required, perform that now.
25723 if (Invert)
25724 Result = DAG.getNOT(dl, Result, VT);
25725
25726 return Result;
25727}
25728
25729// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25730static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25731 const SDLoc &dl, SelectionDAG &DAG,
25732 const X86Subtarget &Subtarget,
25733 SDValue &X86CC) {
25734   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25735
25736 // Must be a bitcast from vXi1.
25737 if (Op0.getOpcode() != ISD::BITCAST)
25738 return SDValue();
25739
25740 Op0 = Op0.getOperand(0);
25741 MVT VT = Op0.getSimpleValueType();
25742 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25743 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25744 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25745 return SDValue();
25746
25747 X86::CondCode X86Cond;
25748 if (isNullConstant(Op1)) {
25749 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25750 } else if (isAllOnesConstant(Op1)) {
25751 // C flag is set for all ones.
25752 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25753 } else
25754 return SDValue();
25755
25756   // If the input is an AND, we can combine its operands into the KTEST.
25757 bool KTestable = false;
25758 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25759 KTestable = true;
25760 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25761 KTestable = true;
25762 if (!isNullConstant(Op1))
25763 KTestable = false;
25764 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25765 SDValue LHS = Op0.getOperand(0);
25766 SDValue RHS = Op0.getOperand(1);
25767 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25768 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25769 }
25770
25771   // If the input is an OR, we can combine its operands into the KORTEST.
25772 SDValue LHS = Op0;
25773 SDValue RHS = Op0;
25774 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25775 LHS = Op0.getOperand(0);
25776 RHS = Op0.getOperand(1);
25777 }
25778
25779 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25780 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25781}
25782
25783/// Emit flags for the given setcc condition and operands. Also returns the
25784/// corresponding X86 condition code constant in X86CC.
25785SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25786 ISD::CondCode CC, const SDLoc &dl,
25787 SelectionDAG &DAG,
25788 SDValue &X86CC) const {
25789 // Equality Combines.
25790 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25791 X86::CondCode X86CondCode;
25792
25793 // Optimize to BT if possible.
25794 // Lower (X & (1 << N)) == 0 to BT(X, N).
25795 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25796 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25797 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25798 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25799 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25800 return BT;
25801 }
25802 }
25803
25804 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
25805 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25806 X86CondCode)) {
25807 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25808 return CmpZ;
25809 }
25810
25811 // Try to lower using KORTEST or KTEST.
25812 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25813 return Test;
25814
25815 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25816 // of these.
25817 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25818 // If the input is a setcc, then reuse the input setcc or use a new one
25819 // with the inverted condition.
25820 if (Op0.getOpcode() == X86ISD::SETCC) {
25821 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25822
25823 X86CC = Op0.getOperand(0);
25824 if (Invert) {
25825 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25826 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25827 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25828 }
25829
25830 return Op0.getOperand(1);
25831 }
25832 }
25833
25834     // Try to use the carry flag from the add in place of a separate CMP for:
25835 // (seteq (add X, -1), -1). Similar for setne.
25836 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25837 Op0.getOperand(1) == Op1) {
25838 if (isProfitableToUseFlagOp(Op0)) {
25839 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25840
25841 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25842 Op0.getOperand(1));
25843 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25844 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25845 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25846 return SDValue(New.getNode(), 1);
25847 }
25848 }
25849 }
25850
25851 X86::CondCode CondCode =
25852 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25853 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25854
25855 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25856 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25857 return EFLAGS;
25858}
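// A hedged sketch of the BT equality combine above (assuming the usual
// LowerAndToBT behaviour): for a scalar test such as
//   setcc (and %x, (1 << 7)), 0, seteq
// the lowering would typically produce
//   %flags = X86ISD::BT %x, 7        // CF = bit 7 of %x
// with X86CC = COND_AE (CF clear) for seteq, or COND_B for setne, so no
// separate AND + CMP pair is materialized.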
25859
25860SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25861
25862 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25863 Op.getOpcode() == ISD::STRICT_FSETCCS;
25864 MVT VT = Op->getSimpleValueType(0);
25865
25866 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25867
25868 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25869 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25870 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25871 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25872 SDLoc dl(Op);
25873 ISD::CondCode CC =
25874 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25875
25876 if (isSoftFP16(Op0.getValueType()))
25877 return SDValue();
25878
25879 // Handle f128 first, since one possible outcome is a normal integer
25880 // comparison which gets handled by emitFlagsForSetcc.
25881 if (Op0.getValueType() == MVT::f128) {
25882 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25883 Op.getOpcode() == ISD::STRICT_FSETCCS);
25884
25885 // If softenSetCCOperands returned a scalar, use it.
25886 if (!Op1.getNode()) {
25887 assert(Op0.getValueType() == Op.getValueType() &&
25888        "Unexpected setcc expansion!");
25889 if (IsStrict)
25890 return DAG.getMergeValues({Op0, Chain}, dl);
25891 return Op0;
25892 }
25893 }
25894
25895 if (Op0.getSimpleValueType().isInteger()) {
25896 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
25897 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
25898 // this may translate to fewer uops depending on the uarch implementation. The
25899 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25900 // canonicalize to that CondCode.
25901 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25902 // encoding size - so it must either already be an i8 or i32 immediate, or it
25903 // shrinks down to that. We don't do this for any i64's to avoid additional
25904 // constant materializations.
25905 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
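// Worked example for the canonicalization above (an illustrative sketch):
//   setugt %x, 15  ->  setuge %x, 16   (COND_A reads CF and ZF, COND_AE only CF)
//   setgt  %x, 15  ->  setge  %x, 16   (COND_G reads ZF/SF/OF, COND_GE only SF/OF)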
25906 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25907 const APInt &Op1Val = Op1C->getAPIntValue();
25908 if (!Op1Val.isZero()) {
25909 // Ensure the constant+1 doesn't overflow.
25910 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25911 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25912 APInt Op1ValPlusOne = Op1Val + 1;
25913 if (Op1ValPlusOne.isSignedIntN(32) &&
25914 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25915 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25916 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25917 : ISD::CondCode::SETUGE;
25918 }
25919 }
25920 }
25921 }
25922
25923 SDValue X86CC;
25924 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25925 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25926 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25927 }
25928
25929 // Handle floating point.
25930 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25931 if (CondCode == X86::COND_INVALID)
25932 return SDValue();
25933
25934 SDValue EFLAGS;
25935 if (IsStrict) {
25936 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25937 EFLAGS =
25938 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25939 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25940 Chain = EFLAGS.getValue(1);
25941 } else {
25942 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25943 }
25944
25945 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25946 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25947 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25948}
25949
25950SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25951 SDValue LHS = Op.getOperand(0);
25952 SDValue RHS = Op.getOperand(1);
25953 SDValue Carry = Op.getOperand(2);
25954 SDValue Cond = Op.getOperand(3);
25955 SDLoc DL(Op);
25956
25957 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25958 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25959
25960 // Recreate the carry if needed.
25961 EVT CarryVT = Carry.getValueType();
25962 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25963 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25964
25965 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25966 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25967 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25968}
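// Worked example for the carry re-creation above (a sketch, not taken from the
// source): the incoming Carry operand is a 0/1 value, so ADD(Carry, -1) sets
// CF exactly when Carry == 1:
//   Carry = 1:  1 + 0xFF..FF wraps to 0        -> CF = 1
//   Carry = 0:  0 + 0xFF..FF = 0xFF..FF        -> CF = 0
// The following SBB then computes LHS - RHS - CF and the translated condition
// code is evaluated on its flags result.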
25969
25970// This function returns three things: the arithmetic computation itself
25971// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
25972// flag and the condition code define the case in which the arithmetic
25973// computation overflows.
25974static std::pair<SDValue, SDValue>
25975getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
25976 assert(Op.getResNo() == 0 && "Unexpected result number!");
25977 SDValue Value, Overflow;
25978 SDValue LHS = Op.getOperand(0);
25979 SDValue RHS = Op.getOperand(1);
25980 unsigned BaseOp = 0;
25981 SDLoc DL(Op);
25982 switch (Op.getOpcode()) {
25983 default: llvm_unreachable("Unknown ovf instruction!");
25984 case ISD::SADDO:
25985 BaseOp = X86ISD::ADD;
25986 Cond = X86::COND_O;
25987 break;
25988 case ISD::UADDO:
25989 BaseOp = X86ISD::ADD;
25990 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
25991 break;
25992 case ISD::SSUBO:
25993 BaseOp = X86ISD::SUB;
25994 Cond = X86::COND_O;
25995 break;
25996 case ISD::USUBO:
25997 BaseOp = X86ISD::SUB;
25998 Cond = X86::COND_B;
25999 break;
26000 case ISD::SMULO:
26001 BaseOp = X86ISD::SMUL;
26002 Cond = X86::COND_O;
26003 break;
26004 case ISD::UMULO:
26005 BaseOp = X86ISD::UMUL;
26006 Cond = X86::COND_O;
26007 break;
26008 }
26009
26010 if (BaseOp) {
26011 // Also sets EFLAGS.
26012 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26013 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26014 Overflow = Value.getValue(1);
26015 }
26016
26017 return std::make_pair(Value, Overflow);
26018}
26019
26020static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26021 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
26022 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26023 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26024 // has only one use.
26025 SDLoc DL(Op);
26026 X86::CondCode Cond;
26027 SDValue Value, Overflow;
26028 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26029
26030 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26031 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26032 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26033}
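// Sketch of the resulting DAG for a typical case (illustrative only):
//   {%v, %ovf} = uaddo %a, %b
// becomes roughly
//   %t   = X86ISD::ADD %a, %b            // result 0 = value, result 1 = EFLAGS
//   %ovf = X86ISD::SETCC COND_B, %t:1    // CF set => unsigned overflow
//   merge_values %t:0, %ovf
// (for uaddo with a constant 1, getX86XALUOOp uses COND_E instead, since x + 1
// wraps exactly when the result is zero).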
26034
26035/// Return true if opcode is a X86 logical comparison.
26036static bool isX86LogicalCmp(SDValue Op) {
26037 unsigned Opc = Op.getOpcode();
26038 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26039 Opc == X86ISD::FCMP)
26040 return true;
26041 if (Op.getResNo() == 1 &&
26042 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26043 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26044 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26045 return true;
26046
26047 return false;
26048}
26049
26050static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26051 if (V.getOpcode() != ISD::TRUNCATE)
26052 return false;
26053
26054 SDValue VOp0 = V.getOperand(0);
26055 unsigned InBits = VOp0.getValueSizeInBits();
26056 unsigned Bits = V.getValueSizeInBits();
26057 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26058}
26059
26060SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26061 bool AddTest = true;
26062 SDValue Cond = Op.getOperand(0);
26063 SDValue Op1 = Op.getOperand(1);
26064 SDValue Op2 = Op.getOperand(2);
26065 SDLoc DL(Op);
26066 MVT VT = Op1.getSimpleValueType();
26067 SDValue CC;
26068
26069 if (isSoftFP16(VT)) {
26070 MVT NVT = VT.changeTypeToInteger();
26071 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26072 DAG.getBitcast(NVT, Op1),
26073 DAG.getBitcast(NVT, Op2)));
26074 }
26075
26076 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26077 // are available or VBLENDV if AVX is available.
26078 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26079 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26080 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26081 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26082 bool IsAlwaysSignaling;
26083 unsigned SSECC =
26084 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26085 CondOp0, CondOp1, IsAlwaysSignaling);
26086
26087 if (Subtarget.hasAVX512()) {
26088 SDValue Cmp =
26089 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26090 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26091 assert(!VT.isVector() && "Not a scalar type?");
26092 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26093 }
26094
26095 if (SSECC < 8 || Subtarget.hasAVX()) {
26096 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26097 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26098
26099 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26100 // of 3 logic instructions for size savings and potentially speed.
26101 // Unfortunately, there is no scalar form of VBLENDV.
26102
26103 // If either operand is a +0.0 constant, don't try this. We can expect to
26104 // optimize away at least one of the logic instructions later in that
26105 // case, so that sequence would be faster than a variable blend.
26106
26107 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26108 // uses XMM0 as the selection register. That may need just as many
26109 // instructions as the AND/ANDN/OR sequence due to register moves, so
26110 // don't bother.
26111 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26112 !isNullFPConstant(Op2)) {
26113 // Convert to vectors, do a VSELECT, and convert back to scalar.
26114 // All of the conversions should be optimized away.
26115 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26116 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26117 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26118 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26119
26120 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26121 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26122
26123 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26124
26125 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26126 VSel, DAG.getIntPtrConstant(0, DL));
26127 }
26128 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26129 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26130 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26131 }
26132 }
26133
26134 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26135 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26136 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26137 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26138 }
26139
26140 if (Cond.getOpcode() == ISD::SETCC &&
26141 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26142 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26143 Cond = NewCond;
26144 // If the condition was updated, it's possible that the operands of the
26145 // select were also updated (for example, EmitTest has a RAUW). Refresh
26146 // the local references to the select operands in case they got stale.
26147 Op1 = Op.getOperand(1);
26148 Op2 = Op.getOperand(2);
26149 }
26150 }
26151
26152 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26153 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26154 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26155 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26156 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26157 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26158 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26159 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26160 if (Cond.getOpcode() == X86ISD::SETCC &&
26161 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26162 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26163 SDValue Cmp = Cond.getOperand(1);
26164 SDValue CmpOp0 = Cmp.getOperand(0);
26165 unsigned CondCode = Cond.getConstantOperandVal(0);
26166
26167 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26168 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
26169 // handling to keep the CMP with 0. This should be removed by
26170 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26171 // cttz_zero_undef.
26172 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26173 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26174 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26175 };
26176 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26177 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26178 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26179 // Keep Cmp.
26180 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26181 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26182 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26183 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26184
26185 // 'X - 1' sets the carry flag if X == 0.
26186 // '0 - X' sets the carry flag if X != 0.
26187 // Convert the carry flag to a -1/0 mask with sbb:
26188 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26189 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26190 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26191 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
26192 SDValue Sub;
26193 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26194 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26195 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26196 } else {
26197 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26198 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26199 }
26200 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26201 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26202 Sub.getValue(1));
26203 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26204 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26205 CmpOp0.getOpcode() == ISD::AND &&
26206 isOneConstant(CmpOp0.getOperand(1))) {
26207 SDValue Src1, Src2;
26208 // True if Op2 is an XOR or OR operator and one of its operands
26209 // is equal to Op1
26210 // ( a , a op b) || ( b , a op b)
26211 auto isOrXorPattern = [&]() {
26212 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26213 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26214 Src1 =
26215 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26216 Src2 = Op1;
26217 return true;
26218 }
26219 return false;
26220 };
26221
26222 if (isOrXorPattern()) {
26223 SDValue Neg;
26224 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26225 // We need a mask of all zeros or all ones with the same size as the other
26226 // operands.
26227 if (CmpSz > VT.getSizeInBits())
26228 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26229 else if (CmpSz < VT.getSizeInBits())
26230 Neg = DAG.getNode(ISD::AND, DL, VT,
26231 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26232 DAG.getConstant(1, DL, VT));
26233 else
26234 Neg = CmpOp0;
26235 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26236 Neg); // -(and (x, 0x1))
26237 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26238 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26239 }
26240 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26241 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26242 ((CondCode == X86::COND_S) || // smin(x, 0)
26243 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26244 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26245 //
26246 // If the comparison is testing for a positive value, we have to invert
26247 // the sign bit mask, so only do that transform if the target has a
26248 // bitwise 'and not' instruction (the invert is free).
26249 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26250 unsigned ShCt = VT.getSizeInBits() - 1;
26251 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26252 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26253 if (CondCode == X86::COND_G)
26254 Shift = DAG.getNOT(DL, Shift, VT);
26255 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26256 }
26257 }
26258
26259 // Look past (and (setcc_carry (cmp ...)), 1).
26260 if (Cond.getOpcode() == ISD::AND &&
26261 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26262 isOneConstant(Cond.getOperand(1)))
26263 Cond = Cond.getOperand(0);
26264
26265 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26266 // setting operand in place of the X86ISD::SETCC.
26267 unsigned CondOpcode = Cond.getOpcode();
26268 if (CondOpcode == X86ISD::SETCC ||
26269 CondOpcode == X86ISD::SETCC_CARRY) {
26270 CC = Cond.getOperand(0);
26271
26272 SDValue Cmp = Cond.getOperand(1);
26273 bool IllegalFPCMov = false;
26274 if (VT.isFloatingPoint() && !VT.isVector() &&
26275 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26276 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26277
26278 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26279 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26280 Cond = Cmp;
26281 AddTest = false;
26282 }
26283 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26284 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26285 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26286 SDValue Value;
26287 X86::CondCode X86Cond;
26288 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26289
26290 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26291 AddTest = false;
26292 }
26293
26294 if (AddTest) {
26295 // Look past the truncate if the high bits are known zero.
26296 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26297 Cond = Cond.getOperand(0);
26298
26299 // We know the result of AND is compared against zero. Try to match
26300 // it to BT.
26301 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26302 X86::CondCode X86CondCode;
26303 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26304 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26305 Cond = BT;
26306 AddTest = false;
26307 }
26308 }
26309 }
26310
26311 if (AddTest) {
26312 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26313 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26314 }
26315
26316 // a < b ? -1 : 0 -> RES = ~setcc_carry
26317 // a < b ? 0 : -1 -> RES = setcc_carry
26318 // a >= b ? -1 : 0 -> RES = setcc_carry
26319 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26320 if (Cond.getOpcode() == X86ISD::SUB) {
26321 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26322
26323 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26324 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26325 (isNullConstant(Op1) || isNullConstant(Op2))) {
26326 SDValue Res =
26327 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26328 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26329 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26330 return DAG.getNOT(DL, Res, Res.getValueType());
26331 return Res;
26332 }
26333 }
26334
26335 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
26336 // widen the cmov and push the truncate through. This avoids introducing a new
26337 // branch during isel and doesn't add any extensions.
26338 if (Op.getValueType() == MVT::i8 &&
26339 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26340 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26341 if (T1.getValueType() == T2.getValueType() &&
26342 // Exclude CopyFromReg to avoid partial register stalls.
26343 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26344 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26345 CC, Cond);
26346 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26347 }
26348 }
26349
26350 // Or finally, promote i8 cmovs if we have CMOV,
26351 // or i16 cmovs if it won't prevent folding a load.
26352 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
26353 // legal, but EmitLoweredSelect() cannot deal with these extensions
26354 // being inserted between two CMOVs. (in the i16 case too TBN)
26355 // https://bugs.llvm.org/show_bug.cgi?id=40974
26356 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26357 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26358 !X86::mayFoldLoad(Op2, Subtarget))) {
26359 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26360 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26361 SDValue Ops[] = { Op2, Op1, CC, Cond };
26362 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26363 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26364 }
26365
26366 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26367 // the condition is true.
26368 SDValue Ops[] = { Op2, Op1, CC, Cond };
26369 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26370}
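// Pseudo-asm sketch of the sbb-based select transform handled above (hedged;
// actual instruction selection and register choices may differ):
//   select (x == 0), -1, y
//   ->  t = x - 1           // CF = 1 iff x == 0
//       m = sbb(m, m)       // m = CF ? -1 : 0   (SETCC_CARRY)
//       r = m | y           // r = -1 when x == 0, otherwise y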
26371
26372static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26373 const X86Subtarget &Subtarget,
26374 SelectionDAG &DAG) {
26375 MVT VT = Op->getSimpleValueType(0);
26376 SDValue In = Op->getOperand(0);
26377 MVT InVT = In.getSimpleValueType();
26378 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26379 MVT VTElt = VT.getVectorElementType();
26380 SDLoc dl(Op);
26381
26382 unsigned NumElts = VT.getVectorNumElements();
26383
26384 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26385 MVT ExtVT = VT;
26386 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26387 // If v16i32 is to be avoided, we'll need to split and concatenate.
26388 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26389 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26390
26391 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26392 }
26393
26394 // Widen to 512-bits if VLX is not supported.
26395 MVT WideVT = ExtVT;
26396 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26397 NumElts *= 512 / ExtVT.getSizeInBits();
26398 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26399 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26400 In, DAG.getIntPtrConstant(0, dl));
26401 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26402 }
26403
26404 SDValue V;
26405 MVT WideEltVT = WideVT.getVectorElementType();
26406 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26407 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26408 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26409 } else {
26410 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26411 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26412 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26413 }
26414
26415 // Truncate if we had to extend i16/i8 above.
26416 if (VT != ExtVT) {
26417 WideVT = MVT::getVectorVT(VTElt, NumElts);
26418 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26419 }
26420
26421 // Extract back to 128/256-bit if we widened.
26422 if (WideVT != VT)
26423 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26424 DAG.getIntPtrConstant(0, dl));
26425
26426 return V;
26427}
26428
26429static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26430 SelectionDAG &DAG) {
26431 SDValue In = Op->getOperand(0);
26432 MVT InVT = In.getSimpleValueType();
26433
26434 if (InVT.getVectorElementType() == MVT::i1)
26435 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26436
26437 assert(Subtarget.hasAVX() && "Expected AVX support");
26438 return LowerAVXExtend(Op, DAG, Subtarget);
26439}
26440
26441// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26442// For sign extend this needs to handle all vector sizes and SSE4.1 and
26443// non-SSE4.1 targets. For zero extend this should only handle inputs of
26444// MVT::v64i8 when BWI is not supported, but AVX512 is.
26445static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26446 const X86Subtarget &Subtarget,
26447 SelectionDAG &DAG) {
26448 SDValue In = Op->getOperand(0);
26449 MVT VT = Op->getSimpleValueType(0);
26450 MVT InVT = In.getSimpleValueType();
26451
26452 MVT SVT = VT.getVectorElementType();
26453 MVT InSVT = InVT.getVectorElementType();
26454 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26455
26456 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26457 return SDValue();
26458 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26459 return SDValue();
26460 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26461 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26462 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26463 return SDValue();
26464
26465 SDLoc dl(Op);
26466 unsigned Opc = Op.getOpcode();
26467 unsigned NumElts = VT.getVectorNumElements();
26468
26469 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26470 // For 512-bit vectors, we need 128-bits or 256-bits.
26471 if (InVT.getSizeInBits() > 128) {
26472 // Input needs to be at least the same number of elements as output, and
26473 // at least 128-bits.
26474 int InSize = InSVT.getSizeInBits() * NumElts;
26475 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26476 InVT = In.getSimpleValueType();
26477 }
26478
26479 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26480 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26481 // need to be handled here for 256/512-bit results.
26482 if (Subtarget.hasInt256()) {
26483 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26484
26485 if (InVT.getVectorNumElements() != NumElts)
26486 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26487
26488 // FIXME: Apparently we create inreg operations that could be regular
26489 // extends.
26490 unsigned ExtOpc =
26491 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26492 : ISD::ZERO_EXTEND;
26493 return DAG.getNode(ExtOpc, dl, VT, In);
26494 }
26495
26496 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26497 if (Subtarget.hasAVX()) {
26498 assert(VT.is256BitVector() && "256-bit vector expected");
26499 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26500 int HalfNumElts = HalfVT.getVectorNumElements();
26501
26502 unsigned NumSrcElts = InVT.getVectorNumElements();
26503 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26504 for (int i = 0; i != HalfNumElts; ++i)
26505 HiMask[i] = HalfNumElts + i;
26506
26507 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26508 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26509 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26510 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26511 }
26512
26513 // We should only get here for sign extend.
26514 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26515 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26516
26517 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26518 SDValue Curr = In;
26519 SDValue SignExt = Curr;
26520
26521 // As SRAI is only available on i16/i32 types, we expand only up to i32
26522 // and handle i64 separately.
26523 if (InVT != MVT::v4i32) {
26524 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26525
26526 unsigned DestWidth = DestVT.getScalarSizeInBits();
26527 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26528
26529 unsigned InNumElts = InVT.getVectorNumElements();
26530 unsigned DestElts = DestVT.getVectorNumElements();
26531
26532 // Build a shuffle mask that takes each input element and places it in the
26533 // MSBs of the new element size.
26534 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26535 for (unsigned i = 0; i != DestElts; ++i)
26536 Mask[i * Scale + (Scale - 1)] = i;
26537
26538 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26539 Curr = DAG.getBitcast(DestVT, Curr);
26540
26541 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26542 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26543 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26544 }
26545
26546 if (VT == MVT::v2i64) {
26547 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26548 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26549 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26550 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26551 SignExt = DAG.getBitcast(VT, SignExt);
26552 }
26553
26554 return SignExt;
26555}
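// Concrete pre-SSE4.1 example of the path above (an illustrative sketch): for
// sign_extend_vector_inreg v16i8 -> v4i32, the shuffle places source byte i in
// the high byte of dword i (mask element i*4+3 = i), the result is bitcast to
// v4i32, and a VSRAI by 24 then replicates each sign bit across the upper 24
// bits of its element.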
26556
26557static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26558 SelectionDAG &DAG) {
26559 MVT VT = Op->getSimpleValueType(0);
26560 SDValue In = Op->getOperand(0);
26561 MVT InVT = In.getSimpleValueType();
26562 SDLoc dl(Op);
26563
26564 if (InVT.getVectorElementType() == MVT::i1)
26565 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26566
26567 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26568 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26569        "Expected same number of elements");
26570 assert((VT.getVectorElementType() == MVT::i16 ||
26571         VT.getVectorElementType() == MVT::i32 ||
26572         VT.getVectorElementType() == MVT::i64) &&
26573        "Unexpected element type");
26574 assert((InVT.getVectorElementType() == MVT::i8 ||
26575         InVT.getVectorElementType() == MVT::i16 ||
26576         InVT.getVectorElementType() == MVT::i32) &&
26577        "Unexpected element type");
26578
26579 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26580 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26581 return splitVectorIntUnary(Op, DAG);
26582 }
26583
26584 if (Subtarget.hasInt256())
26585 return Op;
26586
26587 // Optimize vectors in AVX mode:
26588 // sign extend v8i16 to v8i32 and
26589 // v4i32 to v4i64.
26590 //
26591 // Divide the input vector into two parts;
26592 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26593 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26594 // then concat the vectors back to the original VT.
26595 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26596 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26597
26598 unsigned NumElems = InVT.getVectorNumElements();
26599 SmallVector<int,8> ShufMask(NumElems, -1);
26600 for (unsigned i = 0; i != NumElems/2; ++i)
26601 ShufMask[i] = i + NumElems/2;
26602
26603 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26604 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26605
26606 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26607}
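// AVX1 example of the split above (sketch): for sign_extend v8i16 -> v8i32,
// OpLo sign-extends elements 0..3 in place, the shuffle mask
// {4,5,6,7,-1,-1,-1,-1} moves the high half down, OpHi sign-extends it, and
// the two v4i32 halves are concatenated back into a v8i32.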
26608
26609/// Change a vector store into a pair of half-size vector stores.
26610static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26611 SDValue StoredVal = Store->getValue();
26612 assert((StoredVal.getValueType().is256BitVector() ||
26613         StoredVal.getValueType().is512BitVector()) &&
26614        "Expecting 256/512-bit op");
26615
26616 // Splitting volatile memory ops is not allowed unless the operation was not
26617 // legal to begin with. Assume the input store is legal (this transform is
26618 // only used for targets with AVX). Note: It is possible that we have an
26619 // illegal type like v2i128, and so we could allow splitting a volatile store
26620 // in that case if that is important.
26621 if (!Store->isSimple())
26622 return SDValue();
26623
26624 SDLoc DL(Store);
26625 SDValue Value0, Value1;
26626 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26627 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26628 SDValue Ptr0 = Store->getBasePtr();
26629 SDValue Ptr1 =
26630 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26631 SDValue Ch0 =
26632 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26633 Store->getOriginalAlign(),
26634 Store->getMemOperand()->getFlags());
26635 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26636 Store->getPointerInfo().getWithOffset(HalfOffset),
26637 Store->getOriginalAlign(),
26638 Store->getMemOperand()->getFlags());
26639 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26640}
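// For instance (hedged sketch): a 32-byte store of a 256-bit value becomes two
// 16-byte stores, Value0 at the original pointer and Value1 at pointer + 16,
// joined by a TokenFactor so both store chains are preserved.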
26641
26642/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26643/// type.
26644static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26645 SelectionDAG &DAG) {
26646 SDValue StoredVal = Store->getValue();
26647 assert(StoreVT.is128BitVector() &&
26648        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26649 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26650
26651 // Splitting volatile memory ops is not allowed unless the operation was not
26652 // legal to begin with. We are assuming the input op is legal (this transform
26653 // is only used for targets with AVX).
26654 if (!Store->isSimple())
26655 return SDValue();
26656
26657 MVT StoreSVT = StoreVT.getScalarType();
26658 unsigned NumElems = StoreVT.getVectorNumElements();
26659 unsigned ScalarSize = StoreSVT.getStoreSize();
26660
26661 SDLoc DL(Store);
26662 SmallVector<SDValue, 4> Stores;
26663 for (unsigned i = 0; i != NumElems; ++i) {
26664 unsigned Offset = i * ScalarSize;
26665 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26666 TypeSize::Fixed(Offset), DL);
26667 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26668 DAG.getIntPtrConstant(i, DL));
26669 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26670 Store->getPointerInfo().getWithOffset(Offset),
26671 Store->getOriginalAlign(),
26672 Store->getMemOperand()->getFlags());
26673 Stores.push_back(Ch);
26674 }
26675 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26676}
26677
26678static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26679 SelectionDAG &DAG) {
26680 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26681 SDLoc dl(St);
26682 SDValue StoredVal = St->getValue();
26683
26684 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26685 if (StoredVal.getValueType().isVector() &&
26686 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26687 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26688 assert(NumElts <= 8 && "Unexpected VT");
26689 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26690 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26691        "Expected AVX512F without AVX512DQI");
26692
26693 // We must pad with zeros to ensure we store zeroes to any unused bits.
26694 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26695 DAG.getUNDEF(MVT::v16i1), StoredVal,
26696 DAG.getIntPtrConstant(0, dl));
26697 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26698 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26699 // Make sure we store zeros in the extra bits.
26700 if (NumElts < 8)
26701 StoredVal = DAG.getZeroExtendInReg(
26702 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26703
26704 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26705 St->getPointerInfo(), St->getOriginalAlign(),
26706 St->getMemOperand()->getFlags());
26707 }
26708
26709 if (St->isTruncatingStore())
26710 return SDValue();
26711
26712 // If this is a 256-bit store of concatenated ops, we are better off splitting
26713 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26714 // and each half can execute independently. Some cores would split the op into
26715 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26716 MVT StoreVT = StoredVal.getSimpleValueType();
26717 if (StoreVT.is256BitVector() ||
26718 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26719 !Subtarget.hasBWI())) {
26720 SmallVector<SDValue, 4> CatOps;
26721 if (StoredVal.hasOneUse() &&
26722 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26723 return splitVectorStore(St, DAG);
26724 return SDValue();
26725 }
26726
26727 if (StoreVT.is32BitVector())
26728 return SDValue();
26729
26730 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26731 assert(StoreVT.is64BitVector() && "Unexpected VT");
26732 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26733        TargetLowering::TypeWidenVector &&
26734        "Unexpected type action!");
26735
26736 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26737 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26738 DAG.getUNDEF(StoreVT));
26739
26740 if (Subtarget.hasSSE2()) {
26741 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26742 // and store it.
26743 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26744 MVT CastVT = MVT::getVectorVT(StVT, 2);
26745 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26746 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26747 DAG.getIntPtrConstant(0, dl));
26748
26749 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26750 St->getPointerInfo(), St->getOriginalAlign(),
26751 St->getMemOperand()->getFlags());
26752 }
26753 assert(Subtarget.hasSSE1() && "Expected SSE");
26754 SDVTList Tys = DAG.getVTList(MVT::Other);
26755 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26756 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26757 St->getMemOperand());
26758}
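// Example of the v*i1 path above (illustrative): a v4i1 store is widened by
// insert_subvector into a v16i1, bitcast to i16, truncated to i8, and then
// zero-extended in-register from i4 so bits 4..7 of the stored byte are
// guaranteed to be zero; the final operation is a plain i8 store.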
26759
26760// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26761// may emit an illegal shuffle but the expansion is still better than scalar
26762 // code. We generate sext/sext_invec for SEXTLOADs if available, otherwise
26763 // we'll emit a shuffle and an arithmetic shift.
26764// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26765// TODO: It is possible to support ZExt by zeroing the undef values during
26766// the shuffle phase or after the shuffle.
26767static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26768 SelectionDAG &DAG) {
26769 MVT RegVT = Op.getSimpleValueType();
26770 assert(RegVT.isVector() && "We only custom lower vector loads.");
26771 assert(RegVT.isInteger() &&
26772        "We only custom lower integer vector loads.");
26773
26774 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26775 SDLoc dl(Ld);
26776
26777 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26778 if (RegVT.getVectorElementType() == MVT::i1) {
26779 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26780 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26781 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26782        "Expected AVX512F without AVX512DQI");
26783
26784 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26785 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26786 Ld->getMemOperand()->getFlags());
26787
26788 // Replace chain users with the new chain.
26789 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26790
26791 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26792 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26793 DAG.getBitcast(MVT::v16i1, Val),
26794 DAG.getIntPtrConstant(0, dl));
26795 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26796 }
26797
26798 return SDValue();
26799}
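// Example of the i1-vector path above (a sketch): a v8i1 load becomes an i8
// scalar load, an any_extend to i16, a bitcast to v16i1, and an
// extract_subvector of the low 8 lanes; the new load's chain is returned
// alongside the value via getMergeValues.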
26800
26801/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26802/// each of which has no other use apart from the AND / OR.
26803static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26804 Opc = Op.getOpcode();
26805 if (Opc != ISD::OR && Opc != ISD::AND)
26806 return false;
26807 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26808 Op.getOperand(0).hasOneUse() &&
26809 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26810 Op.getOperand(1).hasOneUse());
26811}
26812
26813SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26814 SDValue Chain = Op.getOperand(0);
26815 SDValue Cond = Op.getOperand(1);
26816 SDValue Dest = Op.getOperand(2);
26817 SDLoc dl(Op);
26818
26819 // Bail out when we don't have native compare instructions.
26820 if (Cond.getOpcode() == ISD::SETCC &&
26821 Cond.getOperand(0).getValueType() != MVT::f128 &&
26822 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26823 SDValue LHS = Cond.getOperand(0);
26824 SDValue RHS = Cond.getOperand(1);
26825 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26826
26827 // Special case for
26828 // setcc([su]{add,sub,mul}o == 0)
26829 // setcc([su]{add,sub,mul}o != 1)
26830 if (ISD::isOverflowIntrOpRes(LHS) &&
26831 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26832 (isNullConstant(RHS) || isOneConstant(RHS))) {
26833 SDValue Value, Overflow;
26834 X86::CondCode X86Cond;
26835 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26836
26837 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26838 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26839
26840 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26841 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26842 Overflow);
26843 }
26844
26845 if (LHS.getSimpleValueType().isInteger()) {
26846 SDValue CCVal;
26847 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26848 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26849 EFLAGS);
26850 }
26851
26852 if (CC == ISD::SETOEQ) {
26853 // For FCMP_OEQ, we can emit
26854 // two branches instead of an explicit AND instruction with a
26855 // separate test. However, we only do this if this block doesn't
26856 // have a fall-through edge, because this requires an explicit
26857 // jmp when the condition is false.
26858 if (Op.getNode()->hasOneUse()) {
26859 SDNode *User = *Op.getNode()->use_begin();
26860 // Look for an unconditional branch following this conditional branch.
26861 // We need this because we need to reverse the successors in order
26862 // to implement FCMP_OEQ.
26863 if (User->getOpcode() == ISD::BR) {
26864 SDValue FalseBB = User->getOperand(1);
26865 SDNode *NewBR =
26866 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26867           assert(NewBR == User);
26868 (void)NewBR;
26869 Dest = FalseBB;
26870
26871 SDValue Cmp =
26872 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26873 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26874 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26875 CCVal, Cmp);
26876 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26877 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26878 Cmp);
26879 }
26880 }
26881 } else if (CC == ISD::SETUNE) {
26882 // For FCMP_UNE, we can emit
26883 // two branches instead of an explicit OR instruction with a
26884 // separate test.
26885 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26886 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26887 Chain =
26888 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26889 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26890 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26891 Cmp);
26892 } else {
26893 X86::CondCode X86Cond =
26894 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26895 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26896 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26897 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26898 Cmp);
26899 }
26900 }
26901
26902 if (ISD::isOverflowIntrOpRes(Cond)) {
26903 SDValue Value, Overflow;
26904 X86::CondCode X86Cond;
26905 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26906
26907 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26908 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26909 Overflow);
26910 }
26911
26912 // Look past the truncate if the high bits are known zero.
26913 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26914 Cond = Cond.getOperand(0);
26915
26916 EVT CondVT = Cond.getValueType();
26917
26918 // Add an AND with 1 if we don't already have one.
26919 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26920 Cond =
26921 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26922
26923 SDValue LHS = Cond;
26924 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26925
26926 SDValue CCVal;
26927 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26928 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26929 EFLAGS);
26930}
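
The FCMP_OEQ/FCMP_UNE paths above lean on how the x86 unordered compare sets EFLAGS. As a minimal standalone sketch of that flag logic (an illustrative aside, not part of X86ISelLowering.cpp):

#include <cmath>

// UCOMISS/UCOMISD flag results:
//   unordered (a NaN operand): ZF=1, PF=1, CF=1
//   greater:                   ZF=0, PF=0, CF=0
//   less:                      ZF=0, PF=0, CF=1
//   equal:                     ZF=1, PF=0, CF=0
// FCMP_OEQ ("ordered and equal") is ZF && !PF, so its negation !ZF || PF is
// exactly the JNE-then-JP pair emitted toward the false block above, and
// FCMP_UNE reuses the same pair toward the true block.
static bool fcmpOEQFromFlags(bool ZF, bool PF) { return ZF && !PF; }

static bool fcmpOEQ(double A, double B) {
  bool Unordered = std::isnan(A) || std::isnan(B);
  return fcmpOEQFromFlags(/*ZF=*/Unordered || A == B, /*PF=*/Unordered);
}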
26931
26932// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26933// Calls to _alloca are needed to probe the stack when allocating more than 4k
26934// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26935// that the guard pages used by the OS virtual memory manager are allocated in
26936// correct sequence.
26937SDValue
26938X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26939 SelectionDAG &DAG) const {
26940 MachineFunction &MF = DAG.getMachineFunction();
26941 bool SplitStack = MF.shouldSplitStack();
26942 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26943 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26944 SplitStack || EmitStackProbeCall;
26945 SDLoc dl(Op);
26946
26947 // Get the inputs.
26948 SDNode *Node = Op.getNode();
26949 SDValue Chain = Op.getOperand(0);
26950 SDValue Size = Op.getOperand(1);
26951 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26952 EVT VT = Node->getValueType(0);
26953
26954 // Chain the dynamic stack allocation so that it doesn't modify the stack
26955 // pointer when other instructions are using the stack.
26956 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26957
26958 bool Is64Bit = Subtarget.is64Bit();
26959 MVT SPTy = getPointerTy(DAG.getDataLayout());
26960
26961 SDValue Result;
26962 if (!Lower) {
26963 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26964 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26965     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
26966                     " not tell us which reg is the stack pointer!");
26967
26968 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26969 const Align StackAlign = TFI.getStackAlign();
26970 if (hasInlineStackProbe(MF)) {
26971 MachineRegisterInfo &MRI = MF.getRegInfo();
26972
26973 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26974 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26975 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26976 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
26977 DAG.getRegister(Vreg, SPTy));
26978 } else {
26979 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
26980 Chain = SP.getValue(1);
26981 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
26982 }
26983 if (Alignment && *Alignment > StackAlign)
26984 Result =
26985 DAG.getNode(ISD::AND, dl, VT, Result,
26986 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
26987 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
26988 } else if (SplitStack) {
26989 MachineRegisterInfo &MRI = MF.getRegInfo();
26990
26991 if (Is64Bit) {
26992 // The 64-bit implementation of segmented stacks needs to clobber both
26993 // r10 and r11. This makes it impossible to use it along with nested parameters.
26994 const Function &F = MF.getFunction();
26995 for (const auto &A : F.args()) {
26996 if (A.hasNestAttr())
26997 report_fatal_error("Cannot use segmented stacks with functions that "
26998 "have nested arguments.");
26999 }
27000 }
27001
27002 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27003 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27004 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27005 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27006 DAG.getRegister(Vreg, SPTy));
27007 } else {
27008 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27009 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27010 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27011
27012 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27013 Register SPReg = RegInfo->getStackRegister();
27014 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27015 Chain = SP.getValue(1);
27016
27017 if (Alignment) {
27018 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27019 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27020 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27021 }
27022
27023 Result = SP;
27024 }
27025
27026 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27027
27028 SDValue Ops[2] = {Result, Chain};
27029 return DAG.getMergeValues(Ops, dl);
27030}
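
The ISD::AND with ~(Alignment->value() - 1ULL) used in both branches above is the standard power-of-two align-down mask applied to the freshly adjusted stack pointer. A small sketch of the arithmetic, assuming a power-of-two alignment (illustrative only):

#include <cassert>
#include <cstdint>

// Rounds Ptr down to a multiple of Align; mirrors the mask applied to the
// SP - Size result for overaligned dynamic allocas.
static uint64_t alignDown(uint64_t Ptr, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "power of two expected");
  return Ptr & ~(Align - 1);
}
// e.g. alignDown(0x7fffffffe3c7, 32) == 0x7fffffffe3c0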
27031
27032SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27033 MachineFunction &MF = DAG.getMachineFunction();
27034 auto PtrVT = getPointerTy(MF.getDataLayout());
27035 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27036
27037 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27038 SDLoc DL(Op);
27039
27040 if (!Subtarget.is64Bit() ||
27041 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27042 // vastart just stores the address of the VarArgsFrameIndex slot into the
27043 // memory location argument.
27044 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27045 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27046 MachinePointerInfo(SV));
27047 }
27048
27049 // __va_list_tag:
27050 // gp_offset (0 - 6 * 8)
27051 // fp_offset (48 - 48 + 8 * 16)
27052 // overflow_arg_area (point to parameters coming in memory).
27053 // reg_save_area
27054 SmallVector<SDValue, 8> MemOps;
27055 SDValue FIN = Op.getOperand(1);
27056 // Store gp_offset
27057 SDValue Store = DAG.getStore(
27058 Op.getOperand(0), DL,
27059 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27060 MachinePointerInfo(SV));
27061 MemOps.push_back(Store);
27062
27063 // Store fp_offset
27064 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27065 Store = DAG.getStore(
27066 Op.getOperand(0), DL,
27067 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27068 MachinePointerInfo(SV, 4));
27069 MemOps.push_back(Store);
27070
27071 // Store ptr to overflow_arg_area
27072 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27073 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27074 Store =
27075 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27076 MemOps.push_back(Store);
27077
27078 // Store ptr to reg_save_area.
27079 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27080 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27081 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27082 Store = DAG.getStore(
27083 Op.getOperand(0), DL, RSFIN, FIN,
27084 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27085 MemOps.push_back(Store);
27086 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27087}
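
The four stores above fill in the System V x86-64 __va_list_tag. A reference sketch of that layout (field names follow the ABI; offsets and size match the LP64 path here, and the 24-byte size is also the length LowerVACOPY copies below):

#include <cstdint>

// System V AMD64 va_list element. LowerVASTART writes, in order:
//   gp_offset at +0, fp_offset at +4, overflow_arg_area at +8,
//   reg_save_area at +16 (LP64).
struct VaListTag {
  uint32_t gp_offset;      // 0 .. 48, stepped by 8 (6 integer registers)
  uint32_t fp_offset;      // 48 .. 48 + 8 * 16 (8 vector registers)
  void *overflow_arg_area; // arguments that were passed on the stack
  void *reg_save_area;     // spilled register arguments
};
static_assert(sizeof(VaListTag) == 24, "LP64 layout");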
27088
27089SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27090   assert(Subtarget.is64Bit() &&
27091          "LowerVAARG only handles 64-bit va_arg!");
27092   assert(Op.getNumOperands() == 4);
27093
27094 MachineFunction &MF = DAG.getMachineFunction();
27095 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27096 // The Win64 ABI uses char* instead of a structure.
27097 return DAG.expandVAArg(Op.getNode());
27098
27099 SDValue Chain = Op.getOperand(0);
27100 SDValue SrcPtr = Op.getOperand(1);
27101 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27102 unsigned Align = Op.getConstantOperandVal(3);
27103 SDLoc dl(Op);
27104
27105 EVT ArgVT = Op.getNode()->getValueType(0);
27106 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27107 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27108 uint8_t ArgMode;
27109
27110 // Decide which area this value should be read from.
27111 // TODO: Implement the AMD64 ABI in its entirety. This simple
27112 // selection mechanism works only for the basic types.
27113   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27114 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27115 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27116 } else {
27117     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27118            "Unhandled argument type in LowerVAARG");
27119 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27120 }
27121
27122 if (ArgMode == 2) {
27123 // Make sure using fp_offset makes sense.
27124     assert(!Subtarget.useSoftFloat() &&
27125            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27126            Subtarget.hasSSE1());
27127 }
27128
27129 // Insert VAARG node into the DAG
27130 // VAARG returns two values: Variable Argument Address, Chain
27131 SDValue InstOps[] = {Chain, SrcPtr,
27132 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27133 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27134 DAG.getTargetConstant(Align, dl, MVT::i32)};
27135 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27136 SDValue VAARG = DAG.getMemIntrinsicNode(
27137 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27138 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27139 /*Alignment=*/std::nullopt,
27140 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27141 Chain = VAARG.getValue(1);
27142
27143 // Load the next argument and return it
27144 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27145}
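
The X86ISD::VAARG_64 pseudo emitted above is expanded later into the usual AMD64 va_arg sequence. Roughly, the gp_offset case (ArgMode == 1) behaves like this hypothetical helper (a sketch, not the actual expansion):

#include <cstdint>
#include <cstring>

struct VaListTag {
  uint32_t gp_offset, fp_offset;
  void *overflow_arg_area, *reg_save_area;
};

// Reads one 8-byte integer argument: take it from the register save area
// while gp_offset < 48, otherwise fall back to the overflow (stack) area.
// Floating-point arguments (ArgMode == 2) do the same dance with fp_offset.
static int64_t vaArgInt64(VaListTag &VL) {
  void *Addr;
  if (VL.gp_offset < 6 * 8) {
    Addr = static_cast<char *>(VL.reg_save_area) + VL.gp_offset;
    VL.gp_offset += 8;
  } else {
    Addr = VL.overflow_arg_area;
    VL.overflow_arg_area = static_cast<char *>(VL.overflow_arg_area) + 8;
  }
  int64_t V;
  std::memcpy(&V, Addr, sizeof(V));
  return V;
}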
27146
27147static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27148 SelectionDAG &DAG) {
27149 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27150 // where a va_list is still an i8*.
27151   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
27152 if (Subtarget.isCallingConvWin64(
27153 DAG.getMachineFunction().getFunction().getCallingConv()))
27154 // Probably a Win64 va_copy.
27155 return DAG.expandVACopy(Op.getNode());
27156
27157 SDValue Chain = Op.getOperand(0);
27158 SDValue DstPtr = Op.getOperand(1);
27159 SDValue SrcPtr = Op.getOperand(2);
27160 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27161 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27162 SDLoc DL(Op);
27163
27164 return DAG.getMemcpy(
27165 Chain, DL, DstPtr, SrcPtr,
27166 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27167 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27168 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27169}
27170
27171// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27172static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27173 switch (Opc) {
27174 case ISD::SHL:
27175 case X86ISD::VSHL:
27176 case X86ISD::VSHLI:
27177 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27178 case ISD::SRL:
27179 case X86ISD::VSRL:
27180 case X86ISD::VSRLI:
27181 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27182 case ISD::SRA:
27183 case X86ISD::VSRA:
27184 case X86ISD::VSRAI:
27185 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27186 }
27187 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27187)
;
27188}
27189
27190/// Handle vector element shifts where the shift amount is a constant.
27191/// Takes immediate version of shift as input.
27192static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27193 SDValue SrcOp, uint64_t ShiftAmt,
27194 SelectionDAG &DAG) {
27195 MVT ElementType = VT.getVectorElementType();
27196
27197 // Bitcast the source vector to the output type, this is mainly necessary for
27198 // vXi8/vXi64 shifts.
27199 if (VT != SrcOp.getSimpleValueType())
27200 SrcOp = DAG.getBitcast(VT, SrcOp);
27201
27202 // Fold this packed shift into its first operand if ShiftAmt is 0.
27203 if (ShiftAmt == 0)
27204 return SrcOp;
27205
27206 // Check for ShiftAmt >= element width
27207 if (ShiftAmt >= ElementType.getSizeInBits()) {
27208 if (Opc == X86ISD::VSRAI)
27209 ShiftAmt = ElementType.getSizeInBits() - 1;
27210 else
27211 return DAG.getConstant(0, dl, VT);
27212 }
27213
27214   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27215          && "Unknown target vector shift-by-constant node");
27216
27217 // Fold this packed vector shift into a build vector if SrcOp is a
27218 // vector of Constants or UNDEFs.
27219 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27220 unsigned ShiftOpc;
27221 switch (Opc) {
27222     default: llvm_unreachable("Unknown opcode!");
27223 case X86ISD::VSHLI:
27224 ShiftOpc = ISD::SHL;
27225 break;
27226 case X86ISD::VSRLI:
27227 ShiftOpc = ISD::SRL;
27228 break;
27229 case X86ISD::VSRAI:
27230 ShiftOpc = ISD::SRA;
27231 break;
27232 }
27233
27234 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27235 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27236 return C;
27237 }
27238
27239 return DAG.getNode(Opc, dl, VT, SrcOp,
27240 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27241}
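
The out-of-range handling above (fold to zero for logical shifts, clamp to BitWidth - 1 for arithmetic shifts) mirrors the scalar semantics of the underlying SSE instructions. A tiny per-lane sketch for i16 elements:

#include <cstdint>

// One v8i16 lane when the immediate is >= 16: VSRAI saturates to a shift by
// 15 (all sign bits), while VSRLI/VSHLI simply produce zero, matching
// getTargetVShiftByConstNode above.
static int16_t sraLane(int16_t X, unsigned Amt) {
  return static_cast<int16_t>(X >> (Amt >= 16 ? 15 : Amt));
}
static uint16_t srlLane(uint16_t X, unsigned Amt) {
  return Amt >= 16 ? 0 : static_cast<uint16_t>(X >> Amt);
}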
27242
27243/// Handle vector element shifts by a splat shift amount
27244static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27245 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27246 const X86Subtarget &Subtarget,
27247 SelectionDAG &DAG) {
27248 MVT AmtVT = ShAmt.getSimpleValueType();
27249   assert(AmtVT.isVector() && "Vector shift type mismatch");
27250   assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27251          "Illegal vector splat index");
27252
27253 // Move the splat element to the bottom element.
27254 if (ShAmtIdx != 0) {
27255 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27256 Mask[0] = ShAmtIdx;
27257 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27258 }
27259
27260 // Peek through any zext node if we can get back to a 128-bit source.
27261 if (AmtVT.getScalarSizeInBits() == 64 &&
27262 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27263 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27264 ShAmt.getOperand(0).getValueType().isSimple() &&
27265 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27266 ShAmt = ShAmt.getOperand(0);
27267 AmtVT = ShAmt.getSimpleValueType();
27268 }
27269
27270 // See if we can mask off the upper elements using the existing source node.
27271 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27272 // do this for vXi64 types.
27273 bool IsMasked = false;
27274 if (AmtVT.getScalarSizeInBits() < 64) {
27275 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27276 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27277 // If the shift amount has come from a scalar, then zero-extend the scalar
27278 // before moving to the vector.
27279 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27280 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27281 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27282 AmtVT = MVT::v4i32;
27283 IsMasked = true;
27284 } else if (ShAmt.getOpcode() == ISD::AND) {
27285 // See if the shift amount is already masked (e.g. for rotation modulo),
27286 // then we can zero-extend it by setting all the other mask elements to
27287 // zero.
27288 SmallVector<SDValue> MaskElts(
27289 AmtVT.getVectorNumElements(),
27290 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27291 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27292 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27293 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27294 {ShAmt.getOperand(1), Mask}))) {
27295 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27296 IsMasked = true;
27297 }
27298 }
27299 }
27300
27301 // Extract if the shift amount vector is larger than 128-bits.
27302 if (AmtVT.getSizeInBits() > 128) {
27303 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27304 AmtVT = ShAmt.getSimpleValueType();
27305 }
27306
27307 // Zero-extend bottom element to v2i64 vector type, either by extension or
27308 // shuffle masking.
27309 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27310 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27311 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27312 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27313 } else if (Subtarget.hasSSE41()) {
27314 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27315 MVT::v2i64, ShAmt);
27316 } else {
27317 SDValue ByteShift = DAG.getTargetConstant(
27318 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27319 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27320 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27321 ByteShift);
27322 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27323 ByteShift);
27324 }
27325 }
27326
27327 // Change opcode to non-immediate version.
27328 Opc = getTargetVShiftUniformOpcode(Opc, true);
27329
27330 // The return type has to be a 128-bit type with the same element
27331 // type as the input type.
27332 MVT EltVT = VT.getVectorElementType();
27333 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27334
27335 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27336 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27337}
27338
27339/// Return Mask with the necessary casting or extending
27340/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27341static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27342 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27343 const SDLoc &dl) {
27344
27345 if (isAllOnesConstant(Mask))
27346 return DAG.getConstant(1, dl, MaskVT);
27347 if (X86::isZeroNode(Mask))
27348 return DAG.getConstant(0, dl, MaskVT);
27349
27350   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27351
27352 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27353     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27354     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27355     // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
27356 SDValue Lo, Hi;
27357 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27358 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27359 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27360 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27361 } else {
27362 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27363 Mask.getSimpleValueType().getSizeInBits());
27364     // When MaskVT is v2i1 or v4i1, only the low 2 or 4 elements are
27365     // extracted by EXTRACT_SUBVECTOR.
27366 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27367 DAG.getBitcast(BitcastVT, Mask),
27368 DAG.getIntPtrConstant(0, dl));
27369 }
27370}
27371
27372/// Return (and \p Op, \p Mask) for compare instructions or
27373/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27374/// necessary casting or extending for \p Mask when lowering masking intrinsics
27375static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27376 SDValue PreservedSrc,
27377 const X86Subtarget &Subtarget,
27378 SelectionDAG &DAG) {
27379 MVT VT = Op.getSimpleValueType();
27380 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27381 unsigned OpcodeSelect = ISD::VSELECT;
27382 SDLoc dl(Op);
27383
27384 if (isAllOnesConstant(Mask))
27385 return Op;
27386
27387 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27388
27389 if (PreservedSrc.isUndef())
27390 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27391 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27392}
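
Element-wise, the vselect built here (with the mask converted by getMaskNode) reduces to a simple per-lane choice. A sketch for an 8-lane vector driven by an i8 mask:

#include <array>
#include <cstdint>

// Bit I of Mask selects Op[I]; a clear bit keeps PassThru[I] (which the
// code above replaces with zero when the pass-through operand is undef).
template <typename T>
std::array<T, 8> maskSelect(uint8_t Mask, const std::array<T, 8> &Op,
                            const std::array<T, 8> &PassThru) {
  std::array<T, 8> R{};
  for (unsigned I = 0; I != 8; ++I)
    R[I] = ((Mask >> I) & 1) ? Op[I] : PassThru[I];
  return R;
}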
27393
27394/// Creates an SDNode for a predicated scalar operation.
27395/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27396/// The mask is coming as MVT::i8 and it should be transformed
27397/// to MVT::v1i1 while lowering masking intrinsics.
27398/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27399/// "X86select" instead of "vselect". We just can't create the "vselect" node
27400/// for a scalar instruction.
27401static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27402 SDValue PreservedSrc,
27403 const X86Subtarget &Subtarget,
27404 SelectionDAG &DAG) {
27405
27406 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27407 if (MaskConst->getZExtValue() & 0x1)
27408 return Op;
27409
27410 MVT VT = Op.getSimpleValueType();
27411 SDLoc dl(Op);
27412
27413   assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27414 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27415 DAG.getBitcast(MVT::v8i1, Mask),
27416 DAG.getIntPtrConstant(0, dl));
27417 if (Op.getOpcode() == X86ISD::FSETCCM ||
27418 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27419 Op.getOpcode() == X86ISD::VFPCLASSS)
27420 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27421
27422 if (PreservedSrc.isUndef())
27423 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27424 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27425}
27426
27427static int getSEHRegistrationNodeSize(const Function *Fn) {
27428 if (!Fn->hasPersonalityFn())
27429 report_fatal_error(
27430 "querying registration node size for function without personality");
27431 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27432 // WinEHStatePass for the full struct definition.
27433 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27434 case EHPersonality::MSVC_X86SEH: return 24;
27435 case EHPersonality::MSVC_CXX: return 16;
27436 default: break;
27437 }
27438 report_fatal_error(
27439 "can only recover FP for 32-bit MSVC EH personality functions");
27440}
27441
27442/// When the MSVC runtime transfers control to us, either to an outlined
27443/// function or when returning to a parent frame after catching an exception, we
27444/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27445/// Here's the math:
27446/// RegNodeBase = EntryEBP - RegNodeSize
27447/// ParentFP = RegNodeBase - ParentFrameOffset
27448/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27449/// subtracting the offset (negative on x86) takes us back to the parent FP.
27450static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27451 SDValue EntryEBP) {
27452 MachineFunction &MF = DAG.getMachineFunction();
27453 SDLoc dl;
27454
27455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27456 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27457
27458 // It's possible that the parent function no longer has a personality function
27459 // if the exceptional code was optimized away, in which case we just return
27460 // the incoming EBP.
27461 if (!Fn->hasPersonalityFn())
27462 return EntryEBP;
27463
27464 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27465 // registration, or the .set_setframe offset.
27466 MCSymbol *OffsetSym =
27467 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27468 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27469 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27470 SDValue ParentFrameOffset =
27471 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27472
27473 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27474 // prologue to RBP in the parent function.
27475 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27476 if (Subtarget.is64Bit())
27477 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27478
27479 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27480 // RegNodeBase = EntryEBP - RegNodeSize
27481 // ParentFP = RegNodeBase - ParentFrameOffset
27482 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27483 DAG.getConstant(RegNodeSize, dl, PtrVT));
27484 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27485}
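
Combining the comment above with the RegNodeSize values from getSEHRegistrationNodeSize, the 32-bit recovery is just two subtractions. A small sketch with hypothetical names:

#include <cstdint>

// 32-bit MSVC EH frame recovery:
//   RegNodeBase = EntryEBP - RegNodeSize  (24 for MSVC_X86SEH, 16 for MSVC_CXX)
//   ParentFP    = RegNodeBase - ParentFrameOffset
// ParentFrameOffset is what the LOCAL_RECOVER node resolves to; it is
// negative on x86, so subtracting it moves back up to the parent FP.
static uintptr_t recoverParentFP(uintptr_t EntryEBP, unsigned RegNodeSize,
                                 intptr_t ParentFrameOffset) {
  uintptr_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}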
27486
27487SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27488 SelectionDAG &DAG) const {
27489 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27490 auto isRoundModeCurDirection = [](SDValue Rnd) {
27491 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27492 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27493
27494 return false;
27495 };
27496 auto isRoundModeSAE = [](SDValue Rnd) {
27497 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27498 unsigned RC = C->getZExtValue();
27499 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27500 // Clear the NO_EXC bit and check remaining bits.
27501 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27502 // As a convenience we allow no other bits or explicitly
27503 // current direction.
27504 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27505 }
27506 }
27507
27508 return false;
27509 };
27510 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27511 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27512 RC = C->getZExtValue();
27513 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27514 // Clear the NO_EXC bit and check remaining bits.
27515 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27516 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27517 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27518 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27519 RC == X86::STATIC_ROUNDING::TO_ZERO;
27520 }
27521 }
27522
27523 return false;
27524 };
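  // Aside (annotation, not in the original file): the rounding operand uses
  // the X86::STATIC_ROUNDING encoding -- TO_NEAREST_INT..TO_ZERO are 0..3,
  // CUR_DIRECTION is 4, and NO_EXC is the 0x8 bit. For example, an intrinsic
  // called with _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC passes 0xB; the
  // isRoundModeSAEToX lambda strips the 0x8 bit and reports RC == TO_ZERO,
  // while a plain CUR_DIRECTION (4) operand takes the default-rounding path
  // in the cases below.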
27525
27526 SDLoc dl(Op);
27527 unsigned IntNo = Op.getConstantOperandVal(0);
27528 MVT VT = Op.getSimpleValueType();
27529 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27530
27531 // Propagate flags from original node to transformed node(s).
27532 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27533
27534 if (IntrData) {
27535 switch(IntrData->Type) {
27536 case INTR_TYPE_1OP: {
27537 // We specify 2 possible opcodes for intrinsics with rounding modes.
27538 // First, we check if the intrinsic may have non-default rounding mode,
27539 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27540 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27541 if (IntrWithRoundingModeOpcode != 0) {
27542 SDValue Rnd = Op.getOperand(2);
27543 unsigned RC = 0;
27544 if (isRoundModeSAEToX(Rnd, RC))
27545 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27546 Op.getOperand(1),
27547 DAG.getTargetConstant(RC, dl, MVT::i32));
27548 if (!isRoundModeCurDirection(Rnd))
27549 return SDValue();
27550 }
27551 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27552 Op.getOperand(1));
27553 }
27554 case INTR_TYPE_1OP_SAE: {
27555 SDValue Sae = Op.getOperand(2);
27556
27557 unsigned Opc;
27558 if (isRoundModeCurDirection(Sae))
27559 Opc = IntrData->Opc0;
27560 else if (isRoundModeSAE(Sae))
27561 Opc = IntrData->Opc1;
27562 else
27563 return SDValue();
27564
27565 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27566 }
27567 case INTR_TYPE_2OP: {
27568 SDValue Src2 = Op.getOperand(2);
27569
27570 // We specify 2 possible opcodes for intrinsics with rounding modes.
27571 // First, we check if the intrinsic may have non-default rounding mode,
27572 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27573 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27574 if (IntrWithRoundingModeOpcode != 0) {
27575 SDValue Rnd = Op.getOperand(3);
27576 unsigned RC = 0;
27577 if (isRoundModeSAEToX(Rnd, RC))
27578 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27579 Op.getOperand(1), Src2,
27580 DAG.getTargetConstant(RC, dl, MVT::i32));
27581 if (!isRoundModeCurDirection(Rnd))
27582 return SDValue();
27583 }
27584
27585 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27586 Op.getOperand(1), Src2);
27587 }
27588 case INTR_TYPE_2OP_SAE: {
27589 SDValue Sae = Op.getOperand(3);
27590
27591 unsigned Opc;
27592 if (isRoundModeCurDirection(Sae))
27593 Opc = IntrData->Opc0;
27594 else if (isRoundModeSAE(Sae))
27595 Opc = IntrData->Opc1;
27596 else
27597 return SDValue();
27598
27599 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27600 Op.getOperand(2));
27601 }
27602 case INTR_TYPE_3OP:
27603 case INTR_TYPE_3OP_IMM8: {
27604 SDValue Src1 = Op.getOperand(1);
27605 SDValue Src2 = Op.getOperand(2);
27606 SDValue Src3 = Op.getOperand(3);
27607
27608 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27609 Src3.getValueType() != MVT::i8) {
27610 Src3 = DAG.getTargetConstant(
27611 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27612 }
27613
27614 // We specify 2 possible opcodes for intrinsics with rounding modes.
27615 // First, we check if the intrinsic may have non-default rounding mode,
27616 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27617 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27618 if (IntrWithRoundingModeOpcode != 0) {
27619 SDValue Rnd = Op.getOperand(4);
27620 unsigned RC = 0;
27621 if (isRoundModeSAEToX(Rnd, RC))
27622 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27623 Src1, Src2, Src3,
27624 DAG.getTargetConstant(RC, dl, MVT::i32));
27625 if (!isRoundModeCurDirection(Rnd))
27626 return SDValue();
27627 }
27628
27629 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27630 {Src1, Src2, Src3});
27631 }
27632 case INTR_TYPE_4OP_IMM8: {
27633       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27634 SDValue Src4 = Op.getOperand(4);
27635 if (Src4.getValueType() != MVT::i8) {
27636 Src4 = DAG.getTargetConstant(
27637 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27638 }
27639
27640 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27641 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27642 Src4);
27643 }
27644 case INTR_TYPE_1OP_MASK: {
27645 SDValue Src = Op.getOperand(1);
27646 SDValue PassThru = Op.getOperand(2);
27647 SDValue Mask = Op.getOperand(3);
27648 // We add rounding mode to the Node when
27649 // - RC Opcode is specified and
27650 // - RC is not "current direction".
27651 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27652 if (IntrWithRoundingModeOpcode != 0) {
27653 SDValue Rnd = Op.getOperand(4);
27654 unsigned RC = 0;
27655 if (isRoundModeSAEToX(Rnd, RC))
27656 return getVectorMaskingNode(
27657 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27658 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27659 Mask, PassThru, Subtarget, DAG);
27660 if (!isRoundModeCurDirection(Rnd))
27661 return SDValue();
27662 }
27663 return getVectorMaskingNode(
27664 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27665 Subtarget, DAG);
27666 }
27667 case INTR_TYPE_1OP_MASK_SAE: {
27668 SDValue Src = Op.getOperand(1);
27669 SDValue PassThru = Op.getOperand(2);
27670 SDValue Mask = Op.getOperand(3);
27671 SDValue Rnd = Op.getOperand(4);
27672
27673 unsigned Opc;
27674 if (isRoundModeCurDirection(Rnd))
27675 Opc = IntrData->Opc0;
27676 else if (isRoundModeSAE(Rnd))
27677 Opc = IntrData->Opc1;
27678 else
27679 return SDValue();
27680
27681 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27682 Subtarget, DAG);
27683 }
27684 case INTR_TYPE_SCALAR_MASK: {
27685 SDValue Src1 = Op.getOperand(1);
27686 SDValue Src2 = Op.getOperand(2);
27687 SDValue passThru = Op.getOperand(3);
27688 SDValue Mask = Op.getOperand(4);
27689 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27690 // There are 2 kinds of intrinsics in this group:
27691 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
27692 // (2) With rounding mode and sae - 7 operands.
27693 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27694 if (Op.getNumOperands() == (5U + HasRounding)) {
27695 if (HasRounding) {
27696 SDValue Rnd = Op.getOperand(5);
27697 unsigned RC = 0;
27698 if (isRoundModeSAEToX(Rnd, RC))
27699 return getScalarMaskingNode(
27700 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27701 DAG.getTargetConstant(RC, dl, MVT::i32)),
27702 Mask, passThru, Subtarget, DAG);
27703 if (!isRoundModeCurDirection(Rnd))
27704 return SDValue();
27705 }
27706 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27707 Src2),
27708 Mask, passThru, Subtarget, DAG);
27709 }
27710
27711       assert(Op.getNumOperands() == (6U + HasRounding) &&
27712              "Unexpected intrinsic form");
27713 SDValue RoundingMode = Op.getOperand(5);
27714 unsigned Opc = IntrData->Opc0;
27715 if (HasRounding) {
27716 SDValue Sae = Op.getOperand(6);
27717 if (isRoundModeSAE(Sae))
27718 Opc = IntrWithRoundingModeOpcode;
27719 else if (!isRoundModeCurDirection(Sae))
27720 return SDValue();
27721 }
27722 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27723 Src2, RoundingMode),
27724 Mask, passThru, Subtarget, DAG);
27725 }
27726 case INTR_TYPE_SCALAR_MASK_RND: {
27727 SDValue Src1 = Op.getOperand(1);
27728 SDValue Src2 = Op.getOperand(2);
27729 SDValue passThru = Op.getOperand(3);
27730 SDValue Mask = Op.getOperand(4);
27731 SDValue Rnd = Op.getOperand(5);
27732
27733 SDValue NewOp;
27734 unsigned RC = 0;
27735 if (isRoundModeCurDirection(Rnd))
27736 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27737 else if (isRoundModeSAEToX(Rnd, RC))
27738 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27739 DAG.getTargetConstant(RC, dl, MVT::i32));
27740 else
27741 return SDValue();
27742
27743 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27744 }
27745 case INTR_TYPE_SCALAR_MASK_SAE: {
27746 SDValue Src1 = Op.getOperand(1);
27747 SDValue Src2 = Op.getOperand(2);
27748 SDValue passThru = Op.getOperand(3);
27749 SDValue Mask = Op.getOperand(4);
27750 SDValue Sae = Op.getOperand(5);
27751 unsigned Opc;
27752 if (isRoundModeCurDirection(Sae))
27753 Opc = IntrData->Opc0;
27754 else if (isRoundModeSAE(Sae))
27755 Opc = IntrData->Opc1;
27756 else
27757 return SDValue();
27758
27759 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27760 Mask, passThru, Subtarget, DAG);
27761 }
27762 case INTR_TYPE_2OP_MASK: {
27763 SDValue Src1 = Op.getOperand(1);
27764 SDValue Src2 = Op.getOperand(2);
27765 SDValue PassThru = Op.getOperand(3);
27766 SDValue Mask = Op.getOperand(4);
27767 SDValue NewOp;
27768 if (IntrData->Opc1 != 0) {
27769 SDValue Rnd = Op.getOperand(5);
27770 unsigned RC = 0;
27771 if (isRoundModeSAEToX(Rnd, RC))
27772 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27773 DAG.getTargetConstant(RC, dl, MVT::i32));
27774 else if (!isRoundModeCurDirection(Rnd))
27775 return SDValue();
27776 }
27777 if (!NewOp)
27778 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27779 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27780 }
27781 case INTR_TYPE_2OP_MASK_SAE: {
27782 SDValue Src1 = Op.getOperand(1);
27783 SDValue Src2 = Op.getOperand(2);
27784 SDValue PassThru = Op.getOperand(3);
27785 SDValue Mask = Op.getOperand(4);
27786
27787 unsigned Opc = IntrData->Opc0;
27788 if (IntrData->Opc1 != 0) {
27789 SDValue Sae = Op.getOperand(5);
27790 if (isRoundModeSAE(Sae))
27791 Opc = IntrData->Opc1;
27792 else if (!isRoundModeCurDirection(Sae))
27793 return SDValue();
27794 }
27795
27796 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27797 Mask, PassThru, Subtarget, DAG);
27798 }
27799 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27800 SDValue Src1 = Op.getOperand(1);
27801 SDValue Src2 = Op.getOperand(2);
27802 SDValue Src3 = Op.getOperand(3);
27803 SDValue PassThru = Op.getOperand(4);
27804 SDValue Mask = Op.getOperand(5);
27805 SDValue Sae = Op.getOperand(6);
27806 unsigned Opc;
27807 if (isRoundModeCurDirection(Sae))
27808 Opc = IntrData->Opc0;
27809 else if (isRoundModeSAE(Sae))
27810 Opc = IntrData->Opc1;
27811 else
27812 return SDValue();
27813
27814 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27815 Mask, PassThru, Subtarget, DAG);
27816 }
27817 case INTR_TYPE_3OP_MASK_SAE: {
27818 SDValue Src1 = Op.getOperand(1);
27819 SDValue Src2 = Op.getOperand(2);
27820 SDValue Src3 = Op.getOperand(3);
27821 SDValue PassThru = Op.getOperand(4);
27822 SDValue Mask = Op.getOperand(5);
27823
27824 unsigned Opc = IntrData->Opc0;
27825 if (IntrData->Opc1 != 0) {
27826 SDValue Sae = Op.getOperand(6);
27827 if (isRoundModeSAE(Sae))
27828 Opc = IntrData->Opc1;
27829 else if (!isRoundModeCurDirection(Sae))
27830 return SDValue();
27831 }
27832 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27833 Mask, PassThru, Subtarget, DAG);
27834 }
27835 case BLENDV: {
27836 SDValue Src1 = Op.getOperand(1);
27837 SDValue Src2 = Op.getOperand(2);
27838 SDValue Src3 = Op.getOperand(3);
27839
27840 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27841 Src3 = DAG.getBitcast(MaskVT, Src3);
27842
27843 // Reverse the operands to match VSELECT order.
27844 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27845 }
27846 case VPERM_2OP : {
27847 SDValue Src1 = Op.getOperand(1);
27848 SDValue Src2 = Op.getOperand(2);
27849
27850 // Swap Src1 and Src2 in the node creation
27851 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
27852 }
27853 case CFMA_OP_MASKZ:
27854 case CFMA_OP_MASK: {
27855 SDValue Src1 = Op.getOperand(1);
27856 SDValue Src2 = Op.getOperand(2);
27857 SDValue Src3 = Op.getOperand(3);
27858 SDValue Mask = Op.getOperand(4);
27859 MVT VT = Op.getSimpleValueType();
27860
27861 SDValue PassThru = Src3;
27862 if (IntrData->Type == CFMA_OP_MASKZ)
27863 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27864
27865 // We add rounding mode to the Node when
27866 // - RC Opcode is specified and
27867 // - RC is not "current direction".
27868 SDValue NewOp;
27869 if (IntrData->Opc1 != 0) {
27870 SDValue Rnd = Op.getOperand(5);
27871 unsigned RC = 0;
27872 if (isRoundModeSAEToX(Rnd, RC))
27873 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27874 DAG.getTargetConstant(RC, dl, MVT::i32));
27875 else if (!isRoundModeCurDirection(Rnd))
27876 return SDValue();
27877 }
27878 if (!NewOp)
27879 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27880 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27881 }
27882 case IFMA_OP:
27883 // NOTE: We need to swizzle the operands to pass the multiply operands
27884 // first.
27885 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27886 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27887 case FPCLASSS: {
27888 SDValue Src1 = Op.getOperand(1);
27889 SDValue Imm = Op.getOperand(2);
27890 SDValue Mask = Op.getOperand(3);
27891 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27892 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27893 Subtarget, DAG);
27894 // Need to fill with zeros to ensure the bitcast will produce zeroes
27895 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27896 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27897 DAG.getConstant(0, dl, MVT::v8i1),
27898 FPclassMask, DAG.getIntPtrConstant(0, dl));
27899 return DAG.getBitcast(MVT::i8, Ins);
27900 }
27901
27902 case CMP_MASK_CC: {
27903 MVT MaskVT = Op.getSimpleValueType();
27904 SDValue CC = Op.getOperand(3);
27905 SDValue Mask = Op.getOperand(4);
27906 // We specify 2 possible opcodes for intrinsics with rounding modes.
27907 // First, we check if the intrinsic may have non-default rounding mode,
27908 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27909 if (IntrData->Opc1 != 0) {
27910 SDValue Sae = Op.getOperand(5);
27911 if (isRoundModeSAE(Sae))
27912 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27913 Op.getOperand(2), CC, Mask, Sae);
27914 if (!isRoundModeCurDirection(Sae))
27915 return SDValue();
27916 }
27917 //default rounding mode
27918 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27919 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27920 }
27921 case CMP_MASK_SCALAR_CC: {
27922 SDValue Src1 = Op.getOperand(1);
27923 SDValue Src2 = Op.getOperand(2);
27924 SDValue CC = Op.getOperand(3);
27925 SDValue Mask = Op.getOperand(4);
27926
27927 SDValue Cmp;
27928 if (IntrData->Opc1 != 0) {
27929 SDValue Sae = Op.getOperand(5);
27930 if (isRoundModeSAE(Sae))
27931 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27932 else if (!isRoundModeCurDirection(Sae))
27933 return SDValue();
27934 }
27935 //default rounding mode
27936 if (!Cmp.getNode())
27937 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27938
27939 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27940 Subtarget, DAG);
27941 // Need to fill with zeros to ensure the bitcast will produce zeroes
27942 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27943 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27944 DAG.getConstant(0, dl, MVT::v8i1),
27945 CmpMask, DAG.getIntPtrConstant(0, dl));
27946 return DAG.getBitcast(MVT::i8, Ins);
27947 }
27948 case COMI: { // Comparison intrinsics
27949 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27950 SDValue LHS = Op.getOperand(1);
27951 SDValue RHS = Op.getOperand(2);
27952 // Some conditions require the operands to be swapped.
27953 if (CC == ISD::SETLT || CC == ISD::SETLE)
27954 std::swap(LHS, RHS);
27955
27956 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27957 SDValue SetCC;
27958 switch (CC) {
27959 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
27960 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27961 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27962 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27963 break;
27964 }
27965 case ISD::SETNE: { // (ZF = 1 or PF = 1)
27966 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27967 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27968 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
27969 break;
27970 }
27971 case ISD::SETGT: // (CF = 0 and ZF = 0)
27972 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
27973 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
27974 break;
27975 }
27976 case ISD::SETGE: // CF = 0
27977 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
27978 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
27979 break;
27980 default:
27981 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27981)
;
27982 }
27983 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27984 }
27985 case COMI_RM: { // Comparison intrinsics with Sae
27986 SDValue LHS = Op.getOperand(1);
27987 SDValue RHS = Op.getOperand(2);
27988 unsigned CondVal = Op.getConstantOperandVal(3);
27989 SDValue Sae = Op.getOperand(4);
27990
27991 SDValue FCmp;
27992 if (isRoundModeCurDirection(Sae))
27993 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
27994 DAG.getTargetConstant(CondVal, dl, MVT::i8));
27995 else if (isRoundModeSAE(Sae))
27996 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
27997 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
27998 else
27999 return SDValue();
28000 // Need to fill with zeros to ensure the bitcast will produce zeroes
28001 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28002 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28003 DAG.getConstant(0, dl, MVT::v16i1),
28004 FCmp, DAG.getIntPtrConstant(0, dl));
28005 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28006 DAG.getBitcast(MVT::i16, Ins));
28007 }
28008 case VSHIFT: {
28009 SDValue SrcOp = Op.getOperand(1);
28010 SDValue ShAmt = Op.getOperand(2);
28011       assert(ShAmt.getValueType() == MVT::i32 &&
28012              "Unexpected VSHIFT amount type");
28013
28014 // Catch shift-by-constant.
28015 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28016 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28017 Op.getSimpleValueType(), SrcOp,
28018 CShAmt->getZExtValue(), DAG);
28019
28020 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28021 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28022 SrcOp, ShAmt, 0, Subtarget, DAG);
28023 }
28024 case COMPRESS_EXPAND_IN_REG: {
28025 SDValue Mask = Op.getOperand(3);
28026 SDValue DataToCompress = Op.getOperand(1);
28027 SDValue PassThru = Op.getOperand(2);
28028 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28029 return Op.getOperand(1);
28030
28031 // Avoid false dependency.
28032 if (PassThru.isUndef())
28033 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28034
28035 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28036 Mask);
28037 }
28038 case FIXUPIMM:
28039 case FIXUPIMM_MASKZ: {
28040 SDValue Src1 = Op.getOperand(1);
28041 SDValue Src2 = Op.getOperand(2);
28042 SDValue Src3 = Op.getOperand(3);
28043 SDValue Imm = Op.getOperand(4);
28044 SDValue Mask = Op.getOperand(5);
28045 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28046 ? Src1
28047 : getZeroVector(VT, Subtarget, DAG, dl);
28048
28049 unsigned Opc = IntrData->Opc0;
28050 if (IntrData->Opc1 != 0) {
28051 SDValue Sae = Op.getOperand(6);
28052 if (isRoundModeSAE(Sae))
28053 Opc = IntrData->Opc1;
28054 else if (!isRoundModeCurDirection(Sae))
28055 return SDValue();
28056 }
28057
28058 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28059
28060 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28061 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28062
28063 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28064 }
28065 case ROUNDP: {
28066 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28067 // Clear the upper bits of the rounding immediate so that the legacy
28068 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28069 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28070 SDValue RoundingMode =
28071 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28072 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28073 Op.getOperand(1), RoundingMode);
28074 }
28075 case ROUNDS: {
28076 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28077 // Clear the upper bits of the rounding immediate so that the legacy
28078 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28079 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28080 SDValue RoundingMode =
28081 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28082 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28083 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28084 }
28085 case BEXTRI: {
28086 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28087
28088 uint64_t Imm = Op.getConstantOperandVal(2);
28089 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28090 Op.getValueType());
28091 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28092 Op.getOperand(1), Control);
28093 }
28094 // ADC/ADCX/SBB
28095 case ADX: {
28096 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28097 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28098
28099 SDValue Res;
28100 // If the carry in is zero, then we should just use ADD/SUB instead of
28101 // ADC/SBB.
28102 if (isNullConstant(Op.getOperand(1))) {
28103 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28104 Op.getOperand(3));
28105 } else {
28106 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28107 DAG.getConstant(-1, dl, MVT::i8));
28108 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28109 Op.getOperand(3), GenCF.getValue(1));
28110 }
28111 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28112 SDValue Results[] = { SetCC, Res };
28113 return DAG.getMergeValues(Results, dl);
28114 }
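
As the comment above notes, a constant-zero carry-in lets the ADX case select a plain ADD/SUB, while any other carry-in is first turned into CF and consumed by ADC/SBB. A user-level sketch of the corresponding carry-chain intrinsic, assuming <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned lo, hi;
  // Carry-in of 0: eligible for the plain-ADD path in the lowering above.
  unsigned char carry = _addcarry_u32(0, 0xffffffffu, 1u, &lo);
  // Carry-in produced by the previous step: needs ADC.
  carry = _addcarry_u32(carry, 10u, 20u, &hi);
  std::printf("lo=%u hi=%u carry=%u\n", lo, hi, (unsigned)carry);
  return 0;
}
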
28115 case CVTPD2PS_MASK:
28116 case CVTPD2DQ_MASK:
28117 case CVTQQ2PS_MASK:
28118 case TRUNCATE_TO_REG: {
28119 SDValue Src = Op.getOperand(1);
28120 SDValue PassThru = Op.getOperand(2);
28121 SDValue Mask = Op.getOperand(3);
28122
28123 if (isAllOnesConstant(Mask))
28124 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28125
28126 MVT SrcVT = Src.getSimpleValueType();
28127 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28128 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28129 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28130 {Src, PassThru, Mask});
28131 }
28132 case CVTPS2PH_MASK: {
28133 SDValue Src = Op.getOperand(1);
28134 SDValue Rnd = Op.getOperand(2);
28135 SDValue PassThru = Op.getOperand(3);
28136 SDValue Mask = Op.getOperand(4);
28137
28138 unsigned RC = 0;
28139 unsigned Opc = IntrData->Opc0;
28140 bool SAE = Src.getValueType().is512BitVector() &&
28141 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28142 if (SAE) {
28143 Opc = X86ISD::CVTPS2PH_SAE;
28144 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28145 }
28146
28147 if (isAllOnesConstant(Mask))
28148 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28149
28150 if (SAE)
28151 Opc = X86ISD::MCVTPS2PH_SAE;
28152 else
28153 Opc = IntrData->Opc1;
28154 MVT SrcVT = Src.getSimpleValueType();
28155 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28156 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28157 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28158 }
28159 case CVTNEPS2BF16_MASK: {
28160 SDValue Src = Op.getOperand(1);
28161 SDValue PassThru = Op.getOperand(2);
28162 SDValue Mask = Op.getOperand(3);
28163
28164 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28165 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28166
28167 // Break false dependency.
28168 if (PassThru.isUndef())
28169 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28170
28171 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28172 Mask);
28173 }
28174 default:
28175 break;
28176 }
28177 }
28178
28179 switch (IntNo) {
28180 default: return SDValue(); // Don't custom lower most intrinsics.
28181
28182 // ptest and testp intrinsics. The intrinsics these come from are designed to
28183 // return an integer value, not just an instruction, so lower them to the ptest
28184 // or testp pattern and a setcc for the result.
28185 case Intrinsic::x86_avx512_ktestc_b:
28186 case Intrinsic::x86_avx512_ktestc_w:
28187 case Intrinsic::x86_avx512_ktestc_d:
28188 case Intrinsic::x86_avx512_ktestc_q:
28189 case Intrinsic::x86_avx512_ktestz_b:
28190 case Intrinsic::x86_avx512_ktestz_w:
28191 case Intrinsic::x86_avx512_ktestz_d:
28192 case Intrinsic::x86_avx512_ktestz_q:
28193 case Intrinsic::x86_sse41_ptestz:
28194 case Intrinsic::x86_sse41_ptestc:
28195 case Intrinsic::x86_sse41_ptestnzc:
28196 case Intrinsic::x86_avx_ptestz_256:
28197 case Intrinsic::x86_avx_ptestc_256:
28198 case Intrinsic::x86_avx_ptestnzc_256:
28199 case Intrinsic::x86_avx_vtestz_ps:
28200 case Intrinsic::x86_avx_vtestc_ps:
28201 case Intrinsic::x86_avx_vtestnzc_ps:
28202 case Intrinsic::x86_avx_vtestz_pd:
28203 case Intrinsic::x86_avx_vtestc_pd:
28204 case Intrinsic::x86_avx_vtestnzc_pd:
28205 case Intrinsic::x86_avx_vtestz_ps_256:
28206 case Intrinsic::x86_avx_vtestc_ps_256:
28207 case Intrinsic::x86_avx_vtestnzc_ps_256:
28208 case Intrinsic::x86_avx_vtestz_pd_256:
28209 case Intrinsic::x86_avx_vtestc_pd_256:
28210 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28211 unsigned TestOpc = X86ISD::PTEST;
28212 X86::CondCode X86CC;
28213 switch (IntNo) {
28214 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
28215 case Intrinsic::x86_avx512_ktestc_b:
28216 case Intrinsic::x86_avx512_ktestc_w:
28217 case Intrinsic::x86_avx512_ktestc_d:
28218 case Intrinsic::x86_avx512_ktestc_q:
28219 // CF = 1
28220 TestOpc = X86ISD::KTEST;
28221 X86CC = X86::COND_B;
28222 break;
28223 case Intrinsic::x86_avx512_ktestz_b:
28224 case Intrinsic::x86_avx512_ktestz_w:
28225 case Intrinsic::x86_avx512_ktestz_d:
28226 case Intrinsic::x86_avx512_ktestz_q:
28227 TestOpc = X86ISD::KTEST;
28228 X86CC = X86::COND_E;
28229 break;
28230 case Intrinsic::x86_avx_vtestz_ps:
28231 case Intrinsic::x86_avx_vtestz_pd:
28232 case Intrinsic::x86_avx_vtestz_ps_256:
28233 case Intrinsic::x86_avx_vtestz_pd_256:
28234 TestOpc = X86ISD::TESTP;
28235 [[fallthrough]];
28236 case Intrinsic::x86_sse41_ptestz:
28237 case Intrinsic::x86_avx_ptestz_256:
28238 // ZF = 1
28239 X86CC = X86::COND_E;
28240 break;
28241 case Intrinsic::x86_avx_vtestc_ps:
28242 case Intrinsic::x86_avx_vtestc_pd:
28243 case Intrinsic::x86_avx_vtestc_ps_256:
28244 case Intrinsic::x86_avx_vtestc_pd_256:
28245 TestOpc = X86ISD::TESTP;
28246 [[fallthrough]];
28247 case Intrinsic::x86_sse41_ptestc:
28248 case Intrinsic::x86_avx_ptestc_256:
28249 // CF = 1
28250 X86CC = X86::COND_B;
28251 break;
28252 case Intrinsic::x86_avx_vtestnzc_ps:
28253 case Intrinsic::x86_avx_vtestnzc_pd:
28254 case Intrinsic::x86_avx_vtestnzc_ps_256:
28255 case Intrinsic::x86_avx_vtestnzc_pd_256:
28256 TestOpc = X86ISD::TESTP;
28257 [[fallthrough]];
28258 case Intrinsic::x86_sse41_ptestnzc:
28259 case Intrinsic::x86_avx_ptestnzc_256:
28260 // ZF and CF = 0
28261 X86CC = X86::COND_A;
28262 break;
28263 }
28264
28265 SDValue LHS = Op.getOperand(1);
28266 SDValue RHS = Op.getOperand(2);
28267 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28268 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28269 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28270 }
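
These intrinsics return an integer, so the lowering emits the PTEST/TESTP/KTEST node and materializes the chosen flag with a SETCC plus zero-extend. A user-level sketch of the SSE4.1 ptestz flavor (the ZF = 1 case), assuming <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i v    = _mm_set_epi32(0, 0, 0, 0x10);
  __m128i mask = _mm_set_epi32(0, 0, 0, 0x01);
  // (v & mask) == 0, so testz returns 1; the lowering is PTEST followed by SETE.
  std::printf("testz = %d\n", _mm_testz_si128(v, mask));
  return 0;
}
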
28271
28272 case Intrinsic::x86_sse42_pcmpistria128:
28273 case Intrinsic::x86_sse42_pcmpestria128:
28274 case Intrinsic::x86_sse42_pcmpistric128:
28275 case Intrinsic::x86_sse42_pcmpestric128:
28276 case Intrinsic::x86_sse42_pcmpistrio128:
28277 case Intrinsic::x86_sse42_pcmpestrio128:
28278 case Intrinsic::x86_sse42_pcmpistris128:
28279 case Intrinsic::x86_sse42_pcmpestris128:
28280 case Intrinsic::x86_sse42_pcmpistriz128:
28281 case Intrinsic::x86_sse42_pcmpestriz128: {
28282 unsigned Opcode;
28283 X86::CondCode X86CC;
28284 switch (IntNo) {
28285 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28286 case Intrinsic::x86_sse42_pcmpistria128:
28287 Opcode = X86ISD::PCMPISTR;
28288 X86CC = X86::COND_A;
28289 break;
28290 case Intrinsic::x86_sse42_pcmpestria128:
28291 Opcode = X86ISD::PCMPESTR;
28292 X86CC = X86::COND_A;
28293 break;
28294 case Intrinsic::x86_sse42_pcmpistric128:
28295 Opcode = X86ISD::PCMPISTR;
28296 X86CC = X86::COND_B;
28297 break;
28298 case Intrinsic::x86_sse42_pcmpestric128:
28299 Opcode = X86ISD::PCMPESTR;
28300 X86CC = X86::COND_B;
28301 break;
28302 case Intrinsic::x86_sse42_pcmpistrio128:
28303 Opcode = X86ISD::PCMPISTR;
28304 X86CC = X86::COND_O;
28305 break;
28306 case Intrinsic::x86_sse42_pcmpestrio128:
28307 Opcode = X86ISD::PCMPESTR;
28308 X86CC = X86::COND_O;
28309 break;
28310 case Intrinsic::x86_sse42_pcmpistris128:
28311 Opcode = X86ISD::PCMPISTR;
28312 X86CC = X86::COND_S;
28313 break;
28314 case Intrinsic::x86_sse42_pcmpestris128:
28315 Opcode = X86ISD::PCMPESTR;
28316 X86CC = X86::COND_S;
28317 break;
28318 case Intrinsic::x86_sse42_pcmpistriz128:
28319 Opcode = X86ISD::PCMPISTR;
28320 X86CC = X86::COND_E;
28321 break;
28322 case Intrinsic::x86_sse42_pcmpestriz128:
28323 Opcode = X86ISD::PCMPESTR;
28324 X86CC = X86::COND_E;
28325 break;
28326 }
28327 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28328 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28329 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28330 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28331 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28332 }
28333
28334 case Intrinsic::x86_sse42_pcmpistri128:
28335 case Intrinsic::x86_sse42_pcmpestri128: {
28336 unsigned Opcode;
28337 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28338 Opcode = X86ISD::PCMPISTR;
28339 else
28340 Opcode = X86ISD::PCMPESTR;
28341
28342 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28343 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28344 return DAG.getNode(Opcode, dl, VTs, NewOps);
28345 }
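
The index-returning string compares forward their operands to the PCMPISTR/PCMPESTR node unchanged and use result value 0, the ECX index. A user-level sketch of the SSE4.2 intrinsic this serves, assuming <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  char haystack[16] = "abcdefgh";
  char needles[16]  = "fg";
  __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(needles));
  __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
  // Index (the ECX result) of the first byte of haystack matching any needle.
  int idx = _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
  std::printf("first match at index %d\n", idx); // expected: 5 ('f')
  return 0;
}
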
28346
28347 case Intrinsic::x86_sse42_pcmpistrm128:
28348 case Intrinsic::x86_sse42_pcmpestrm128: {
28349 unsigned Opcode;
28350 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28351 Opcode = X86ISD::PCMPISTR;
28352 else
28353 Opcode = X86ISD::PCMPESTR;
28354
28355 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28356 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28357 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28358 }
28359
28360 case Intrinsic::eh_sjlj_lsda: {
28361 MachineFunction &MF = DAG.getMachineFunction();
28362 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28363 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28364 auto &Context = MF.getMMI().getContext();
28365 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28366 Twine(MF.getFunctionNumber()));
28367 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28368 DAG.getMCSymbol(S, PtrVT));
28369 }
28370
28371 case Intrinsic::x86_seh_lsda: {
28372 // Compute the symbol for the LSDA. We know it'll get emitted later.
28373 MachineFunction &MF = DAG.getMachineFunction();
28374 SDValue Op1 = Op.getOperand(1);
28375 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28376 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28377 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28378
28379 // Generate a simple absolute symbol reference. This intrinsic is only
28380 // supported on 32-bit Windows, which isn't PIC.
28381 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28382 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28383 }
28384
28385 case Intrinsic::eh_recoverfp: {
28386 SDValue FnOp = Op.getOperand(1);
28387 SDValue IncomingFPOp = Op.getOperand(2);
28388 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28389 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28390 if (!Fn)
28391 report_fatal_error(
28392 "llvm.eh.recoverfp must take a function as the first argument");
28393 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28394 }
28395
28396 case Intrinsic::localaddress: {
28397 // Returns one of the stack, base, or frame pointer registers, depending on
28398 // which is used to reference local variables.
28399 MachineFunction &MF = DAG.getMachineFunction();
28400 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28401 unsigned Reg;
28402 if (RegInfo->hasBasePointer(MF))
28403 Reg = RegInfo->getBaseRegister();
28404 else { // Handles the SP or FP case.
28405 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28406 if (CantUseFP)
28407 Reg = RegInfo->getPtrSizedStackRegister(MF);
28408 else
28409 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28410 }
28411 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28412 }
28413 case Intrinsic::x86_avx512_vp2intersect_q_512:
28414 case Intrinsic::x86_avx512_vp2intersect_q_256:
28415 case Intrinsic::x86_avx512_vp2intersect_q_128:
28416 case Intrinsic::x86_avx512_vp2intersect_d_512:
28417 case Intrinsic::x86_avx512_vp2intersect_d_256:
28418 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28419 MVT MaskVT = Op.getSimpleValueType();
28420
28421 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28422 SDLoc DL(Op);
28423
28424 SDValue Operation =
28425 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28426 Op->getOperand(1), Op->getOperand(2));
28427
28428 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28429 MaskVT, Operation);
28430 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28431 MaskVT, Operation);
28432 return DAG.getMergeValues({Result0, Result1}, DL);
28433 }
28434 case Intrinsic::x86_mmx_pslli_w:
28435 case Intrinsic::x86_mmx_pslli_d:
28436 case Intrinsic::x86_mmx_pslli_q:
28437 case Intrinsic::x86_mmx_psrli_w:
28438 case Intrinsic::x86_mmx_psrli_d:
28439 case Intrinsic::x86_mmx_psrli_q:
28440 case Intrinsic::x86_mmx_psrai_w:
28441 case Intrinsic::x86_mmx_psrai_d: {
28442 SDLoc DL(Op);
28443 SDValue ShAmt = Op.getOperand(2);
28444 // If the argument is a constant, convert it to a target constant.
28445 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28446 // Clamp out-of-bounds shift amounts since they will otherwise be masked
28447 // to 8 bits, which may make them no longer out of bounds.
28448 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28449 if (ShiftAmount == 0)
28450 return Op.getOperand(1);
28451
28452 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28453 Op.getOperand(0), Op.getOperand(1),
28454 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28455 }
28456
28457 unsigned NewIntrinsic;
28458 switch (IntNo) {
28459 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28460 case Intrinsic::x86_mmx_pslli_w:
28461 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28462 break;
28463 case Intrinsic::x86_mmx_pslli_d:
28464 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28465 break;
28466 case Intrinsic::x86_mmx_pslli_q:
28467 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28468 break;
28469 case Intrinsic::x86_mmx_psrli_w:
28470 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28471 break;
28472 case Intrinsic::x86_mmx_psrli_d:
28473 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28474 break;
28475 case Intrinsic::x86_mmx_psrli_q:
28476 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28477 break;
28478 case Intrinsic::x86_mmx_psrai_w:
28479 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28480 break;
28481 case Intrinsic::x86_mmx_psrai_d:
28482 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28483 break;
28484 }
28485
28486 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
28487 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
28488 // MMX register.
28489 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28490 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28491 DAG.getTargetConstant(NewIntrinsic, DL,
28492 getPointerTy(DAG.getDataLayout())),
28493 Op.getOperand(1), ShAmt);
28494 }
28495 case Intrinsic::thread_pointer: {
28496 if (Subtarget.isTargetELF()) {
28497 SDLoc dl(Op);
28498 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28499 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28500 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28501 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28502 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28503 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28504 }
28505 report_fatal_error(
28506 "Target OS doesn't support __builtin_thread_pointer() yet.");
28507 }
28508 }
28509}
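
The Intrinsic::thread_pointer case above lowers the thread-pointer read to a load from %fs:0 (64-bit) or %gs:0 (32-bit) on ELF targets. A minimal sketch of the builtin that produces this intrinsic, assuming a recent GCC/Clang on x86 ELF; illustrative only, not part of X86ISelLowering.cpp:

#include <cstdio>

int main() {
  // Lowered through the Intrinsic::thread_pointer path on ELF; other targets
  // hit the report_fatal_error branch above.
  void *tp = __builtin_thread_pointer();
  std::printf("thread pointer: %p\n", tp);
  return 0;
}
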
28510
28511static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28512 SDValue Src, SDValue Mask, SDValue Base,
28513 SDValue Index, SDValue ScaleOp, SDValue Chain,
28514 const X86Subtarget &Subtarget) {
28515 SDLoc dl(Op);
28516 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28517 // Scale must be constant.
28518 if (!C)
28519 return SDValue();
28520 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28521 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28522 TLI.getPointerTy(DAG.getDataLayout()));
28523 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28524 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28525 // If source is undef or we know it won't be used, use a zero vector
28526 // to break register dependency.
28527 // TODO: use undef instead and let BreakFalseDeps deal with it?
28528 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28529 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28530
28531 // Cast mask to an integer type.
28532 Mask = DAG.getBitcast(MaskVT, Mask);
28533
28534 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28535
28536 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28537 SDValue Res =
28538 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28539 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28540 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28541}
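
getAVX2GatherNode replaces an undef pass-through source with zeros to break a false output dependency; at the source level, lanes whose mask bit is clear keep the src operand. A sketch assuming AVX2 and <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(32) int table[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  __m256i idx  = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i src  = _mm256_set1_epi32(-1);                        // pass-through
  __m256i mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); // low 4 lanes enabled
  __m256i g = _mm256_mask_i32gather_epi32(src, table, idx, mask, /*scale=*/4);

  alignas(32) int out[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(out), g);
  for (int i = 0; i < 8; ++i)
    std::printf("%d ", out[i]);   // 10 11 12 13 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}
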
28542
28543static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28544 SDValue Src, SDValue Mask, SDValue Base,
28545 SDValue Index, SDValue ScaleOp, SDValue Chain,
28546 const X86Subtarget &Subtarget) {
28547 MVT VT = Op.getSimpleValueType();
28548 SDLoc dl(Op);
28549 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28550 // Scale must be constant.
28551 if (!C)
28552 return SDValue();
28553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28554 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28555 TLI.getPointerTy(DAG.getDataLayout()));
28556 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28557 VT.getVectorNumElements());
28558 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28559
28560 // We support two versions of the gather intrinsics. One with scalar mask and
28561 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28562 if (Mask.getValueType() != MaskVT)
28563 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28564
28565 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28566 // If source is undef or we know it won't be used, use a zero vector
28567 // to break register dependency.
28568 // TODO: use undef instead and let BreakFalseDeps deal with it?
28569 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28570 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28571
28572 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28573
28574 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28575 SDValue Res =
28576 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28577 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28578 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28579}
28580
28581static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28582 SDValue Src, SDValue Mask, SDValue Base,
28583 SDValue Index, SDValue ScaleOp, SDValue Chain,
28584 const X86Subtarget &Subtarget) {
28585 SDLoc dl(Op);
28586 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28587 // Scale must be constant.
28588 if (!C)
28589 return SDValue();
28590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28591 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28592 TLI.getPointerTy(DAG.getDataLayout()));
28593 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28594 Src.getSimpleValueType().getVectorNumElements());
28595 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28596
28597 // We support two versions of the scatter intrinsics. One with scalar mask and
28598 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28599 if (Mask.getValueType() != MaskVT)
28600 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28601
28602 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28603
28604 SDVTList VTs = DAG.getVTList(MVT::Other);
28605 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28606 SDValue Res =
28607 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28608 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28609 return Res;
28610}
28611
28612static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28613 SDValue Mask, SDValue Base, SDValue Index,
28614 SDValue ScaleOp, SDValue Chain,
28615 const X86Subtarget &Subtarget) {
28616 SDLoc dl(Op);
28617 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28618 // Scale must be constant.
28619 if (!C)
28620 return SDValue();
28621 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28622 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28623 TLI.getPointerTy(DAG.getDataLayout()));
28624 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28625 SDValue Segment = DAG.getRegister(0, MVT::i32);
28626 MVT MaskVT =
28627 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28628 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28629 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28630 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28631 return SDValue(Res, 0);
28632}
28633
28634/// Handles the lowering of builtin intrinsics with chain that return their
28635/// value into registers EDX:EAX.
28636 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28637/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28638/// TargetOpcode.
28639/// Returns a Glue value which can be used to add extra copy-from-reg if the
28640 /// expanded intrinsics implicitly define extra registers (i.e. not just
28641/// EDX:EAX).
28642static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28643 SelectionDAG &DAG,
28644 unsigned TargetOpcode,
28645 unsigned SrcReg,
28646 const X86Subtarget &Subtarget,
28647 SmallVectorImpl<SDValue> &Results) {
28648 SDValue Chain = N->getOperand(0);
28649 SDValue Glue;
28650
28651 if (SrcReg) {
28652 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28653 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28654 Glue = Chain.getValue(1);
28655 }
28656
28657 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28658 SDValue N1Ops[] = {Chain, Glue};
28659 SDNode *N1 = DAG.getMachineNode(
28660 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28661 Chain = SDValue(N1, 0);
28662
28663 // Reads the content of XCR and returns it in registers EDX:EAX.
28664 SDValue LO, HI;
28665 if (Subtarget.is64Bit()) {
28666 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28667 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28668 LO.getValue(2));
28669 } else {
28670 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28671 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28672 LO.getValue(2));
28673 }
28674 Chain = HI.getValue(1);
28675 Glue = HI.getValue(2);
28676
28677 if (Subtarget.is64Bit()) {
28678 // Merge the two 32-bit values into a 64-bit one.
28679 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28680 DAG.getConstant(32, DL, MVT::i8));
28681 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28682 Results.push_back(Chain);
28683 return Glue;
28684 }
28685
28686 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28687 SDValue Ops[] = { LO, HI };
28688 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28689 Results.push_back(Pair);
28690 Results.push_back(Chain);
28691 return Glue;
28692}
28693
28694/// Handles the lowering of builtin intrinsics that read the time stamp counter
28695/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28696/// READCYCLECOUNTER nodes.
28697static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28698 SelectionDAG &DAG,
28699 const X86Subtarget &Subtarget,
28700 SmallVectorImpl<SDValue> &Results) {
28701 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28702 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28703 // and the EAX register is loaded with the low-order 32 bits.
28704 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28705 /* NoRegister */0, Subtarget,
28706 Results);
28707 if (Opcode != X86::RDTSCP)
28708 return;
28709
28710 SDValue Chain = Results[1];
28711 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28712 // the ECX register. Add 'ecx' explicitly to the chain.
28713 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28714 Results[1] = ecx;
28715 Results.push_back(ecx.getValue(1));
28716}
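
At the source level the two flavors differ only in the IA32_TSC_AUX value that RDTSCP additionally returns, which is why the helper above appends the extra ECX copy for X86::RDTSCP. A sketch assuming <x86intrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <x86intrin.h>
#include <cstdio>

int main() {
  unsigned long long t0 = __rdtsc();      // EDX:EAX merged into one 64-bit value
  unsigned aux = 0;
  unsigned long long t1 = __rdtscp(&aux); // also reads IA32_TSC_AUX into aux
  std::printf("delta=%llu tsc_aux=%u\n", t1 - t0, aux);
  return 0;
}
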
28717
28718static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28719 SelectionDAG &DAG) {
28720 SmallVector<SDValue, 3> Results;
28721 SDLoc DL(Op);
28722 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28723 Results);
28724 return DAG.getMergeValues(Results, DL);
28725}
28726
28727static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28728 MachineFunction &MF = DAG.getMachineFunction();
28729 SDValue Chain = Op.getOperand(0);
28730 SDValue RegNode = Op.getOperand(2);
28731 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28732 if (!EHInfo)
28733 report_fatal_error("EH registrations only live in functions using WinEH");
28734
28735 // Cast the operand to an alloca, and remember the frame index.
28736 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28737 if (!FINode)
28738 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28739 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28740
28741 // Return the chain operand without making any DAG nodes.
28742 return Chain;
28743}
28744
28745static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28746 MachineFunction &MF = DAG.getMachineFunction();
28747 SDValue Chain = Op.getOperand(0);
28748 SDValue EHGuard = Op.getOperand(2);
28749 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28750 if (!EHInfo)
28751 report_fatal_error("EHGuard only live in functions using WinEH");
28752
28753 // Cast the operand to an alloca, and remember the frame index.
28754 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28755 if (!FINode)
28756 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28757 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28758
28759 // Return the chain operand without making any DAG nodes.
28760 return Chain;
28761}
28762
28763/// Emit Truncating Store with signed or unsigned saturation.
28764static SDValue
28765EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28766 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28767 SelectionDAG &DAG) {
28768 SDVTList VTs = DAG.getVTList(MVT::Other);
28769 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28770 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28771 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28772 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28773}
28774
28775/// Emit Masked Truncating Store with signed or unsigned saturation.
28776static SDValue
28777EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28778 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28779 MachineMemOperand *MMO, SelectionDAG &DAG) {
28780 SDVTList VTs = DAG.getVTList(MVT::Other);
28781 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28782 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28783 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28784}
28785
28786static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28787 SelectionDAG &DAG) {
28788 unsigned IntNo = Op.getConstantOperandVal(1);
28789 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28790 if (!IntrData) {
28791 switch (IntNo) {
28792
28793 case Intrinsic::swift_async_context_addr: {
28794 SDLoc dl(Op);
28795 auto &MF = DAG.getMachineFunction();
28796 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28797 if (Subtarget.is64Bit()) {
28798 MF.getFrameInfo().setFrameAddressIsTaken(true);
28799 X86FI->setHasSwiftAsyncContext(true);
28800 SDValue Chain = Op->getOperand(0);
28801 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28802 SDValue Result =
28803 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28804 DAG.getTargetConstant(8, dl, MVT::i32)),
28805 0);
28806 // Return { result, chain }.
28807 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28808 CopyRBP.getValue(1));
28809 } else {
28810 // 32-bit so no special extended frame, create or reuse an existing
28811 // stack slot.
28812 if (!X86FI->getSwiftAsyncContextFrameIdx())
28813 X86FI->setSwiftAsyncContextFrameIdx(
28814 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28815 SDValue Result =
28816 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28817 // Return { result, chain }.
28818 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28819 Op->getOperand(0));
28820 }
28821 }
28822
28823 case llvm::Intrinsic::x86_seh_ehregnode:
28824 return MarkEHRegistrationNode(Op, DAG);
28825 case llvm::Intrinsic::x86_seh_ehguard:
28826 return MarkEHGuard(Op, DAG);
28827 case llvm::Intrinsic::x86_rdpkru: {
28828 SDLoc dl(Op);
28829 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28830 // Create a RDPKRU node and pass 0 to the ECX parameter.
28831 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28832 DAG.getConstant(0, dl, MVT::i32));
28833 }
28834 case llvm::Intrinsic::x86_wrpkru: {
28835 SDLoc dl(Op);
28836 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28837 // to the EDX and ECX parameters.
28838 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28839 Op.getOperand(0), Op.getOperand(2),
28840 DAG.getConstant(0, dl, MVT::i32),
28841 DAG.getConstant(0, dl, MVT::i32));
28842 }
28843 case llvm::Intrinsic::asan_check_memaccess: {
28844 // Mark this as adjustsStack because it will be lowered to a call.
28845 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28846 // Don't do anything here, we will expand these intrinsics out later.
28847 return Op;
28848 }
28849 case llvm::Intrinsic::x86_flags_read_u32:
28850 case llvm::Intrinsic::x86_flags_read_u64:
28851 case llvm::Intrinsic::x86_flags_write_u32:
28852 case llvm::Intrinsic::x86_flags_write_u64: {
28853 // We need a frame pointer because this will get lowered to a PUSH/POP
28854 // sequence.
28855 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28856 MFI.setHasCopyImplyingStackAdjustment(true);
28857 // Don't do anything here, we will expand these intrinsics out later
28858 // during FinalizeISel in EmitInstrWithCustomInserter.
28859 return Op;
28860 }
28861 case Intrinsic::x86_lwpins32:
28862 case Intrinsic::x86_lwpins64:
28863 case Intrinsic::x86_umwait:
28864 case Intrinsic::x86_tpause: {
28865 SDLoc dl(Op);
28866 SDValue Chain = Op->getOperand(0);
28867 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28868 unsigned Opcode;
28869
28870 switch (IntNo) {
28871 default: llvm_unreachable("Impossible intrinsic");
28872 case Intrinsic::x86_umwait:
28873 Opcode = X86ISD::UMWAIT;
28874 break;
28875 case Intrinsic::x86_tpause:
28876 Opcode = X86ISD::TPAUSE;
28877 break;
28878 case Intrinsic::x86_lwpins32:
28879 case Intrinsic::x86_lwpins64:
28880 Opcode = X86ISD::LWPINS;
28881 break;
28882 }
28883
28884 SDValue Operation =
28885 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28886 Op->getOperand(3), Op->getOperand(4));
28887 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28888 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28889 Operation.getValue(1));
28890 }
28891 case Intrinsic::x86_enqcmd:
28892 case Intrinsic::x86_enqcmds: {
28893 SDLoc dl(Op);
28894 SDValue Chain = Op.getOperand(0);
28895 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28896 unsigned Opcode;
28897 switch (IntNo) {
28898 default: llvm_unreachable("Impossible intrinsic!");
28899 case Intrinsic::x86_enqcmd:
28900 Opcode = X86ISD::ENQCMD;
28901 break;
28902 case Intrinsic::x86_enqcmds:
28903 Opcode = X86ISD::ENQCMDS;
28904 break;
28905 }
28906 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28907 Op.getOperand(3));
28908 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28909 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28910 Operation.getValue(1));
28911 }
28912 case Intrinsic::x86_aesenc128kl:
28913 case Intrinsic::x86_aesdec128kl:
28914 case Intrinsic::x86_aesenc256kl:
28915 case Intrinsic::x86_aesdec256kl: {
28916 SDLoc DL(Op);
28917 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28918 SDValue Chain = Op.getOperand(0);
28919 unsigned Opcode;
28920
28921 switch (IntNo) {
28922 default: llvm_unreachable("Impossible intrinsic");
28923 case Intrinsic::x86_aesenc128kl:
28924 Opcode = X86ISD::AESENC128KL;
28925 break;
28926 case Intrinsic::x86_aesdec128kl:
28927 Opcode = X86ISD::AESDEC128KL;
28928 break;
28929 case Intrinsic::x86_aesenc256kl:
28930 Opcode = X86ISD::AESENC256KL;
28931 break;
28932 case Intrinsic::x86_aesdec256kl:
28933 Opcode = X86ISD::AESDEC256KL;
28934 break;
28935 }
28936
28937 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28938 MachineMemOperand *MMO = MemIntr->getMemOperand();
28939 EVT MemVT = MemIntr->getMemoryVT();
28940 SDValue Operation = DAG.getMemIntrinsicNode(
28941 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28942 MMO);
28943 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28944
28945 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28946 {ZF, Operation.getValue(0), Operation.getValue(2)});
28947 }
28948 case Intrinsic::x86_aesencwide128kl:
28949 case Intrinsic::x86_aesdecwide128kl:
28950 case Intrinsic::x86_aesencwide256kl:
28951 case Intrinsic::x86_aesdecwide256kl: {
28952 SDLoc DL(Op);
28953 SDVTList VTs = DAG.getVTList(
28954 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28955 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28956 SDValue Chain = Op.getOperand(0);
28957 unsigned Opcode;
28958
28959 switch (IntNo) {
28960 default: llvm_unreachable("Impossible intrinsic");
28961 case Intrinsic::x86_aesencwide128kl:
28962 Opcode = X86ISD::AESENCWIDE128KL;
28963 break;
28964 case Intrinsic::x86_aesdecwide128kl:
28965 Opcode = X86ISD::AESDECWIDE128KL;
28966 break;
28967 case Intrinsic::x86_aesencwide256kl:
28968 Opcode = X86ISD::AESENCWIDE256KL;
28969 break;
28970 case Intrinsic::x86_aesdecwide256kl:
28971 Opcode = X86ISD::AESDECWIDE256KL;
28972 break;
28973 }
28974
28975 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28976 MachineMemOperand *MMO = MemIntr->getMemOperand();
28977 EVT MemVT = MemIntr->getMemoryVT();
28978 SDValue Operation = DAG.getMemIntrinsicNode(
28979 Opcode, DL, VTs,
28980 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
28981 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
28982 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
28983 MemVT, MMO);
28984 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
28985
28986 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28987 {ZF, Operation.getValue(1), Operation.getValue(2),
28988 Operation.getValue(3), Operation.getValue(4),
28989 Operation.getValue(5), Operation.getValue(6),
28990 Operation.getValue(7), Operation.getValue(8),
28991 Operation.getValue(9)});
28992 }
28993 case Intrinsic::x86_testui: {
28994 SDLoc dl(Op);
28995 SDValue Chain = Op.getOperand(0);
28996 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28997 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
28998 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28999 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29000 Operation.getValue(1));
29001 }
29002 case Intrinsic::x86_atomic_bts_rm:
29003 case Intrinsic::x86_atomic_btc_rm:
29004 case Intrinsic::x86_atomic_btr_rm: {
29005 SDLoc DL(Op);
29006 MVT VT = Op.getSimpleValueType();
29007 SDValue Chain = Op.getOperand(0);
29008 SDValue Op1 = Op.getOperand(2);
29009 SDValue Op2 = Op.getOperand(3);
29010 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29011 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29012 : X86ISD::LBTR_RM;
29013 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29014 SDValue Res =
29015 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29016 {Chain, Op1, Op2}, VT, MMO);
29017 Chain = Res.getValue(1);
29018 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29019 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29020 }
29021 case Intrinsic::x86_atomic_bts:
29022 case Intrinsic::x86_atomic_btc:
29023 case Intrinsic::x86_atomic_btr: {
29024 SDLoc DL(Op);
29025 MVT VT = Op.getSimpleValueType();
29026 SDValue Chain = Op.getOperand(0);
29027 SDValue Op1 = Op.getOperand(2);
29028 SDValue Op2 = Op.getOperand(3);
29029 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29030 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29031 : X86ISD::LBTR;
29032 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29033 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29034 SDValue Res =
29035 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29036 {Chain, Op1, Op2, Size}, VT, MMO);
29037 Chain = Res.getValue(1);
29038 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29039 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29040 if (Imm)
29041 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29042 DAG.getShiftAmountConstant(Imm, VT, DL));
29043 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29044 }
29045 case Intrinsic::x86_cmpccxadd32:
29046 case Intrinsic::x86_cmpccxadd64: {
29047 SDLoc DL(Op);
29048 SDValue Chain = Op.getOperand(0);
29049 SDValue Addr = Op.getOperand(2);
29050 SDValue Src1 = Op.getOperand(3);
29051 SDValue Src2 = Op.getOperand(4);
29052 SDValue CC = Op.getOperand(5);
29053 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29054 SDValue Operation = DAG.getMemIntrinsicNode(
29055 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29056 MVT::i32, MMO);
29057 return Operation;
29058 }
29059 case Intrinsic::x86_aadd32:
29060 case Intrinsic::x86_aadd64:
29061 case Intrinsic::x86_aand32:
29062 case Intrinsic::x86_aand64:
29063 case Intrinsic::x86_aor32:
29064 case Intrinsic::x86_aor64:
29065 case Intrinsic::x86_axor32:
29066 case Intrinsic::x86_axor64: {
29067 SDLoc DL(Op);
29068 SDValue Chain = Op.getOperand(0);
29069 SDValue Op1 = Op.getOperand(2);
29070 SDValue Op2 = Op.getOperand(3);
29071 MVT VT = Op2.getSimpleValueType();
29072 unsigned Opc = 0;
29073 switch (IntNo) {
29074 default:
29075 llvm_unreachable("Unknown Intrinsic");
29076 case Intrinsic::x86_aadd32:
29077 case Intrinsic::x86_aadd64:
29078 Opc = X86ISD::AADD;
29079 break;
29080 case Intrinsic::x86_aand32:
29081 case Intrinsic::x86_aand64:
29082 Opc = X86ISD::AAND;
29083 break;
29084 case Intrinsic::x86_aor32:
29085 case Intrinsic::x86_aor64:
29086 Opc = X86ISD::AOR;
29087 break;
29088 case Intrinsic::x86_axor32:
29089 case Intrinsic::x86_axor64:
29090 Opc = X86ISD::AXOR;
29091 break;
29092 }
29093 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29094 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29095 {Chain, Op1, Op2}, VT, MMO);
29096 }
29097 case Intrinsic::x86_atomic_add_cc:
29098 case Intrinsic::x86_atomic_sub_cc:
29099 case Intrinsic::x86_atomic_or_cc:
29100 case Intrinsic::x86_atomic_and_cc:
29101 case Intrinsic::x86_atomic_xor_cc: {
29102 SDLoc DL(Op);
29103 SDValue Chain = Op.getOperand(0);
29104 SDValue Op1 = Op.getOperand(2);
29105 SDValue Op2 = Op.getOperand(3);
29106 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29107 MVT VT = Op2.getSimpleValueType();
29108 unsigned Opc = 0;
29109 switch (IntNo) {
29110 default:
29111 llvm_unreachable("Unknown Intrinsic");
29112 case Intrinsic::x86_atomic_add_cc:
29113 Opc = X86ISD::LADD;
29114 break;
29115 case Intrinsic::x86_atomic_sub_cc:
29116 Opc = X86ISD::LSUB;
29117 break;
29118 case Intrinsic::x86_atomic_or_cc:
29119 Opc = X86ISD::LOR;
29120 break;
29121 case Intrinsic::x86_atomic_and_cc:
29122 Opc = X86ISD::LAND;
29123 break;
29124 case Intrinsic::x86_atomic_xor_cc:
29125 Opc = X86ISD::LXOR;
29126 break;
29127 }
29128 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29129 SDValue LockArith =
29130 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29131 {Chain, Op1, Op2}, VT, MMO);
29132 Chain = LockArith.getValue(1);
29133 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29134 }
29135 }
29136 return SDValue();
29137 }
29138
29139 SDLoc dl(Op);
29140 switch(IntrData->Type) {
29141 default: llvm_unreachable("Unknown Intrinsic Type");
29142 case RDSEED:
29143 case RDRAND: {
29144 // Emit the node with the right value type.
29145 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29146 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29147
29148 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29150 // Otherwise return the value from Rand, which is always 0, cast to i32.
29150 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29151 DAG.getConstant(1, dl, Op->getValueType(1)),
29152 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29153 SDValue(Result.getNode(), 1)};
29154 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29155
29156 // Return { result, isValid, chain }.
29157 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29158 SDValue(Result.getNode(), 2));
29159 }
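
The CMOV sequence above is what makes the *_step intrinsics return 1 only when CF reported a valid random value. A usage sketch assuming RDRAND hardware support and <immintrin.h> (built with -mrdrnd); illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned value = 0;
  // _rdrand32_step returns 1 only when the hardware produced a valid value
  // (CF = 1); retry a few times on transient failure.
  for (int attempt = 0; attempt < 10; ++attempt) {
    if (_rdrand32_step(&value)) {
      std::printf("random value: %u\n", value);
      return 0;
    }
  }
  std::printf("RDRAND did not return a valid value\n");
  return 1;
}
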
29160 case GATHER_AVX2: {
29161 SDValue Chain = Op.getOperand(0);
29162 SDValue Src = Op.getOperand(2);
29163 SDValue Base = Op.getOperand(3);
29164 SDValue Index = Op.getOperand(4);
29165 SDValue Mask = Op.getOperand(5);
29166 SDValue Scale = Op.getOperand(6);
29167 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29168 Scale, Chain, Subtarget);
29169 }
29170 case GATHER: {
29171 //gather(v1, mask, index, base, scale);
29172 SDValue Chain = Op.getOperand(0);
29173 SDValue Src = Op.getOperand(2);
29174 SDValue Base = Op.getOperand(3);
29175 SDValue Index = Op.getOperand(4);
29176 SDValue Mask = Op.getOperand(5);
29177 SDValue Scale = Op.getOperand(6);
29178 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29179 Chain, Subtarget);
29180 }
29181 case SCATTER: {
29182 //scatter(base, mask, index, v1, scale);
29183 SDValue Chain = Op.getOperand(0);
29184 SDValue Base = Op.getOperand(2);
29185 SDValue Mask = Op.getOperand(3);
29186 SDValue Index = Op.getOperand(4);
29187 SDValue Src = Op.getOperand(5);
29188 SDValue Scale = Op.getOperand(6);
29189 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29190 Scale, Chain, Subtarget);
29191 }
29192 case PREFETCH: {
29193 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29194 assert((HintVal == 2 || HintVal == 3) &&
29195 "Wrong prefetch hint in intrinsic: should be 2 or 3");
29196 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29197 SDValue Chain = Op.getOperand(0);
29198 SDValue Mask = Op.getOperand(2);
29199 SDValue Index = Op.getOperand(3);
29200 SDValue Base = Op.getOperand(4);
29201 SDValue Scale = Op.getOperand(5);
29202 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29203 Subtarget);
29204 }
29205 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29206 case RDTSC: {
29207 SmallVector<SDValue, 2> Results;
29208 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29209 Results);
29210 return DAG.getMergeValues(Results, dl);
29211 }
29212 // Read Performance Monitoring Counters.
29213 case RDPMC:
29214 // Read Processor Register.
29215 case RDPRU:
29216 // GetExtended Control Register.
29217 case XGETBV: {
29218 SmallVector<SDValue, 2> Results;
29219
29220 // RDPMC uses ECX to select the index of the performance counter to read.
29221 // RDPRU uses ECX to select the processor register to read.
29222 // XGETBV uses ECX to select the index of the XCR register to return.
29223 // The result is stored into registers EDX:EAX.
29224 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29225 Subtarget, Results);
29226 return DAG.getMergeValues(Results, dl);
29227 }
29228 // XTEST intrinsics.
29229 case XTEST: {
29230 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29231 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29232
29233 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29234 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29235 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29236 Ret, SDValue(InTrans.getNode(), 1));
29237 }
29238 case TRUNCATE_TO_MEM_VI8:
29239 case TRUNCATE_TO_MEM_VI16:
29240 case TRUNCATE_TO_MEM_VI32: {
29241 SDValue Mask = Op.getOperand(4);
29242 SDValue DataToTruncate = Op.getOperand(3);
29243 SDValue Addr = Op.getOperand(2);
29244 SDValue Chain = Op.getOperand(0);
29245
29246 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29247 assert(MemIntr && "Expected MemIntrinsicSDNode!");
29248
29249 EVT MemVT = MemIntr->getMemoryVT();
29250
29251 uint16_t TruncationOp = IntrData->Opc0;
29252 switch (TruncationOp) {
29253 case X86ISD::VTRUNC: {
29254 if (isAllOnesConstant(Mask)) // return just a truncate store
29255 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29256 MemIntr->getMemOperand());
29257
29258 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29259 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29260 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29261
29262 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29263 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29264 true /* truncating */);
29265 }
29266 case X86ISD::VTRUNCUS:
29267 case X86ISD::VTRUNCS: {
29268 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29269 if (isAllOnesConstant(Mask))
29270 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29271 MemIntr->getMemOperand(), DAG);
29272
29273 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29274 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29275
29276 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29277 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29278 }
29279 default:
29280 llvm_unreachable("Unsupported truncstore intrinsic");
29281 }
29282 }
29283 }
29284}
29285
29286SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29287 SelectionDAG &DAG) const {
29288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29289 MFI.setReturnAddressIsTaken(true);
29290
29291 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29292 return SDValue();
29293
29294 unsigned Depth = Op.getConstantOperandVal(0);
29295 SDLoc dl(Op);
29296 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29297
29298 if (Depth > 0) {
29299 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29300 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29301 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29302 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29303 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29304 MachinePointerInfo());
29305 }
29306
29307 // Just load the return address.
29308 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29309 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29310 MachinePointerInfo());
29311}
29312
29313SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29314 SelectionDAG &DAG) const {
29315 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29316 return getReturnAddressFrameIndex(DAG);
29317}
29318
29319SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29320 MachineFunction &MF = DAG.getMachineFunction();
29321 MachineFrameInfo &MFI = MF.getFrameInfo();
29322 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29323 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29324 EVT VT = Op.getValueType();
29325
29326 MFI.setFrameAddressIsTaken(true);
29327
29328 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29329 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29330 // is not possible to crawl up the stack without looking at the unwind codes
29331 // simultaneously.
29332 int FrameAddrIndex = FuncInfo->getFAIndex();
29333 if (!FrameAddrIndex) {
29334 // Set up a frame object for the return address.
29335 unsigned SlotSize = RegInfo->getSlotSize();
29336 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29337 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29338 FuncInfo->setFAIndex(FrameAddrIndex);
29339 }
29340 return DAG.getFrameIndex(FrameAddrIndex, VT);
29341 }
29342
29343 unsigned FrameReg =
29344 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29345 SDLoc dl(Op); // FIXME probably not meaningful
29346 unsigned Depth = Op.getConstantOperandVal(0);
29347 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29348 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29349 "Invalid Frame Register!");
29350 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29351 while (Depth--)
29352 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29353 MachinePointerInfo());
29354 return FrameAddr;
29355}
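
LowerRETURNADDR and LowerFRAMEADDR back the GCC-style builtins shown below; depths greater than zero walk the saved-frame-pointer chain and are only reliable when frame pointers are preserved. A minimal sketch; illustrative only, not part of X86ISelLowering.cpp:

#include <cstdio>

__attribute__((noinline)) static void show() {
  void *ret   = __builtin_return_address(0); // lowered via LowerRETURNADDR
  void *frame = __builtin_frame_address(0);  // lowered via LowerFRAMEADDR
  std::printf("return address %p, frame address %p\n", ret, frame);
}

int main() {
  show();
  return 0;
}
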
29356
29357// FIXME? Maybe this could be a TableGen attribute on some registers and
29358// this table could be generated automatically from RegInfo.
29359Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29360 const MachineFunction &MF) const {
29361 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29362
29363 Register Reg = StringSwitch<unsigned>(RegName)
29364 .Case("esp", X86::ESP)
29365 .Case("rsp", X86::RSP)
29366 .Case("ebp", X86::EBP)
29367 .Case("rbp", X86::RBP)
29368 .Default(0);
29369
29370 if (Reg == X86::EBP || Reg == X86::RBP) {
29371 if (!TFI.hasFP(MF))
29372 report_fatal_error("register " + StringRef(RegName) +
29373 " is allocatable: function has no frame pointer");
29374#ifndef NDEBUG
29375 else {
29376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29379        "Invalid Frame Register!");
29380 }
29381#endif
29382 }
29383
29384 if (Reg)
29385 return Reg;
29386
29387 report_fatal_error("Invalid register name global variable");
29388}
29389
29390SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29391 SelectionDAG &DAG) const {
29392 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29393 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29394}
29395
29396Register X86TargetLowering::getExceptionPointerRegister(
29397 const Constant *PersonalityFn) const {
29398 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29399 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29400
29401 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29402}
29403
29404Register X86TargetLowering::getExceptionSelectorRegister(
29405 const Constant *PersonalityFn) const {
29406 // Funclet personalities don't use selectors (the runtime does the selection).
29407 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29408 return X86::NoRegister;
29409 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29410}
29411
29412bool X86TargetLowering::needsFixedCatchObjects() const {
29413 return Subtarget.isTargetWin64();
29414}
29415
29416SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29417 SDValue Chain = Op.getOperand(0);
29418 SDValue Offset = Op.getOperand(1);
29419 SDValue Handler = Op.getOperand(2);
29420 SDLoc dl (Op);
29421
29422 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29423 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29424 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29425 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29426         (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29427        "Invalid Frame Register!");
29428 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29429 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29430
29431 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29432 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29433 dl));
29434 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29435 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29436 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29437
29438 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29439 DAG.getRegister(StoreAddrReg, PtrVT));
29440}
29441
29442SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29443 SelectionDAG &DAG) const {
29444 SDLoc DL(Op);
29445 // If the subtarget is not 64bit, we may need the global base reg
29446 // after the isel expand-pseudo step, i.e., after the CGBR pass has run.
29447 // Therefore, ask for the GlobalBaseReg now, so that the pass
29448 // inserts the code for us in case we need it.
29449 // Otherwise, we will end up in a situation where we will
29450 // reference a virtual register that is not defined!
29451 if (!Subtarget.is64Bit()) {
29452 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29453 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29454 }
29455 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29456 DAG.getVTList(MVT::i32, MVT::Other),
29457 Op.getOperand(0), Op.getOperand(1));
29458}
29459
29460SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29461 SelectionDAG &DAG) const {
29462 SDLoc DL(Op);
29463 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29464 Op.getOperand(0), Op.getOperand(1));
29465}
29466
29467SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29468 SelectionDAG &DAG) const {
29469 SDLoc DL(Op);
29470 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29471 Op.getOperand(0));
29472}
29473
29474static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29475 return Op.getOperand(0);
29476}
29477
29478SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29479 SelectionDAG &DAG) const {
29480 SDValue Root = Op.getOperand(0);
29481 SDValue Trmp = Op.getOperand(1); // trampoline
29482 SDValue FPtr = Op.getOperand(2); // nested function
29483 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29484 SDLoc dl (Op);
29485
29486 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29487 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29488
29489 if (Subtarget.is64Bit()) {
29490 SDValue OutChains[6];
29491
29492 // Large code-model.
29493 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29494 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29495
29496 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29497 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29498
29499 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29500
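// Editor's note (illustrative summary derived from the opcode constants
// above, not part of the original source): the stores below assemble a
// 23-byte trampoline of the form
//   0:  49 BB <FPtr:8>   movabsq $FPtr, %r11
//   10: 49 BA <Nest:8>   movabsq $Nest, %r10
//   20: 49 FF E3         jmpq *%r11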
29501 // Load the pointer to the nested function into R11.
29502 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29503 SDValue Addr = Trmp;
29504 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29505 Addr, MachinePointerInfo(TrmpAddr));
29506
29507 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29508 DAG.getConstant(2, dl, MVT::i64));
29509 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29510 MachinePointerInfo(TrmpAddr, 2), Align(2));
29511
29512 // Load the 'nest' parameter value into R10.
29513 // R10 is specified in X86CallingConv.td
29514 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29515 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29516 DAG.getConstant(10, dl, MVT::i64));
29517 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29518 Addr, MachinePointerInfo(TrmpAddr, 10));
29519
29520 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29521 DAG.getConstant(12, dl, MVT::i64));
29522 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29523 MachinePointerInfo(TrmpAddr, 12), Align(2));
29524
29525 // Jump to the nested function.
29526 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29527 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29528 DAG.getConstant(20, dl, MVT::i64));
29529 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29530 Addr, MachinePointerInfo(TrmpAddr, 20));
29531
29532 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29533 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29534 DAG.getConstant(22, dl, MVT::i64));
29535 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29536 Addr, MachinePointerInfo(TrmpAddr, 22));
29537
29538 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29539 } else {
29540 const Function *Func =
29541 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29542 CallingConv::ID CC = Func->getCallingConv();
29543 unsigned NestReg;
29544
29545 switch (CC) {
29546 default:
29547 llvm_unreachable("Unsupported calling convention");
29548 case CallingConv::C:
29549 case CallingConv::X86_StdCall: {
29550 // Pass 'nest' parameter in ECX.
29551 // Must be kept in sync with X86CallingConv.td
29552 NestReg = X86::ECX;
29553
29554 // Check that ECX wasn't needed by an 'inreg' parameter.
29555 FunctionType *FTy = Func->getFunctionType();
29556 const AttributeList &Attrs = Func->getAttributes();
29557
29558 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29559 unsigned InRegCount = 0;
29560 unsigned Idx = 0;
29561
29562 for (FunctionType::param_iterator I = FTy->param_begin(),
29563 E = FTy->param_end(); I != E; ++I, ++Idx)
29564 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29565 const DataLayout &DL = DAG.getDataLayout();
29566 // FIXME: should only count parameters that are lowered to integers.
29567 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29568 }
29569
29570 if (InRegCount > 2) {
29571 report_fatal_error("Nest register in use - reduce number of inreg"
29572 " parameters!");
29573 }
29574 }
29575 break;
29576 }
29577 case CallingConv::X86_FastCall:
29578 case CallingConv::X86_ThisCall:
29579 case CallingConv::Fast:
29580 case CallingConv::Tail:
29581 case CallingConv::SwiftTail:
29582 // Pass 'nest' parameter in EAX.
29583 // Must be kept in sync with X86CallingConv.td
29584 NestReg = X86::EAX;
29585 break;
29586 }
29587
29588 SDValue OutChains[4];
29589 SDValue Addr, Disp;
29590
29591 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29592 DAG.getConstant(10, dl, MVT::i32));
29593 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29594
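// Editor's note (illustrative summary derived from the constants below,
// not part of the original source): the 32-bit trampoline assembled here
// is 10 bytes:
//   0: B8+r <Nest:4>   movl $Nest, %NestReg
//   5: E9 <Disp:4>     jmp FPtr (rel32, relative to Trmp+10)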
29595 // This is storing the opcode for MOV32ri.
29596 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29597 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29598 OutChains[0] =
29599 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29600 Trmp, MachinePointerInfo(TrmpAddr));
29601
29602 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29603 DAG.getConstant(1, dl, MVT::i32));
29604 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29605 MachinePointerInfo(TrmpAddr, 1), Align(1));
29606
29607 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29608 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29609 DAG.getConstant(5, dl, MVT::i32));
29610 OutChains[2] =
29611 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29612 MachinePointerInfo(TrmpAddr, 5), Align(1));
29613
29614 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29615 DAG.getConstant(6, dl, MVT::i32));
29616 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29617 MachinePointerInfo(TrmpAddr, 6), Align(1));
29618
29619 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29620 }
29621}
29622
29623SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29624 SelectionDAG &DAG) const {
29625 /*
29626 The rounding mode is in bits 11:10 of FPSR, and has the following
29627 settings:
29628 00 Round to nearest
29629 01 Round to -inf
29630 10 Round to +inf
29631 11 Round to 0
29632
29633 GET_ROUNDING, on the other hand, expects the following:
29634 -1 Undefined
29635 0 Round to 0
29636 1 Round to nearest
29637 2 Round to +inf
29638 3 Round to -inf
29639
29640 To perform the conversion, we use a packed lookup table of the four 2-bit
29641 values that we can index by FPSR[11:10]
29642 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29643
29644 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29645 */
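// Editor's note (worked example, not in the original source): with
// FPSR[11:10] = 10 (round to +inf), (FPSR & 0xc00) = 0x800, shifting right
// by 9 gives 4, and (0x2d >> 4) & 3 = 2, which is GET_ROUNDING's encoding
// for "Round to +inf".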
29646
29647 MachineFunction &MF = DAG.getMachineFunction();
29648 MVT VT = Op.getSimpleValueType();
29649 SDLoc DL(Op);
29650
29651 // Save FP Control Word to stack slot
29652 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29653 SDValue StackSlot =
29654 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29655
29656 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29657
29658 SDValue Chain = Op.getOperand(0);
29659 SDValue Ops[] = {Chain, StackSlot};
29660 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29661 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29662 Align(2), MachineMemOperand::MOStore);
29663
29664 // Load FP Control Word from stack slot
29665 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29666 Chain = CWD.getValue(1);
29667
29668 // Mask and turn the control bits into a shift for the lookup table.
29669 SDValue Shift =
29670 DAG.getNode(ISD::SRL, DL, MVT::i16,
29671 DAG.getNode(ISD::AND, DL, MVT::i16,
29672 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29673 DAG.getConstant(9, DL, MVT::i8));
29674 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29675
29676 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29677 SDValue RetVal =
29678 DAG.getNode(ISD::AND, DL, MVT::i32,
29679 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29680 DAG.getConstant(3, DL, MVT::i32));
29681
29682 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29683
29684 return DAG.getMergeValues({RetVal, Chain}, DL);
29685}
29686
29687SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29688 SelectionDAG &DAG) const {
29689 MachineFunction &MF = DAG.getMachineFunction();
29690 SDLoc DL(Op);
29691 SDValue Chain = Op.getNode()->getOperand(0);
29692
29693 // FP control word may be set only from data in memory. So we need to allocate
29694 // stack space to save/load FP control word.
29695 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29696 SDValue StackSlot =
29697 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29698 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29699 MachineMemOperand *MMO =
29700 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29701
29702 // Store FP control word into memory.
29703 SDValue Ops[] = {Chain, StackSlot};
29704 Chain = DAG.getMemIntrinsicNode(
29705 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29706
29707 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29708 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29709 Chain = CWD.getValue(1);
29710 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29711 DAG.getConstant(0xf3ff, DL, MVT::i16));
29712
29713 // Calculate new rounding mode.
29714 SDValue NewRM = Op.getNode()->getOperand(1);
29715 SDValue RMBits;
29716 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29717 uint64_t RM = CVal->getZExtValue();
29718 int FieldVal;
29719 switch (static_cast<RoundingMode>(RM)) {
29720 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29721 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29722 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29723 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29724 default:
29725 llvm_unreachable("rounding mode is not supported by X86 hardware");
29726 }
29727 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29728 } else {
29729 // Need to convert argument into bits of control word:
29730 // 0 Round to 0 -> 11
29731 // 1 Round to nearest -> 00
29732 // 2 Round to +inf -> 10
29733 // 3 Round to -inf -> 01
29734 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
29735 // To make the conversion, put all these values into a value 0xc9 and shift
29736 // it left depending on the rounding mode:
29737 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29738 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29739 // ...
29740 // (0xc9 << (2 * NewRM + 4)) & 0xc00
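// Editor's note (worked example, illustrative only): for NewRM = 3
// (round to -inf) the shift amount is 2*3+4 = 10, so
// (0xc9 << 10) & 0xc00 = 0x400, i.e. RM field 01, as required.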
29741 SDValue ShiftValue =
29742 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29743 DAG.getNode(ISD::ADD, DL, MVT::i32,
29744 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29745 DAG.getConstant(1, DL, MVT::i8)),
29746 DAG.getConstant(4, DL, MVT::i32)));
29747 SDValue Shifted =
29748 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29749 ShiftValue);
29750 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29751 DAG.getConstant(0xc00, DL, MVT::i16));
29752 }
29753
29754 // Update rounding mode bits and store the new FP Control Word into stack.
29755 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29756 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29757
29758 // Load FP control word from the slot.
29759 SDValue OpsLD[] = {Chain, StackSlot};
29760 MachineMemOperand *MMOL =
29761 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29762 Chain = DAG.getMemIntrinsicNode(
29763 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29764
29765 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29766 // same way but in bits 14:13.
29767 if (Subtarget.hasSSE1()) {
29768 // Store MXCSR into memory.
29769 Chain = DAG.getNode(
29770 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29771 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29772 StackSlot);
29773
29774 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29775 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29776 Chain = CWD.getValue(1);
29777 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29778 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29779
29780 // Shift X87 RM bits from 11:10 to 14:13.
29781 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29782 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29783 DAG.getConstant(3, DL, MVT::i8));
29784
29785 // Update rounding mode bits and store the new FP Control Word into stack.
29786 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29787 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29788
29789 // Load MXCSR from the slot.
29790 Chain = DAG.getNode(
29791 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29792 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29793 StackSlot);
29794 }
29795
29796 return Chain;
29797}
29798
29799 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
29800//
29801 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
29802 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29803 // split the vector, perform the operation on its Lo and Hi parts, and
29804 // concatenate the results.
29805static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29806 const X86Subtarget &Subtarget) {
29807 assert(Op.getOpcode() == ISD::CTLZ);
29808 SDLoc dl(Op);
29809 MVT VT = Op.getSimpleValueType();
29810 MVT EltVT = VT.getVectorElementType();
29811 unsigned NumElems = VT.getVectorNumElements();
29812
29813 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29814        "Unsupported element type");
29815
29816 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29817 if (NumElems > 16 ||
29818 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29819 return splitVectorIntUnary(Op, DAG);
29820
29821 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29822 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29823        "Unsupported value type for operation");
29824
29825 // Use the natively supported vector instruction vplzcntd.
29826 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29827 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29828 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29829 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29830
29831 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29832}
29833
29834// Lower CTLZ using a PSHUFB lookup table implementation.
29835static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29836 const X86Subtarget &Subtarget,
29837 SelectionDAG &DAG) {
29838 MVT VT = Op.getSimpleValueType();
29839 int NumElts = VT.getVectorNumElements();
29840 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29841 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29842
29843 // Per-nibble leading zero PSHUFB lookup table.
29844 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29845 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29846 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29847 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
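// Editor's note (worked example, not in the original source): for the byte
// 0x1A the hi nibble is 0x1 (LUT -> 3) and the lo nibble is 0xA (LUT -> 0);
// since the hi nibble is non-zero only the hi result is kept, giving
// ctlz(0x1A) = 3.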
29848
29849 SmallVector<SDValue, 64> LUTVec;
29850 for (int i = 0; i < NumBytes; ++i)
29851 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29852 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29853
29854 // Begin by bitcasting the input to byte vector, then split those bytes
29855 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29856 // If the hi input nibble is zero then we add both results together, otherwise
29857 // we just take the hi result (by masking the lo result to zero before the
29858 // add).
29859 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29860 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29861
29862 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29863 SDValue Lo = Op0;
29864 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29865 SDValue HiZ;
29866 if (CurrVT.is512BitVector()) {
29867 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29868 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29869 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29870 } else {
29871 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29872 }
29873
29874 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29875 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29876 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29877 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29878
29879 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29880 // of the current vector width in the same way we did for the nibbles.
29881 // If the upper half of the input element is zero then add the halves'
29882 // leading zero counts together, otherwise just use the upper half's.
29883 // Double the width of the result until we are at target width.
29884 while (CurrVT != VT) {
29885 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29886 int CurrNumElts = CurrVT.getVectorNumElements();
29887 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29888 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29889 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29890
29891 // Check if the upper half of the input element is zero.
29892 if (CurrVT.is512BitVector()) {
29893 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29894 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29895 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29896 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29897 } else {
29898 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29899 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29900 }
29901 HiZ = DAG.getBitcast(NextVT, HiZ);
29902
29903 // Move the upper/lower halves to the lower bits as we'll be extending to
29904 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29905 // together.
29906 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29907 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29908 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29909 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29910 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29911 CurrVT = NextVT;
29912 }
29913
29914 return Res;
29915}
29916
29917static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29918 const X86Subtarget &Subtarget,
29919 SelectionDAG &DAG) {
29920 MVT VT = Op.getSimpleValueType();
29921
29922 if (Subtarget.hasCDI() &&
29923 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29924 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29925 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29926
29927 // Decompose 256-bit ops into smaller 128-bit ops.
29928 if (VT.is256BitVector() && !Subtarget.hasInt256())
29929 return splitVectorIntUnary(Op, DAG);
29930
29931 // Decompose 512-bit ops into smaller 256-bit ops.
29932 if (VT.is512BitVector() && !Subtarget.hasBWI())
29933 return splitVectorIntUnary(Op, DAG);
29934
29935 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29936 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29937}
29938
29939static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29940 SelectionDAG &DAG) {
29941 MVT VT = Op.getSimpleValueType();
29942 MVT OpVT = VT;
29943 unsigned NumBits = VT.getSizeInBits();
29944 SDLoc dl(Op);
29945 unsigned Opc = Op.getOpcode();
29946
29947 if (VT.isVector())
29948 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29949
29950 Op = Op.getOperand(0);
29951 if (VT == MVT::i8) {
29952 // Zero extend to i32 since there is no i8 bsr.
29953 OpVT = MVT::i32;
29954 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29955 }
29956
29957 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29958 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29959 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29960
29961 if (Opc == ISD::CTLZ) {
29962 // If src is zero (i.e. bsr sets ZF), returns NumBits.
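// Editor's note: the CMOV constant is 2*NumBits-1 so that the final
// XOR with NumBits-1 below yields NumBits for a zero input.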
29963 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29964 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29965 Op.getValue(1)};
29966 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29967 }
29968
29969 // Finally xor with NumBits-1.
29970 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29971 DAG.getConstant(NumBits - 1, dl, OpVT));
29972
29973 if (VT == MVT::i8)
29974 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29975 return Op;
29976}
29977
29978static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29979 SelectionDAG &DAG) {
29980 MVT VT = Op.getSimpleValueType();
29981 unsigned NumBits = VT.getScalarSizeInBits();
29982 SDValue N0 = Op.getOperand(0);
29983 SDLoc dl(Op);
29984
29985 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29986        "Only scalar CTTZ requires custom lowering");
29987
29988 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29989 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29990 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
29991
29992 // If src is known never zero we can skip the CMOV.
29993 if (DAG.isKnownNeverZero(N0))
29994 return Op;
29995
29996 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29997 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29998 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29999 Op.getValue(1)};
30000 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30001}
30002
30003static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30004 const X86Subtarget &Subtarget) {
30005 MVT VT = Op.getSimpleValueType();
30006 if (VT == MVT::i16 || VT == MVT::i32)
30007 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30008
30009 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30010 return splitVectorIntBinary(Op, DAG);
30011
30012 assert(Op.getSimpleValueType().is256BitVector() &&
30013        Op.getSimpleValueType().isInteger() &&
30014        "Only handle AVX 256-bit vector integer operation");
30015 return splitVectorIntBinary(Op, DAG);
30016}
30017
30018static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30019 const X86Subtarget &Subtarget) {
30020 MVT VT = Op.getSimpleValueType();
30021 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30022 unsigned Opcode = Op.getOpcode();
30023 SDLoc DL(Op);
30024
30025 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30026 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30027 assert(Op.getSimpleValueType().isInteger() &&
30028        "Only handle AVX vector integer operation");
30029 return splitVectorIntBinary(Op, DAG);
30030 }
30031
30032 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30034 EVT SetCCResultType =
30035 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30036
30037 unsigned BitWidth = VT.getScalarSizeInBits();
30038 if (Opcode == ISD::USUBSAT) {
30039 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30040 // Handle a special-case with a bit-hack instead of cmp+select:
30041 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30042 // If the target can use VPTERNLOG, DAGToDAG will match this as
30043 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30044 // "broadcast" constant load.
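// Editor's note (worked example for i8, illustrative only): with
// SMIN = 0x80, X = 0x90 gives (0x90 ^ 0x80) & (0x90 s>> 7) = 0x10 & 0xFF
// = 0x10 = usubsat(0x90, 0x80); X = 0x10 gives 0x90 & 0x00 = 0.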
30045 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30046 if (C && C->getAPIntValue().isSignMask()) {
30047 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30048 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30049 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30050 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30051 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30052 }
30053 }
30054 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30055 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30056 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30057 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30058 // TODO: Move this to DAGCombiner?
30059 if (SetCCResultType == VT &&
30060 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30061 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30062 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30063 }
30064 }
30065
30066 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30067 (!VT.isVector() || VT == MVT::v2i64)) {
30068 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30069 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30070 SDValue Zero = DAG.getConstant(0, DL, VT);
30071 SDValue Result =
30072 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30073 DAG.getVTList(VT, SetCCResultType), X, Y);
30074 SDValue SumDiff = Result.getValue(0);
30075 SDValue Overflow = Result.getValue(1);
30076 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30077 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30078 SDValue SumNeg =
30079 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30080 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30081 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30082 }
30083
30084 // Use default expansion.
30085 return SDValue();
30086}
30087
30088static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30089 SelectionDAG &DAG) {
30090 MVT VT = Op.getSimpleValueType();
30091 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30092 // Since X86 does not have CMOV for 8-bit integers, we don't convert
30093 // 8-bit integer abs to NEG and CMOV.
30094 SDLoc DL(Op);
30095 SDValue N0 = Op.getOperand(0);
30096 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30097 DAG.getConstant(0, DL, VT), N0);
30098 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30099 SDValue(Neg.getNode(), 1)};
30100 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30101 }
30102
30103 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30104 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30105 SDLoc DL(Op);
30106 SDValue Src = Op.getOperand(0);
30107 SDValue Sub =
30108 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30109 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30110 }
30111
30112 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30113 assert(VT.isInteger() &&
30114        "Only handle AVX 256-bit vector integer operation");
30115 return splitVectorIntUnary(Op, DAG);
30116 }
30117
30118 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30119 return splitVectorIntUnary(Op, DAG);
30120
30121 // Default to expand.
30122 return SDValue();
30123}
30124
30125static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30126 SelectionDAG &DAG) {
30127 MVT VT = Op.getSimpleValueType();
30128
30129 // For AVX1 cases, split to use legal ops.
30130 if (VT.is256BitVector() && !Subtarget.hasInt256())
30131 return splitVectorIntBinary(Op, DAG);
30132
30133 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30134 return splitVectorIntBinary(Op, DAG);
30135
30136 // Default to expand.
30137 return SDValue();
30138}
30139
30140static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30141 SelectionDAG &DAG) {
30142 MVT VT = Op.getSimpleValueType();
30143
30144 // For AVX1 cases, split to use legal ops.
30145 if (VT.is256BitVector() && !Subtarget.hasInt256())
30146 return splitVectorIntBinary(Op, DAG);
30147
30148 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30149 return splitVectorIntBinary(Op, DAG);
30150
30151 // umax(x,1) --> sub(x,cmpeq(x,0))
30152 // TODO: Move this to expandIntMINMAX?
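// Editor's note: cmpeq(x,0) is all-ones exactly when x == 0, so the
// subtraction yields x - (-1) = 1 for zero inputs and x - 0 = x otherwise.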
30153 if (VT.isVector() && Op.getOpcode() == ISD::UMAX &&
30154 llvm::isOneOrOneSplat(Op.getOperand(1), true)) {
30155 SDLoc DL(Op);
30156 SDValue X = DAG.getFreeze(Op.getOperand(0));
30157 SDValue Zero = getZeroVector(VT, Subtarget, DAG, DL);
30158 return DAG.getNode(ISD::SUB, DL, VT, X,
30159 DAG.getSetCC(DL, VT, X, Zero, ISD::SETEQ));
30160 }
30161
30162 // Default to expand.
30163 return SDValue();
30164}
30165
30166static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30167 SelectionDAG &DAG) {
30168 MVT VT = Op.getSimpleValueType();
30169
30170 // For AVX1 cases, split to use legal ops.
30171 if (VT.is256BitVector() && !Subtarget.hasInt256())
30172 return splitVectorIntBinary(Op, DAG);
30173
30174 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30175 return splitVectorIntBinary(Op, DAG);
30176
30177 // TODO: Add TargetLowering expandABD() support.
30178 SDLoc dl(Op);
30179 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30180 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30181 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30183
30184 // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
30185 // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
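// Editor's note (illustrative): abds(3, -5) = smax(3,-5) - smin(3,-5)
// = 3 - (-5) = 8.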
30186 unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
30187 unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
30188 if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
30189 SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
30190 SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
30191 return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
30192 }
30193
30194 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30195 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30196 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30197 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
30198 SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
30199 return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
30200 DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
30201}
30202
30203static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30204 SelectionDAG &DAG) {
30205 SDLoc dl(Op);
30206 MVT VT = Op.getSimpleValueType();
30207
30208 // Decompose 256-bit ops into 128-bit ops.
30209 if (VT.is256BitVector() && !Subtarget.hasInt256())
30210 return splitVectorIntBinary(Op, DAG);
30211
30212 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30213 return splitVectorIntBinary(Op, DAG);
30214
30215 SDValue A = Op.getOperand(0);
30216 SDValue B = Op.getOperand(1);
30217
30218 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30219 // vector pairs, multiply and truncate.
30220 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30221 unsigned NumElts = VT.getVectorNumElements();
30222
30223 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30224 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30225 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30226 return DAG.getNode(
30227 ISD::TRUNCATE, dl, VT,
30228 DAG.getNode(ISD::MUL, dl, ExVT,
30229 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30230 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30231 }
30232
30233 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30234
30235 // Extract the lo/hi parts and any-extend them to i16.
30236 // We're going to mask off the low byte of each result element of the
30237 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30238 // element.
30239 SDValue Undef = DAG.getUNDEF(VT);
30240 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30241 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30242
30243 SDValue BLo, BHi;
30244 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30245 // If the RHS is a constant, manually unpackl/unpackh.
30246 SmallVector<SDValue, 16> LoOps, HiOps;
30247 for (unsigned i = 0; i != NumElts; i += 16) {
30248 for (unsigned j = 0; j != 8; ++j) {
30249 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30250 MVT::i16));
30251 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30252 MVT::i16));
30253 }
30254 }
30255
30256 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30257 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30258 } else {
30259 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30260 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30261 }
30262
30263 // Multiply, mask the lower 8bits of the lo/hi results and pack.
30264 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30265 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30266 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30267 }
30268
30269 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
30270 if (VT == MVT::v4i32) {
30271 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30272        "Should not custom lower when pmulld is available!");
30273
30274 // Extract the odd parts.
30275 static const int UnpackMask[] = { 1, -1, 3, -1 };
30276 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30277 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30278
30279 // Multiply the even parts.
30280 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30281 DAG.getBitcast(MVT::v2i64, A),
30282 DAG.getBitcast(MVT::v2i64, B));
30283 // Now multiply odd parts.
30284 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30285 DAG.getBitcast(MVT::v2i64, Aodds),
30286 DAG.getBitcast(MVT::v2i64, Bodds));
30287
30288 Evens = DAG.getBitcast(VT, Evens);
30289 Odds = DAG.getBitcast(VT, Odds);
30290
30291 // Merge the two vectors back together with a shuffle. This expands into 2
30292 // shuffles.
30293 static const int ShufMask[] = { 0, 4, 2, 6 };
30294 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30295 }
30296
30297 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30298        "Only know how to lower V2I64/V4I64/V8I64 multiply");
30299 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30300
30301 // Ahi = psrlqi(a, 32);
30302 // Bhi = psrlqi(b, 32);
30303 //
30304 // AloBlo = pmuludq(a, b);
30305 // AloBhi = pmuludq(a, Bhi);
30306 // AhiBlo = pmuludq(Ahi, b);
30307 //
30308 // Hi = psllqi(AloBhi + AhiBlo, 32);
30309 // return AloBlo + Hi;
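// Editor's note: this is the schoolbook decomposition
// a*b mod 2^64 = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32), with the aHi*bHi
// term dropped because it is shifted out of the low 64 bits.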
30310 KnownBits AKnown = DAG.computeKnownBits(A);
30311 KnownBits BKnown = DAG.computeKnownBits(B);
30312
30313 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30314 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30315 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30316
30317 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30318 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30319 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30320
30321 SDValue Zero = DAG.getConstant(0, dl, VT);
30322
30323 // Only multiply lo/hi halves that aren't known to be zero.
30324 SDValue AloBlo = Zero;
30325 if (!ALoIsZero && !BLoIsZero)
30326 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30327
30328 SDValue AloBhi = Zero;
30329 if (!ALoIsZero && !BHiIsZero) {
30330 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30331 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30332 }
30333
30334 SDValue AhiBlo = Zero;
30335 if (!AHiIsZero && !BLoIsZero) {
30336 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30337 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30338 }
30339
30340 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30341 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30342
30343 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30344}
30345
30346static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30347 MVT VT, bool IsSigned,
30348 const X86Subtarget &Subtarget,
30349 SelectionDAG &DAG,
30350 SDValue *Low = nullptr) {
30351 unsigned NumElts = VT.getVectorNumElements();
30352
30353 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30354 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30355 // lane results back together.
30356
30357 // We'll take different approaches for signed and unsigned.
30358 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30359 // and use pmullw to calculate the full 16-bit product.
30360 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30361 // shift them left into the upper byte of each word. This allows us to use
30362 // pmulhw to calculate the full 16-bit product. This trick means we don't
30363 // need to sign extend the bytes to use pmullw.
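// Editor's note: with both inputs shifted into the high byte, pmulhw
// computes ((a << 8) * (b << 8)) >> 16 = a * b as a signed 16-bit value,
// so both product halves can be recovered by the packs below.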
30364
30365 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30366 SDValue Zero = DAG.getConstant(0, dl, VT);
30367
30368 SDValue ALo, AHi;
30369 if (IsSigned) {
30370 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30371 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30372 } else {
30373 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30374 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30375 }
30376
30377 SDValue BLo, BHi;
30378 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30379 // If the RHS is a constant, manually unpackl/unpackh and extend.
30380 SmallVector<SDValue, 16> LoOps, HiOps;
30381 for (unsigned i = 0; i != NumElts; i += 16) {
30382 for (unsigned j = 0; j != 8; ++j) {
30383 SDValue LoOp = B.getOperand(i + j);
30384 SDValue HiOp = B.getOperand(i + j + 8);
30385
30386 if (IsSigned) {
30387 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30388 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30389 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30390 DAG.getConstant(8, dl, MVT::i16));
30391 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30392 DAG.getConstant(8, dl, MVT::i16));
30393 } else {
30394 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30395 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30396 }
30397
30398 LoOps.push_back(LoOp);
30399 HiOps.push_back(HiOp);
30400 }
30401 }
30402
30403 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30404 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30405 } else if (IsSigned) {
30406 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30407 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30408 } else {
30409 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30410 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30411 }
30412
30413 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
30414 // pack back to vXi8.
30415 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30416 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30417 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30418
30419 if (Low)
30420 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30421
30422 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30423}
30424
30425static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30426 SelectionDAG &DAG) {
30427 SDLoc dl(Op);
30428 MVT VT = Op.getSimpleValueType();
30429 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30430 unsigned NumElts = VT.getVectorNumElements();
30431 SDValue A = Op.getOperand(0);
30432 SDValue B = Op.getOperand(1);
30433
30434 // Decompose 256-bit ops into 128-bit ops.
30435 if (VT.is256BitVector() && !Subtarget.hasInt256())
30436 return splitVectorIntBinary(Op, DAG);
30437
30438 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30439 return splitVectorIntBinary(Op, DAG);
30440
30441 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30442 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30443        (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30444        (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30445
30446 // PMULxD operations multiply each even value (starting at 0) of LHS with
30447 // the related value of RHS and produce a widened result.
30448 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30449 // => <2 x i64> <ae|cg>
30450 //
30451 // In other words, to have all the results, we need to perform two PMULxD:
30452 // 1. one with the even values.
30453 // 2. one with the odd values.
30454 // To achieve #2, we need to place the odd values at an even position.
30455 //
30456 // Place the odd value at an even position (basically, shift all values 1
30457 // step to the left):
30458 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30459 9, -1, 11, -1, 13, -1, 15, -1};
30460 // <a|b|c|d> => <b|undef|d|undef>
30461 SDValue Odd0 =
30462 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30463 // <e|f|g|h> => <f|undef|h|undef>
30464 SDValue Odd1 =
30465 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30466
30467 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30468 // ints.
30469 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30470 unsigned Opcode =
30471 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30472 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30473 // => <2 x i64> <ae|cg>
30474 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30475 DAG.getBitcast(MulVT, A),
30476 DAG.getBitcast(MulVT, B)));
30477 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30478 // => <2 x i64> <bf|dh>
30479 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30480 DAG.getBitcast(MulVT, Odd0),
30481 DAG.getBitcast(MulVT, Odd1)));
30482
30483 // Shuffle it back into the right order.
30484 SmallVector<int, 16> ShufMask(NumElts);
30485 for (int i = 0; i != (int)NumElts; ++i)
30486 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
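// Editor's note (worked example, NumElts = 4): the mask becomes
// {1, 5, 3, 7}, i.e. the odd (high) 32-bit halves of Mul1 and Mul2
// interleaved back into <ae_hi|bf_hi|cg_hi|dh_hi>.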
30487
30488 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30489
30490 // If we have a signed multiply but no PMULDQ fix up the result of an
30491 // unsigned multiply.
30492 if (IsSigned && !Subtarget.hasSSE41()) {
30493 SDValue Zero = DAG.getConstant(0, dl, VT);
30494 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30495 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30496 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30497 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30498
30499 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30500 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30501 }
30502
30503 return Res;
30504 }
30505
30506 // Only i8 vectors should need custom lowering after this.
30507 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30508         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30509        "Unsupported vector type");
30510
30511 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30512 // logical shift down the upper half and pack back to i8.
30513
30514 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30515 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30516
30517 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30518 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30519 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30520 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30521 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30522 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30523 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30524 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30525 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30526 }
30527
30528 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30529}
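
The signed fix-up above uses the identity mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0), evaluated modulo 2^32. Below is a minimal standalone scalar sketch of that identity; it is not part of X86ISelLowering.cpp, the helper names are made up for illustration, and it assumes ordinary two's-complement narrowing and arithmetic >> on signed values for the reference result.

#include <cassert>
#include <cstdint>

// High 32 bits of an unsigned 32x32->64 multiply: the scalar analogue of one
// PMULUDQ lane.
static uint32_t mulhu32(uint32_t a, uint32_t b) {
  return (uint32_t)(((uint64_t)a * b) >> 32);
}

// Signed multiply-high rebuilt from the unsigned one, mirroring the
// "fix up the result of an unsigned multiply" block above:
//   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)  (mod 2^32)
static int32_t mulhs32FromMulhu(int32_t a, int32_t b) {
  uint32_t res = mulhu32((uint32_t)a, (uint32_t)b);
  uint32_t t1 = (a < 0) ? (uint32_t)b : 0u;
  uint32_t t2 = (b < 0) ? (uint32_t)a : 0u;
  return (int32_t)(res - t1 - t2);
}

int main() {
  const int32_t vals[] = {0, 1, -1, 7, -7, 123456789, INT32_MIN, INT32_MAX};
  for (int32_t a : vals)
    for (int32_t b : vals)
      // Reference: high word of the full 64-bit signed product (arithmetic >>).
      assert(mulhs32FromMulhu(a, b) == (int32_t)(((int64_t)a * b) >> 32));
  return 0;
}
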
30530
30531// Custom lowering for SMULO/UMULO.
30532static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30533 SelectionDAG &DAG) {
30534 MVT VT = Op.getSimpleValueType();
30535
30536 // Scalars defer to LowerXALUO.
30537 if (!VT.isVector())
30538 return LowerXALUO(Op, DAG);
30539
30540 SDLoc dl(Op);
30541 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30542 SDValue A = Op.getOperand(0);
30543 SDValue B = Op.getOperand(1);
30544 EVT OvfVT = Op->getValueType(1);
30545
30546 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30547 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30548 // Extract the LHS Lo/Hi vectors
30549 SDValue LHSLo, LHSHi;
30550 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30551
30552 // Extract the RHS Lo/Hi vectors
30553 SDValue RHSLo, RHSHi;
30554 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30555
30556 EVT LoOvfVT, HiOvfVT;
30557 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30558 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30559 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30560
30561 // Issue the split operations.
30562 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30563 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30564
30565 // Join the separate data results and the overflow results.
30566 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30567 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30568 Hi.getValue(1));
30569
30570 return DAG.getMergeValues({Res, Ovf}, dl);
30571 }
30572
30573 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30574 EVT SetccVT =
30575 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30576
30577 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30578 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30579 unsigned NumElts = VT.getVectorNumElements();
30580 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30581 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30582 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30583 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30584 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30585
30586 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30587
30588 SDValue Ovf;
30589 if (IsSigned) {
30590 SDValue High, LowSign;
30591 if (OvfVT.getVectorElementType() == MVT::i1 &&
30592 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30593 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30594 // Shift the high down filling with sign bits.
30595 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30596 // Fill all 16 bits with the sign bit from the low.
30597 LowSign =
30598 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30599 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30600 15, DAG);
30601 SetccVT = OvfVT;
30602 if (!Subtarget.hasBWI()) {
30603 // We can't do a vXi16 compare so sign extend to v16i32.
30604 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30605 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30606 }
30607 } else {
30608 // Otherwise do the compare at vXi8.
30609 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30610 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30611 LowSign =
30612 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30613 }
30614
30615 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30616 } else {
30617 SDValue High =
30618 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30619 if (OvfVT.getVectorElementType() == MVT::i1 &&
30620 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30621 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30622 SetccVT = OvfVT;
30623 if (!Subtarget.hasBWI()) {
30624 // We can't do a vXi16 compare so sign extend to v16i32.
30625 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30626 }
30627 } else {
30628 // Otherwise do the compare at vXi8.
30629 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30630 }
30631
30632 Ovf =
30633 DAG.getSetCC(dl, SetccVT, High,
30634 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30635 }
30636
30637 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30638
30639 return DAG.getMergeValues({Low, Ovf}, dl);
30640 }
30641
30642 SDValue Low;
30643 SDValue High =
30644 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30645
30646 SDValue Ovf;
30647 if (IsSigned) {
30648 // SMULO overflows if the high bits don't match the sign of the low.
30649 SDValue LowSign =
30650 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30651 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30652 } else {
30653 // UMULO overflows if the high bits are non-zero.
30654 Ovf =
30655 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30656 }
30657
30658 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30659
30660 return DAG.getMergeValues({Low, Ovf}, dl);
30661}
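
The overflow tests in LowerMULO above reduce to: multiply in a twice-as-wide type, keep the low half, and flag overflow when the high half is non-zero (UMULO) or differs from the sign of the low half (SMULO). A standalone scalar sketch of those two checks for 8-bit lanes follows; it is illustrative only, not part of the file, and assumes ordinary two's-complement narrowing.

#include <cassert>
#include <cstdint>

// Scalar model of the vXi8 SMULO/UMULO lowering above: multiply in a wider
// type, keep the low byte, and derive the overflow flag from the high byte.
struct MulResult { uint8_t Low; bool Overflow; };

static MulResult umulo8(uint8_t a, uint8_t b) {
  uint16_t mul = (uint16_t)((uint16_t)a * (uint16_t)b); // zero-extended multiply
  uint8_t low = (uint8_t)mul;
  uint8_t high = (uint8_t)(mul >> 8);
  return {low, high != 0};                              // UMULO: high byte non-zero
}

static MulResult smulo8(int8_t a, int8_t b) {
  int16_t mul = (int16_t)((int16_t)a * (int16_t)b);     // sign-extended multiply
  uint8_t low = (uint8_t)mul;
  int8_t high = (int8_t)(uint8_t)((uint16_t)mul >> 8);
  int8_t lowSign = ((int8_t)low < 0) ? -1 : 0;          // sign of the low byte
  return {low, high != lowSign};                        // SMULO: high != sign of low
}

int main() {
  for (int a = -128; a < 128; ++a)
    for (int b = -128; b < 128; ++b) {
      int p = a * b;
      assert(smulo8((int8_t)a, (int8_t)b).Overflow == (p < -128 || p > 127));
    }
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b) {
      int p = a * b;
      assert(umulo8((uint8_t)a, (uint8_t)b).Overflow == (p > 255));
    }
  return 0;
}
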
30662
30663SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30664   assert(Subtarget.isTargetWin64() && "Unexpected target");
30665 EVT VT = Op.getValueType();
30666   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30667          "Unexpected return type for lowering");
30668
30669 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30670 SmallVector<SDValue> Result;
30671 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30672 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30673 }
30674
30675 RTLIB::Libcall LC;
30676 bool isSigned;
30677 switch (Op->getOpcode()) {
30678   default: llvm_unreachable("Unexpected request for libcall!");
30679 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30680 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30681 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30682 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30683 }
30684
30685 SDLoc dl(Op);
30686 SDValue InChain = DAG.getEntryNode();
30687
30688 TargetLowering::ArgListTy Args;
30689 TargetLowering::ArgListEntry Entry;
30690 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30691 EVT ArgVT = Op->getOperand(i).getValueType();
30692     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30693            "Unexpected argument type for lowering");
30694 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30695 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30696 MachinePointerInfo MPI =
30697 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30698 Entry.Node = StackPtr;
30699 InChain =
30700 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30701 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30702 Entry.Ty = PointerType::get(ArgTy,0);
30703 Entry.IsSExt = false;
30704 Entry.IsZExt = false;
30705 Args.push_back(Entry);
30706 }
30707
30708 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30709 getPointerTy(DAG.getDataLayout()));
30710
30711 TargetLowering::CallLoweringInfo CLI(DAG);
30712 CLI.setDebugLoc(dl)
30713 .setChain(InChain)
30714 .setLibCallee(
30715 getLibcallCallingConv(LC),
30716 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30717 std::move(Args))
30718 .setInRegister()
30719 .setSExtResult(isSigned)
30720 .setZExtResult(!isSigned);
30721
30722 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30723 return DAG.getBitcast(VT, CallInfo.first);
30724}
30725
30726SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30727 SelectionDAG &DAG,
30728 SDValue &Chain) const {
30729   assert(Subtarget.isTargetWin64() && "Unexpected target");
30730 EVT VT = Op.getValueType();
30731 bool IsStrict = Op->isStrictFPOpcode();
30732
30733 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30734 EVT ArgVT = Arg.getValueType();
30735
30736   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30737          "Unexpected return type for lowering");
30738
30739 RTLIB::Libcall LC;
30740 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30741 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30742 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30743 else
30744 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30745   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30746
30747 SDLoc dl(Op);
30748 MakeLibCallOptions CallOptions;
30749 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30750
30751 SDValue Result;
30752 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30753 // expected VT (i128).
30754 std::tie(Result, Chain) =
30755 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30756 Result = DAG.getBitcast(VT, Result);
30757 return Result;
30758}
30759
30760SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30761 SelectionDAG &DAG) const {
30762   assert(Subtarget.isTargetWin64() && "Unexpected target");
30763 EVT VT = Op.getValueType();
30764 bool IsStrict = Op->isStrictFPOpcode();
30765
30766 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30767 EVT ArgVT = Arg.getValueType();
30768
30769   assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30770          "Unexpected argument type for lowering");
30771
30772 RTLIB::Libcall LC;
30773 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30774 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30775 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30776 else
30777 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30778   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30779
30780 SDLoc dl(Op);
30781 MakeLibCallOptions CallOptions;
30782 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30783
30784 // Pass the i128 argument as an indirect argument on the stack.
30785 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30786 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30787 MachinePointerInfo MPI =
30788 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30789 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30790
30791 SDValue Result;
30792 std::tie(Result, Chain) =
30793 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30794 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30795}
30796
30797// Return true if the required (according to Opcode) shift-imm form is natively
30798// supported by the Subtarget
30799static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30800 unsigned Opcode) {
30801 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30802 return false;
30803
30804 if (VT.getScalarSizeInBits() < 16)
30805 return false;
30806
30807 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30808 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30809 return true;
30810
30811 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30812 (VT.is256BitVector() && Subtarget.hasInt256());
30813
30814 bool AShift = LShift && (Subtarget.hasAVX512() ||
30815 (VT != MVT::v2i64 && VT != MVT::v4i64));
30816 return (Opcode == ISD::SRA) ? AShift : LShift;
30817}
30818
30819// The shift amount is a variable, but it is the same for all vector lanes.
30820// These instructions are defined together with shift-immediate.
30821static
30822bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30823 unsigned Opcode) {
30824 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30825}
30826
30827// Return true if the required (according to Opcode) variable-shift form is
30828// natively supported by the Subtarget
30829static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30830 unsigned Opcode) {
30831 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30832 return false;
30833
30834 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30835 return false;
30836
30837 // vXi16 supported only on AVX-512, BWI
30838 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30839 return false;
30840
30841 if (Subtarget.hasAVX512() &&
30842 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30843 return true;
30844
30845 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30846 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30847 return (Opcode == ISD::SRA) ? AShift : LShift;
30848}
30849
30850static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30851 const X86Subtarget &Subtarget) {
30852 MVT VT = Op.getSimpleValueType();
30853 SDLoc dl(Op);
30854 SDValue R = Op.getOperand(0);
30855 SDValue Amt = Op.getOperand(1);
30856 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30857
30858 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30859     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30860 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30861 SDValue Ex = DAG.getBitcast(ExVT, R);
30862
30863 // ashr(R, 63) === cmp_slt(R, 0)
30864 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30865       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30866              "Unsupported PCMPGT op");
30867 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30868 }
30869
30870 if (ShiftAmt >= 32) {
30871 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30872 SDValue Upper =
30873 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30874 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30875 ShiftAmt - 32, DAG);
30876 if (VT == MVT::v2i64)
30877 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30878 if (VT == MVT::v4i64)
30879 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30880 {9, 1, 11, 3, 13, 5, 15, 7});
30881 } else {
30882 // SRA upper i32, SRL whole i64 and select lower i32.
30883 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30884 ShiftAmt, DAG);
30885 SDValue Lower =
30886 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30887 Lower = DAG.getBitcast(ExVT, Lower);
30888 if (VT == MVT::v2i64)
30889 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30890 if (VT == MVT::v4i64)
30891 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30892 {8, 1, 10, 3, 12, 5, 14, 7});
30893 }
30894 return DAG.getBitcast(VT, Ex);
30895 };
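
ArithmeticShiftRight64 above rebuilds a 64-bit arithmetic shift right from 32-bit VSRAI/VSRLI pieces plus a shuffle. The following standalone scalar sketch mirrors that recombination; it is illustrative only, not part of the file, and like the reference comparison it assumes arithmetic behaviour of >> on signed values, which mainstream compilers provide.

#include <cassert>
#include <cstdint>

// Scalar model of ArithmeticShiftRight64: rebuild a 64-bit arithmetic shift
// right from 32-bit operations, the way the lowering combines VSRAI/VSRLI
// results with a shuffle.
static int64_t ashr64Via32(int64_t v, unsigned amt) {
  int32_t hi = (int32_t)((uint64_t)v >> 32);
  uint32_t resLo, resHi;
  if (amt >= 32) {
    // Low half comes from the high half shifted by (amt - 32); the new high
    // half is just the replicated sign bit.
    resLo = (uint32_t)(hi >> (amt - 32));
    resHi = (uint32_t)(hi >> 31);
  } else {
    // High half: arithmetic shift of the original high half.  Low half:
    // logical shift of the whole value, which pulls bits down from the high
    // half, keeping only the low 32 bits.
    resHi = (uint32_t)(hi >> amt);
    resLo = (uint32_t)((uint64_t)v >> amt);
  }
  return (int64_t)(((uint64_t)resHi << 32) | resLo);
}

int main() {
  const int64_t vals[] = {0, 1, -1, -42, 0x123456789abcdef0LL,
                          INT64_MIN, INT64_MAX};
  for (int64_t v : vals)
    for (unsigned amt = 0; amt < 64; ++amt)
      assert(ashr64Via32(v, amt) == (v >> amt)); // reference: built-in ashr
  return 0;
}
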
30896
30897 // Optimize shl/srl/sra with constant shift amount.
30898 APInt APIntShiftAmt;
30899 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30900 return SDValue();
30901
30902 // If the shift amount is out of range, return undef.
30903 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30904 return DAG.getUNDEF(VT);
30905
30906 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30907
30908 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30909 // Hardware support for vector shifts is sparse which makes us scalarize the
30910 // vector operations in many cases. Also, on sandybridge ADD is faster than
30911 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30912 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30913 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30914 // must be 0). (add undef, undef) however can be any value. To make this
30915 // safe, we must freeze R to ensure that register allocation uses the same
30916 // register for an undefined value. This ensures that the result will
30917 // still be even and preserves the original semantics.
30918 R = DAG.getFreeze(R);
30919 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30920 }
30921
30922 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30923 }
30924
30925 // i64 SRA needs to be performed as partial shifts.
30926 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30927 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30928 Op.getOpcode() == ISD::SRA)
30929 return ArithmeticShiftRight64(ShiftAmt);
30930
30931 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30932 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30933 unsigned NumElts = VT.getVectorNumElements();
30934 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30935
30936 // Simple i8 add case
30937 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30938 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30939 // must be 0). (add undef, undef) however can be any value. To make this
30940 // safe, we must freeze R to ensure that register allocation uses the same
30941 // register for an undefined value. This ensures that the result will
30942 // still be even and preserves the original semantics.
30943 R = DAG.getFreeze(R);
30944 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30945 }
30946
30947 // ashr(R, 7) === cmp_slt(R, 0)
30948 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30949 SDValue Zeros = DAG.getConstant(0, dl, VT);
30950 if (VT.is512BitVector()) {
30951         assert(VT == MVT::v64i8 && "Unexpected element type!");
30952 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30953 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30954 }
30955 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30956 }
30957
30958 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30959 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30960 return SDValue();
30961
30962 if (Op.getOpcode() == ISD::SHL) {
30963 // Make a large shift.
30964 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30965 ShiftAmt, DAG);
30966 SHL = DAG.getBitcast(VT, SHL);
30967 // Zero out the rightmost bits.
30968 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30969 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30970 }
30971 if (Op.getOpcode() == ISD::SRL) {
30972 // Make a large shift.
30973 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30974 ShiftAmt, DAG);
30975 SRL = DAG.getBitcast(VT, SRL);
30976 // Zero out the leftmost bits.
30977 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30978 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30979 }
30980 if (Op.getOpcode() == ISD::SRA) {
30981 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30982 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30983
30984 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30985 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30986 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30987 return Res;
30988 }
30989     llvm_unreachable("Unknown shift opcode.");
30990 }
30991
30992 return SDValue();
30993}
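
The vXi8 immediate-shift paths above do the shift in 16-bit lanes, mask off the bits that leaked across the byte boundary, and derive the arithmetic shift from the logical one via sub(xor(lshr(R, Amt), Mask), Mask) with Mask = 128 >> ShiftAmt. A standalone scalar sketch of those three steps follows; the helper names are illustrative, the code is not part of the file, and the arithmetic-shift reference assumes the usual arithmetic >> on signed values.

#include <cassert>
#include <cstdint>

// Scalar model of the vXi8 shift-by-immediate path: shift two packed bytes as
// one 16-bit lane, mask away the bits that leaked across the byte boundary,
// and recover an arithmetic shift from the logical one with the
// (x ^ m) - m trick, where m = 128 >> amt.
static uint8_t shl8Via16(uint8_t lo, uint8_t hi, unsigned amt, bool highByte) {
  uint16_t lane = (uint16_t)((uint16_t)lo | ((uint16_t)hi << 8));
  uint16_t shifted = (uint16_t)(lane << amt);
  uint8_t mask = (uint8_t)(0xFFu << amt);   // zero out the leaked low bits
  uint8_t byte = highByte ? (uint8_t)(shifted >> 8) : (uint8_t)shifted;
  return (uint8_t)(byte & mask);
}

static uint8_t lshr8Via16(uint8_t lo, uint8_t hi, unsigned amt, bool highByte) {
  uint16_t lane = (uint16_t)((uint16_t)lo | ((uint16_t)hi << 8));
  uint16_t shifted = (uint16_t)(lane >> amt);
  uint8_t mask = (uint8_t)(0xFFu >> amt);   // zero out the leaked high bits
  uint8_t byte = highByte ? (uint8_t)(shifted >> 8) : (uint8_t)shifted;
  return (uint8_t)(byte & mask);
}

static int8_t ashr8FromLshr(uint8_t x, unsigned amt) {
  uint8_t res = (uint8_t)(x >> amt);        // logical shift
  uint8_t m = (uint8_t)(0x80u >> amt);      // where the sign bit landed
  return (int8_t)(uint8_t)((res ^ m) - m);  // sign-extend from that bit
}

int main() {
  for (unsigned amt = 0; amt < 8; ++amt)
    for (int x = 0; x < 256; ++x) {
      // The high byte receives leaked bits on a left shift; the low byte
      // receives them on a right shift.
      assert(shl8Via16(0xA5, (uint8_t)x, amt, true) == (uint8_t)(x << amt));
      assert(lshr8Via16((uint8_t)x, 0xA5, amt, false) == (uint8_t)(x >> amt));
      const int8_t sx = (int8_t)x;
      assert(ashr8FromLshr((uint8_t)x, amt) == (int8_t)(sx >> amt));
    }
  return 0;
}
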
30994
30995static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30996 const X86Subtarget &Subtarget) {
30997 MVT VT = Op.getSimpleValueType();
30998 SDLoc dl(Op);
30999 SDValue R = Op.getOperand(0);
31000 SDValue Amt = Op.getOperand(1);
31001 unsigned Opcode = Op.getOpcode();
31002 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31003
31004 int BaseShAmtIdx = -1;
31005 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31006 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31007 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31008 Subtarget, DAG);
31009
31010 // vXi8 shifts - shift as v8i16 + mask result.
31011 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31012 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31013 VT == MVT::v64i8) &&
31014 !Subtarget.hasXOP()) {
31015 unsigned NumElts = VT.getVectorNumElements();
31016 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31017 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31018 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31019 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31020
31021 // Create the mask using vXi16 shifts. For shift-rights we need to move
31022 // the upper byte down before splatting the vXi8 mask.
31023 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31024 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31025 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31026 if (Opcode != ISD::SHL)
31027 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31028 8, DAG);
31029 BitMask = DAG.getBitcast(VT, BitMask);
31030 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31031 SmallVector<int, 64>(NumElts, 0));
31032
31033 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31034 DAG.getBitcast(ExtVT, R), BaseShAmt,
31035 BaseShAmtIdx, Subtarget, DAG);
31036 Res = DAG.getBitcast(VT, Res);
31037 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31038
31039 if (Opcode == ISD::SRA) {
31040 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31041 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31042 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31043 SignMask =
31044 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31045 BaseShAmtIdx, Subtarget, DAG);
31046 SignMask = DAG.getBitcast(VT, SignMask);
31047 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31048 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31049 }
31050 return Res;
31051 }
31052 }
31053 }
31054
31055 return SDValue();
31056}
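
The mask construction in the uniform-variable vXi8 path above shifts an all-ones vXi16 value by the runtime amount, moves the upper byte down for right shifts, and splats byte 0 as the per-byte keep-mask. A small standalone sketch of that byte-mask computation (illustrative only, not part of the file):

#include <cassert>
#include <cstdint>

// Scalar model of the mask built in the uniform-variable vXi8 shift path:
// shift an all-ones 16-bit lane by the runtime amount, move the upper byte
// down for right shifts, and take byte 0 as the per-byte keep-mask that the
// vector code splats across all lanes.
static uint8_t byteMaskForShift(bool isLeftShift, unsigned amt) {
  uint16_t bits = 0xFFFF;
  bits = isLeftShift ? (uint16_t)(bits << amt) : (uint16_t)(bits >> amt);
  if (!isLeftShift)
    bits = (uint16_t)(bits >> 8);            // move the upper byte down
  return (uint8_t)bits;                      // byte 0 becomes the splat value
}

int main() {
  for (unsigned amt = 0; amt < 8; ++amt) {
    assert(byteMaskForShift(true, amt) == (uint8_t)(0xFFu << amt));
    assert(byteMaskForShift(false, amt) == (uint8_t)(0xFFu >> amt));
  }
  return 0;
}
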
31057
31058// Convert a shift/rotate left amount to a multiplication scale factor.
31059static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31060 const X86Subtarget &Subtarget,
31061 SelectionDAG &DAG) {
31062 MVT VT = Amt.getSimpleValueType();
31063 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31064 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31065 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31066 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31067 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31068 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31069 return SDValue();
31070
31071 MVT SVT = VT.getVectorElementType();
31072 unsigned SVTBits = SVT.getSizeInBits();
31073 unsigned NumElems = VT.getVectorNumElements();
31074
31075 APInt UndefElts;
31076 SmallVector<APInt> EltBits;
31077 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31078 APInt One(SVTBits, 1);
31079 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31080 for (unsigned I = 0; I != NumElems; ++I) {
31081 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31082 continue;
31083 uint64_t ShAmt = EltBits[I].getZExtValue();
31084 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31085 }
31086 return DAG.getBuildVector(VT, dl, Elts);
31087 }
31088
31089 // If the target doesn't support variable shifts, use either FP conversion
31090 // or integer multiplication to avoid shifting each element individually.
31091 if (VT == MVT::v4i32) {
31092 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31093 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31094 DAG.getConstant(0x3f800000U, dl, VT));
31095 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31096 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31097 }
31098
31099 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31100 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31101 SDValue Z = DAG.getConstant(0, dl, VT);
31102 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31103 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31104 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31105 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31106 if (Subtarget.hasSSE41())
31107 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31108 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31109 }
31110
31111 return SDValue();
31112}
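
For v4i32, convertShiftLeftToScale synthesizes 2^amt by writing the biased exponent directly, i.e. (amt << 23) + 0x3f800000, bitcasting to float and converting back to integer, so that a later multiply performs the left shift. A standalone scalar sketch follows; it is illustrative only, assumes IEEE-754 binary32 floats, and uses an unsigned conversion where the DAG code uses FP_TO_SINT.

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of convertShiftLeftToScale for an i32 lane: place (amt + 127)
// in a float's exponent field, i.e. bits = (amt << 23) + 0x3f800000, read the
// value back as a float (the bitcast), and convert to integer to obtain
// 2^amt.  Multiplying by that scale is then the same as shifting left.
static uint32_t shiftScale32(uint32_t amt) {
  uint32_t bits = (amt << 23) + 0x3f800000u; // exponent = amt + bias(127)
  float f;
  std::memcpy(&f, &bits, sizeof(f));         // the "bitcast" step
  return (uint32_t)f;                        // exactly 2^amt for amt in [0,31]
}

int main() {
  const uint32_t x = 0x12345678u;
  for (uint32_t amt = 0; amt < 32; ++amt)
    assert(x * shiftScale32(amt) == (x << amt)); // multiply == shift (mod 2^32)
  return 0;
}
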
31113
31114static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31115 SelectionDAG &DAG) {
31116 MVT VT = Op.getSimpleValueType();
31117 SDLoc dl(Op);
31118 SDValue R = Op.getOperand(0);
31119 SDValue Amt = Op.getOperand(1);
31120 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31121 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31122
31123 unsigned Opc = Op.getOpcode();
31124 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31125 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31126
31127   assert(VT.isVector() && "Custom lowering only for vector shifts!");
31128   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31129
31130 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31131 return V;
31132
31133 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31134 return V;
31135
31136 if (supportedVectorVarShift(VT, Subtarget, Opc))
31137 return Op;
31138
31139 // i64 vector arithmetic shift can be emulated with the transform:
31140 // M = lshr(SIGN_MASK, Amt)
31141 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31142 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31143 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31144 Opc == ISD::SRA) {
31145 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31146 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31147 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31148 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31149 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31150 return R;
31151 }
31152
31153 // XOP has 128-bit variable logical/arithmetic shifts.
31154 // +ve/-ve Amt = shift left/right.
31155 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31156 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31157 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31158 SDValue Zero = DAG.getConstant(0, dl, VT);
31159 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31160 }
31161 if (Opc == ISD::SHL || Opc == ISD::SRL)
31162 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31163 if (Opc == ISD::SRA)
31164 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31165 }
31166
31167 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31168 // shifts per-lane and then shuffle the partial results back together.
31169 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31170 // Splat the shift amounts so the scalar shifts above will catch it.
31171 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31172 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31173 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31174 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31175 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31176 }
31177
31178 // If possible, lower this shift as a sequence of two shifts by
31179 // constant plus a BLENDing shuffle instead of scalarizing it.
31180 // Example:
31181 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31182 //
31183 // Could be rewritten as:
31184 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31185 //
31186 // The advantage is that the two shifts from the example would be
31187 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31188 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31189 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31190 SDValue Amt1, Amt2;
31191 unsigned NumElts = VT.getVectorNumElements();
31192 SmallVector<int, 8> ShuffleMask;
31193 for (unsigned i = 0; i != NumElts; ++i) {
31194 SDValue A = Amt->getOperand(i);
31195 if (A.isUndef()) {
31196 ShuffleMask.push_back(SM_SentinelUndef);
31197 continue;
31198 }
31199 if (!Amt1 || Amt1 == A) {
31200 ShuffleMask.push_back(i);
31201 Amt1 = A;
31202 continue;
31203 }
31204 if (!Amt2 || Amt2 == A) {
31205 ShuffleMask.push_back(i + NumElts);
31206 Amt2 = A;
31207 continue;
31208 }
31209 break;
31210 }
31211
31212 // Only perform this blend if we can perform it without loading a mask.
31213 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31214 (VT != MVT::v16i16 ||
31215 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31216 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31217 canWidenShuffleElements(ShuffleMask))) {
31218 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31219 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31220 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31221 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31222 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31223 Cst1->getZExtValue(), DAG);
31224 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31225 Cst2->getZExtValue(), DAG);
31226 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31227 }
31228 }
31229 }
31230
31231 // If possible, lower this packed shift into a vector multiply instead of
31232 // expanding it into a sequence of scalar shifts.
31233 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31234 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31235 Subtarget.canExtendTo512BW())))
31236 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31237 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31238
31239 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31240 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31241 if (Opc == ISD::SRL && ConstantAmt &&
31242 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31243 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31244 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31245 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31246 SDValue Zero = DAG.getConstant(0, dl, VT);
31247 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31248 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31249 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31250 }
31251 }
31252
31253 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31254 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31255 // TODO: Special case handling for shift by 0/1, really we can afford either
31256 // of these cases in pre-SSE41/XOP/AVX512 but not both.
31257 if (Opc == ISD::SRA && ConstantAmt &&
31258 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31259 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31260 !Subtarget.hasAVX512()) ||
31261 DAG.isKnownNeverZero(Amt))) {
31262 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31263 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31264 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31265 SDValue Amt0 =
31266 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31267 SDValue Amt1 =
31268 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31269 SDValue Sra1 =
31270 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31271 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31272 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31273 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31274 }
31275 }
31276
31277 // v4i32 Non Uniform Shifts.
31278 // If the shift amount is constant we can shift each lane using the SSE2
31279 // immediate shifts, else we need to zero-extend each lane to the lower i64
31280 // and shift using the SSE2 variable shifts.
31281 // The separate results can then be blended together.
31282 if (VT == MVT::v4i32) {
31283 SDValue Amt0, Amt1, Amt2, Amt3;
31284 if (ConstantAmt) {
31285 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31286 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31287 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31288 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31289 } else {
31290 // The SSE2 shifts use the lower i64 as the same shift amount for
31291 // all lanes and the upper i64 is ignored. On AVX we're better off
31292 // just zero-extending, but for SSE just duplicating the top 16-bits is
31293 // cheaper and has the same effect for out of range values.
31294 if (Subtarget.hasAVX()) {
31295 SDValue Z = DAG.getConstant(0, dl, VT);
31296 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31297 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31298 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31299 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31300 } else {
31301 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31302 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31303 {4, 5, 6, 7, -1, -1, -1, -1});
31304 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31305 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31306 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31307 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31308 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31309 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31310 }
31311 }
31312
31313 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31314 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31315 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31316 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31317 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31318
31319 // Merge the shifted lane results optimally with/without PBLENDW.
31320 // TODO - ideally shuffle combining would handle this.
31321 if (Subtarget.hasSSE41()) {
31322 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31323 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31324 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31325 }
31326 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31327 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31328 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31329 }
31330
31331 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31332 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31333 // make the existing SSE solution better.
31334 // NOTE: We honor preferred vector width before promoting to 512-bits.
31335 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31336 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31337 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31338 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31339 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31340     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31341            "Unexpected vector type");
31342 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31343 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31344 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31345 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31346 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31347 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31348 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31349 }
31350
31351 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31352 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31353 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31354 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31355 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31356 !Subtarget.hasXOP()) {
31357 int NumElts = VT.getVectorNumElements();
31358 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31359
31360 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31361 // isn't legal).
31362 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31363 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31364 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31365 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31366     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31367            "Constant build vector expected");
31368
31369 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31370 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31371 : DAG.getZExtOrTrunc(R, dl, ExVT);
31372 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31373 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31374 return DAG.getZExtOrTrunc(R, dl, VT);
31375 }
31376
31377 SmallVector<SDValue, 16> LoAmt, HiAmt;
31378 for (int i = 0; i != NumElts; i += 16) {
31379 for (int j = 0; j != 8; ++j) {
31380 LoAmt.push_back(Amt.getOperand(i + j));
31381 HiAmt.push_back(Amt.getOperand(i + j + 8));
31382 }
31383 }
31384
31385 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31386 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31387 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31388
31389 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31390 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31391 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31392 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31393 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31394 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31395 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31396 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31397 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31398 }
31399
31400 if (VT == MVT::v16i8 ||
31401 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31402 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31403 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31404
31405 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31406 if (VT.is512BitVector()) {
31407 // On AVX512BW targets we make use of the fact that VSELECT lowers
31408 // to a masked blend which selects bytes based just on the sign bit
31409 // extracted to a mask.
31410 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31411 V0 = DAG.getBitcast(VT, V0);
31412 V1 = DAG.getBitcast(VT, V1);
31413 Sel = DAG.getBitcast(VT, Sel);
31414 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31415 ISD::SETGT);
31416 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31417 } else if (Subtarget.hasSSE41()) {
31418 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31419 // on the sign bit.
31420 V0 = DAG.getBitcast(VT, V0);
31421 V1 = DAG.getBitcast(VT, V1);
31422 Sel = DAG.getBitcast(VT, Sel);
31423 return DAG.getBitcast(SelVT,
31424 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31425 }
31426 // On pre-SSE41 targets we test for the sign bit by comparing to
31427 // zero - a negative value will set all bits of the lanes to true
31428 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31429 SDValue Z = DAG.getConstant(0, dl, SelVT);
31430 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31431 return DAG.getSelect(dl, SelVT, C, V0, V1);
31432 };
31433
31434 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31435 // We can safely do this using i16 shifts as we're only interested in
31436 // the 3 lower bits of each byte.
31437 Amt = DAG.getBitcast(ExtVT, Amt);
31438 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31439 Amt = DAG.getBitcast(VT, Amt);
31440
31441 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31442 // r = VSELECT(r, shift(r, 4), a);
31443 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31444 R = SignBitSelect(VT, Amt, M, R);
31445
31446 // a += a
31447 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31448
31449 // r = VSELECT(r, shift(r, 2), a);
31450 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31451 R = SignBitSelect(VT, Amt, M, R);
31452
31453 // a += a
31454 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31455
31456 // return VSELECT(r, shift(r, 1), a);
31457 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31458 R = SignBitSelect(VT, Amt, M, R);
31459 return R;
31460 }
31461
31462 if (Opc == ISD::SRA) {
31463 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31464 // so we can correctly sign extend. We don't care what happens to the
31465 // lower byte.
31466 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31467 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31468 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31469 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31470 ALo = DAG.getBitcast(ExtVT, ALo);
31471 AHi = DAG.getBitcast(ExtVT, AHi);
31472 RLo = DAG.getBitcast(ExtVT, RLo);
31473 RHi = DAG.getBitcast(ExtVT, RHi);
31474
31475 // r = VSELECT(r, shift(r, 4), a);
31476 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31477 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31478 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31479 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31480
31481 // a += a
31482 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31483 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31484
31485 // r = VSELECT(r, shift(r, 2), a);
31486 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31487 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31488 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31489 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31490
31491 // a += a
31492 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31493 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31494
31495 // r = VSELECT(r, shift(r, 1), a);
31496 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31497 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31498 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31499 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31500
31501 // Logical shift the result back to the lower byte, leaving a zero upper
31502 // byte meaning that we can safely pack with PACKUSWB.
31503 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31504 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31505 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31506 }
31507 }
31508
31509 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31510 MVT ExtVT = MVT::v8i32;
31511 SDValue Z = DAG.getConstant(0, dl, VT);
31512 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31513 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31514 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31515 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31516 ALo = DAG.getBitcast(ExtVT, ALo);
31517 AHi = DAG.getBitcast(ExtVT, AHi);
31518 RLo = DAG.getBitcast(ExtVT, RLo);
31519 RHi = DAG.getBitcast(ExtVT, RHi);
31520 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31521 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31522 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31523 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31524 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31525 }
31526
31527 if (VT == MVT::v8i16) {
31528 // If we have a constant shift amount, the non-SSE41 path is best as
31529 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31530 bool UseSSE41 = Subtarget.hasSSE41() &&
31531 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31532
31533 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31534 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31535 // the sign bit.
31536 if (UseSSE41) {
31537 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31538 V0 = DAG.getBitcast(ExtVT, V0);
31539 V1 = DAG.getBitcast(ExtVT, V1);
31540 Sel = DAG.getBitcast(ExtVT, Sel);
31541 return DAG.getBitcast(
31542 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31543 }
31544 // On pre-SSE41 targets we splat the sign bit - a negative value will
31545 // set all bits of the lanes to true and VSELECT uses that in
31546 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31547 SDValue C =
31548 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31549 return DAG.getSelect(dl, VT, C, V0, V1);
31550 };
31551
31552 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31553 if (UseSSE41) {
31554 // On SSE41 targets we need to replicate the shift mask in both
31555 // bytes for PBLENDVB.
31556 Amt = DAG.getNode(
31557 ISD::OR, dl, VT,
31558 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31559 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31560 } else {
31561 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31562 }
31563
31564 // r = VSELECT(r, shift(r, 8), a);
31565 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31566 R = SignBitSelect(Amt, M, R);
31567
31568 // a += a
31569 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31570
31571 // r = VSELECT(r, shift(r, 4), a);
31572 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31573 R = SignBitSelect(Amt, M, R);
31574
31575 // a += a
31576 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31577
31578 // r = VSELECT(r, shift(r, 2), a);
31579 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31580 R = SignBitSelect(Amt, M, R);
31581
31582 // a += a
31583 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31584
31585 // return VSELECT(r, shift(r, 1), a);
31586 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31587 R = SignBitSelect(Amt, M, R);
31588 return R;
31589 }
31590
31591 // Decompose 256-bit shifts into 128-bit shifts.
31592 if (VT.is256BitVector())
31593 return splitVectorIntBinary(Op, DAG);
31594
31595 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31596 return splitVectorIntBinary(Op, DAG);
31597
31598 return SDValue();
31599}
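
The non-AVX512 vXi8/vXi16 fallback in LowerShift above positions the shift amount's low bits at the lane's sign bit and then applies conditional shifts by 4, 2 and 1 (or 8, 4, 2, 1 for i16), doubling the amount between steps so the next bit becomes the selector for the sign-bit blend. A standalone scalar model of that select ladder for one byte (illustrative only, not part of the file):

#include <cassert>
#include <cstdint>

// Scalar model of the per-byte variable shift: move the amount's three
// meaningful bits to the top, then conditionally shift by 4, 2 and 1,
// selecting on the current top bit (the vector code uses sign-bit blends
// such as PBLENDVB for this, and "a += a" to expose the next bit).
static uint8_t shl8SelectLadder(uint8_t r, uint8_t amt) {
  uint8_t a = (uint8_t)(amt << 5);          // bits [2:0] -> bits [7:5]
  const unsigned steps[] = {4, 2, 1};
  for (unsigned step : steps) {
    if (a & 0x80)                           // "sign bit" select
      r = (uint8_t)(r << step);
    a = (uint8_t)(a + a);                   // expose the next amount bit
  }
  return r;
}

int main() {
  for (int r = 0; r < 256; ++r)
    for (int amt = 0; amt < 8; ++amt)
      assert(shl8SelectLadder((uint8_t)r, (uint8_t)amt) == (uint8_t)(r << amt));
  return 0;
}
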
31600
31601static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31602 SelectionDAG &DAG) {
31603 MVT VT = Op.getSimpleValueType();
31604   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31605          "Unexpected funnel shift opcode!");
31606
31607 SDLoc DL(Op);
31608 SDValue Op0 = Op.getOperand(0);
31609 SDValue Op1 = Op.getOperand(1);
31610 SDValue Amt = Op.getOperand(2);
31611 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31612 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31613
31614 if (VT.isVector()) {
31615 APInt APIntShiftAmt;
31616 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31617
31618 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31619 if (IsFSHR)
31620 std::swap(Op0, Op1);
31621
31622 if (IsCstSplat) {
31623 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31624 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31625 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31626 {Op0, Op1, Imm}, DAG, Subtarget);
31627 }
31628 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31629 {Op0, Op1, Amt}, DAG, Subtarget);
31630 }
31631     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31632             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31633             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31634            "Unexpected funnel shift type!");
31635
31636 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31637 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31638 if (IsCstSplat)
31639 return SDValue();
31640
31641 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31642 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31643 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31644
31645 // Constant vXi16 funnel shifts can be efficiently handled by default.
31646 if (IsCst && EltSizeInBits == 16)
31647 return SDValue();
31648
31649 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31650 unsigned NumElts = VT.getVectorNumElements();
31651 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31652 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31653
31654 // Split 256-bit integers on XOP/pre-AVX2 targets.
31655 // Split 512-bit integers on non 512-bit BWI targets.
31656 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31657 !Subtarget.hasAVX2())) ||
31658 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31659 EltSizeInBits < 32)) {
31660 // Pre-mask the amount modulo using the wider vector.
31661 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31662 return splitVectorOp(Op, DAG);
31663 }
31664
31665 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31666 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31667 int ScalarAmtIdx = -1;
31668 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31669 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31670 if (EltSizeInBits == 16)
31671 return SDValue();
31672
31673 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31674 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31675 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31676 ScalarAmtIdx, Subtarget, DAG);
31677 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31678 ScalarAmtIdx, Subtarget, DAG);
31679 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31680 }
31681 }
31682
31683 MVT WideSVT = MVT::getIntegerVT(
31684 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31685 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31686
31687 // If per-element shifts are legal, fallback to generic expansion.
31688 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31689 return SDValue();
31690
31691 // Attempt to fold as:
31692 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31693 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31694 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31695 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31696 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31697 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31698 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31699 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31700 EltSizeInBits, DAG);
31701 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31702 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31703 if (!IsFSHR)
31704 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31705 EltSizeInBits, DAG);
31706 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31707 }
31708
31709 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31710 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31711 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31712 SDValue Z = DAG.getConstant(0, DL, VT);
31713 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31714 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31715 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31716 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31717 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31718 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31719 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31720 }
31721
31722 // Fallback to generic expansion.
31723 return SDValue();
31724 }
31725 assert(
31726 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31727 "Unexpected funnel shift type!");
31728
31729 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31730 bool OptForSize = DAG.shouldOptForSize();
31731 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31732
31733 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31734 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31735 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31736 !isa<ConstantSDNode>(Amt)) {
31737 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31738 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31739 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31740 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31741 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31742 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31743 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31744 if (IsFSHR) {
31745 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31746 } else {
31747 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31748 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31749 }
31750 return DAG.getZExtOrTrunc(Res, DL, VT);
31751 }
31752
31753 if (VT == MVT::i8 || ExpandFunnel)
31754 return SDValue();
31755
31756 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31757 if (VT == MVT::i16) {
31758 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31759 DAG.getConstant(15, DL, Amt.getValueType()));
31760 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31761 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31762 }
31763
31764 return Op;
31765}
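// Illustrative sketch: a minimal scalar model of the widening expansion used by
// the vector path above ((aext(x) << bw | zext(y)) shifted by z mod bw), shown
// for 8-bit elements widened into a wider integer; fshl8/fshr8 are hypothetical
// helper names.
#include <cstdint>

static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;     // (aext(x) << bw) | zext(y)
  return (uint8_t)((Wide << (Z & 7)) >> 8);   // << (z & (bw-1)), then take the high half
}

static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;
  return (uint8_t)(Wide >> (Z & 7));          // >> (z & (bw-1)), low half is the result
}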
31766
31767static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31768 SelectionDAG &DAG) {
31769 MVT VT = Op.getSimpleValueType();
31770 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31771
31772 SDLoc DL(Op);
31773 SDValue R = Op.getOperand(0);
31774 SDValue Amt = Op.getOperand(1);
31775 unsigned Opcode = Op.getOpcode();
31776 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31777 int NumElts = VT.getVectorNumElements();
31778 bool IsROTL = Opcode == ISD::ROTL;
31779
31780 // Check for constant splat rotation amount.
31781 APInt CstSplatValue;
31782 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31783
31784 // Check for splat rotate by zero.
31785 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31786 return R;
31787
31788 // AVX512 implicitly uses modulo rotation amounts.
31789 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31790 // Attempt to rotate by immediate.
31791 if (IsCstSplat) {
31792 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31793 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31794 return DAG.getNode(RotOpc, DL, VT, R,
31795 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31796 }
31797
31798 // Else, fall-back on VPROLV/VPRORV.
31799 return Op;
31800 }
31801
31802 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31803 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31804 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31805 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31806 }
31807
31808 SDValue Z = DAG.getConstant(0, DL, VT);
31809
31810 if (!IsROTL) {
31811 // If the ISD::ROTR amount is constant, it's always better to convert to
31812 // ISD::ROTL.
31813 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31814 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31815
31816 // XOP targets always prefer ISD::ROTL.
31817 if (Subtarget.hasXOP())
31818 return DAG.getNode(ISD::ROTL, DL, VT, R,
31819 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31820 }
31821
31822 // Split 256-bit integers on XOP/pre-AVX2 targets.
31823 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31824 return splitVectorIntBinary(Op, DAG);
31825
31826 // XOP has 128-bit vector variable + immediate rotates.
31827 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31828 // XOP implicitly uses modulo rotation amounts.
31829 if (Subtarget.hasXOP()) {
31830 assert(IsROTL && "Only ROTL expected");
31831 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31832
31833 // Attempt to rotate by immediate.
31834 if (IsCstSplat) {
31835 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31836 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31837 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31838 }
31839
31840 // Use general rotate by variable (per-element).
31841 return Op;
31842 }
31843
31844 // Rotate by a uniform constant - expand back to shifts.
31845 if (IsCstSplat)
31846 return SDValue();
31847
31848 // Split 512-bit integers on non 512-bit BWI targets.
31849 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31850 return splitVectorIntBinary(Op, DAG);
31851
31852 assert(
31853 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31854 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31855 Subtarget.hasAVX2()) ||
31856 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31857 "Only vXi32/vXi16/vXi8 vector rotates supported");
31858
31859 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31860 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31861
31862 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31863 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31864
31865 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31866 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31867 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31868 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31869 int BaseRotAmtIdx = -1;
31870 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31871 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31872 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31873 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31874 }
31875 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31876 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31877 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31878 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31879 BaseRotAmtIdx, Subtarget, DAG);
31880 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31881 BaseRotAmtIdx, Subtarget, DAG);
31882 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31883 }
31884 }
31885
31886 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31887 // the amount bit.
31888 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31889 if (EltSizeInBits == 8) {
31890 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31891 MVT WideVT =
31892 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31893 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31894
31895 // Attempt to fold as:
31896 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31897 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31898 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31899 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31901 // If we're rotating by a constant, just use default promotion.
31901 if (IsConstAmt)
31902 return SDValue();
31903 // See if we can perform this by widening to vXi16 or vXi32.
31904 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31905 R = DAG.getNode(
31906 ISD::OR, DL, WideVT, R,
31907 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31908 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31909 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31910 if (IsROTL)
31911 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31912 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31913 }
31914
31915 // Attempt to fold as unpack(x,x) << zext(y):
31916 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31917 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31918 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31919 // See if we can perform this by unpacking to lo/hi vXi16.
31920 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31921 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31922 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31923 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31924 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31925 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31926 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31927 }
31928 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31929
31930 // We don't need ModuloAmt here as we just peek at individual bits.
31931 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31932 if (Subtarget.hasSSE41()) {
31933 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31934 // on the sign bit.
31935 V0 = DAG.getBitcast(VT, V0);
31936 V1 = DAG.getBitcast(VT, V1);
31937 Sel = DAG.getBitcast(VT, Sel);
31938 return DAG.getBitcast(SelVT,
31939 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31940 }
31941 // On pre-SSE41 targets we test for the sign bit by comparing to
31942 // zero - a negative value will set all bits of the lanes to true
31943 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31944 SDValue Z = DAG.getConstant(0, DL, SelVT);
31945 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31946 return DAG.getSelect(DL, SelVT, C, V0, V1);
31947 };
31948
31949 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31950 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31951 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31952 IsROTL = true;
31953 }
31954
31955 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31956 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31957
31958 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31959 // We can safely do this using i16 shifts as we're only interested in
31960 // the 3 lower bits of each byte.
31961 Amt = DAG.getBitcast(ExtVT, Amt);
31962 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31963 Amt = DAG.getBitcast(VT, Amt);
31964
31965 // r = VSELECT(r, rot(r, 4), a);
31966 SDValue M;
31967 M = DAG.getNode(
31968 ISD::OR, DL, VT,
31969 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31970 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31971 R = SignBitSelect(VT, Amt, M, R);
31972
31973 // a += a
31974 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31975
31976 // r = VSELECT(r, rot(r, 2), a);
31977 M = DAG.getNode(
31978 ISD::OR, DL, VT,
31979 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31980 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31981 R = SignBitSelect(VT, Amt, M, R);
31982
31983 // a += a
31984 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31985
31986 // return VSELECT(r, rot(r, 1), a);
31987 M = DAG.getNode(
31988 ISD::OR, DL, VT,
31989 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31990 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31991 return SignBitSelect(VT, Amt, M, R);
31992 }
31993
31994 bool IsSplatAmt = DAG.isSplatValue(Amt);
31995 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31996 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31997 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31998
31999 // Fallback for splats + all supported variable shifts.
32000 // Fallback for non-constant amounts on AVX2 vXi16 as well.
32001 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32002 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32003 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32004 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32005 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32006 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32007 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32008 }
32009
32010 // Everything below assumes ISD::ROTL.
32011 if (!IsROTL) {
32012 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32013 IsROTL = true;
32014 }
32015
32016 // ISD::ROT* uses modulo rotate amounts.
32017 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32018
32019 assert(IsROTL && "Only ROTL supported");
32020
32021 // As with shifts, attempt to convert the rotation amount to a multiplication
32022 // factor; otherwise fall back to general expansion.
32023 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32024 if (!Scale)
32025 return SDValue();
32026
32027 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
32028 if (EltSizeInBits == 16) {
32029 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32030 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32031 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32032 }
32033
32034 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32035 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32036 // that can then be OR'd with the lower 32-bits.
32037 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32038 static const int OddMask[] = {1, -1, 3, -1};
32039 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32040 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32041
32042 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32043 DAG.getBitcast(MVT::v2i64, R),
32044 DAG.getBitcast(MVT::v2i64, Scale));
32045 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32046 DAG.getBitcast(MVT::v2i64, R13),
32047 DAG.getBitcast(MVT::v2i64, Scale13));
32048 Res02 = DAG.getBitcast(VT, Res02);
32049 Res13 = DAG.getBitcast(VT, Res13);
32050
32051 return DAG.getNode(ISD::OR, DL, VT,
32052 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32053 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32054}
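// Illustrative sketch: a minimal scalar model of the rotate-by-multiply trick the
// v4i32 tail above relies on (PMULUDQ yields the 64-bit product; its high half
// holds the wrapped-around bits). The helper name rotl32ByMul is hypothetical and
// it assumes the amount is taken modulo 32, as the AND with AmtMask guarantees.
#include <cstdint>

static uint32_t rotl32ByMul(uint32_t X, unsigned Amt) {
  uint64_t Product = (uint64_t)X << (Amt & 31);  // X * Scale with Scale = 1u << Amt
  uint32_t Lo = (uint32_t)Product;               // in-lane bits (low 32 bits of the product)
  uint32_t Hi = (uint32_t)(Product >> 32);       // wrapped bits (high 32 bits)
  return Lo | Hi;                                // OR the halves -> rotl(X, Amt)
}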
32055
32056/// Returns true if the operand type is exactly twice the native width, and
32057/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32058/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32059/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32060bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32061 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32062
32063 if (OpWidth == 64)
32064 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32065 if (OpWidth == 128)
32066 return Subtarget.canUseCMPXCHG16B();
32067
32068 return false;
32069}
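// Illustrative sketch: hedged examples of the atomic widths this predicate is
// concerned with; whether they actually expand to cmpxchg8b/cmpxchg16b also
// depends on the CX8/CX16 subtarget checks above. __uint128_t is a GCC/Clang
// extension type.
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> A64;     // twice the native width on a 32-bit target
std::atomic<__uint128_t> A128; // twice the native width on x86-64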
32070
32071TargetLoweringBase::AtomicExpansionKind
32072X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32073 Type *MemType = SI->getValueOperand()->getType();
32074
32075 bool NoImplicitFloatOps =
32076 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32077 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32078 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32079 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32080 return AtomicExpansionKind::None;
32081
32082 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32083 : AtomicExpansionKind::None;
32084}
32085
32086// Note: this turns large loads into lock cmpxchg8b/16b.
32087// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32088TargetLowering::AtomicExpansionKind
32089X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32090 Type *MemType = LI->getType();
32091
32092 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32093 // can use movq to do the load. If we have X87 we can load into an 80-bit
32094 // X87 register and store it to a stack temporary.
32095 bool NoImplicitFloatOps =
32096 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32097 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32098 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32099 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32100 return AtomicExpansionKind::None;
32101
32102 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32103 : AtomicExpansionKind::None;
32104}
32105
32106enum BitTestKind : unsigned {
32107 UndefBit,
32108 ConstantBit,
32109 NotConstantBit,
32110 ShiftBit,
32111 NotShiftBit
32112};
32113
32114static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32115 using namespace llvm::PatternMatch;
32116 BitTestKind BTK = UndefBit;
32117 auto *C = dyn_cast<ConstantInt>(V);
32118 if (C) {
32119 // Check if V is a power of 2 or NOT power of 2.
32120 if (isPowerOf2_64(C->getZExtValue()))
32121 BTK = ConstantBit;
32122 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32123 BTK = NotConstantBit;
32124 return {V, BTK};
32125 }
32126
32127 // Check if V is some power of 2 pattern known to be non-zero
32128 auto *I = dyn_cast<Instruction>(V);
32129 if (I) {
32130 bool Not = false;
32131 // Check if we have a NOT
32132 Value *PeekI;
32133 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32134 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32135 Not = true;
32136 I = dyn_cast<Instruction>(PeekI);
32137
32138 // If I is constant, it will fold and we can evaluate later. If it's an
32139 // argument or something of that nature, we can't analyze.
32140 if (I == nullptr)
32141 return {nullptr, UndefBit};
32142 }
32143 // We can only use 1 << X without more sophisticated analysis. C << X where
32144 // C is a power of 2 but not 1 can result in zero which cannot be translated
32145 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
32146 if (I->getOpcode() == Instruction::Shl) {
32147 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32148 // -X` and some other provable power of 2 patterns that we can use CTZ on
32149 // may be profitable.
32150 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32151 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32152 // be provably a non-zero power of 2.
32153 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32154 // transformable to bittest.
32155 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32156 if (!ShiftVal)
32157 return {nullptr, UndefBit};
32158 if (ShiftVal->equalsInt(1))
32159 BTK = Not ? NotShiftBit : ShiftBit;
32160
32161 if (BTK == UndefBit)
32162 return {nullptr, UndefBit};
32163
32164 Value *BitV = I->getOperand(1);
32165
32166 Value *AndOp;
32167 const APInt *AndC;
32168 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32169 // Read past a shift-mask instruction to find the count.
32170 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32171 BitV = AndOp;
32172 }
32173 return {BitV, BTK};
32174 }
32175 }
32176 return {nullptr, UndefBit};
32177}
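// Illustrative sketch: source-level shapes whose atomicrmw value operands fall
// into the BitTestKind buckets classified above. This is only a model of the
// patterns; the classification itself runs on the IR, not on C++ source.
#include <atomic>
#include <cstdint>

void bitPatternExamples(std::atomic<uint32_t> &Flags, unsigned N) {
  Flags.fetch_or(0x10u);              // ConstantBit: a power-of-two constant
  Flags.fetch_and(~0x10u);            // NotConstantBit: ~(power of two)
  Flags.fetch_or(1u << (N & 31));     // ShiftBit: 1 << X (the & 31 shift mask is read past)
  Flags.fetch_and(~(1u << (N & 31))); // NotShiftBit: ~(1 << X)
}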
32178
32179TargetLowering::AtomicExpansionKind
32180X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32181 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32182 // prefix to a normal instruction for these operations.
32183 if (AI->use_empty())
32184 return AtomicExpansionKind::None;
32185
32186 // If the atomicrmw's result is used by a single-bit AND, we may use a
32187 // bts/btr/btc instruction for these operations.
32188 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32189 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32190 // (depending on CC). This pattern can only use bts/btr/btc but we don't
32191 // detect it.
32192 Instruction *I = AI->user_back();
32193 auto BitChange = FindSingleBitChange(AI->getValOperand());
32194 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32195 I->getOpcode() != Instruction::And ||
32196 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32197 AI->getParent() != I->getParent())
32198 return AtomicExpansionKind::CmpXChg;
32199
32200 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32201
32202 // This is a redundant AND, it should get cleaned up elsewhere.
32203 if (AI == I->getOperand(OtherIdx))
32204 return AtomicExpansionKind::CmpXChg;
32205
32206 // The following instruction must be an AND with a single bit.
32207 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32208 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32209 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32210 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32211 return AtomicExpansionKind::CmpXChg;
32212 }
32213 if (AI->getOperation() == AtomicRMWInst::And) {
32214 return ~C1->getValue() == C2->getValue()
32215 ? AtomicExpansionKind::BitTestIntrinsic
32216 : AtomicExpansionKind::CmpXChg;
32217 }
32218 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32219 : AtomicExpansionKind::CmpXChg;
32220 }
32221
32222 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32223
32224 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32225 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32226 return AtomicExpansionKind::CmpXChg;
32227
32228 assert(BitChange.first != nullptr && BitTested.first != nullptr);
32229
32230 // If shift amounts are not the same we can't use BitTestIntrinsic.
32231 if (BitChange.first != BitTested.first)
32232 return AtomicExpansionKind::CmpXChg;
32233
32234 // For an atomic AND, the RMW value must mask off all but one bit and the user
32235 // must test the one bit that is unset in the mask.
32236 if (AI->getOperation() == AtomicRMWInst::And)
32237 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32238 ? AtomicExpansionKind::BitTestIntrinsic
32239 : AtomicExpansionKind::CmpXChg;
32240
32241 // For an atomic XOR/OR, the RMW value must set and the user must test the same bit.
32242 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32243 ? AtomicExpansionKind::BitTestIntrinsic
32244 : AtomicExpansionKind::CmpXChg;
32245}
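// Illustrative sketch: a source-level shape that satisfies the BitTestIntrinsic
// criteria above - the atomicrmw result has a single AND user testing the same
// single bit. A sketch of the shape only; the final instruction selection still
// depends on the rest of the pipeline.
#include <atomic>
#include <cstdint>

bool testAndSetBit(std::atomic<uint32_t> &Flags, unsigned N) {
  uint32_t Bit = 1u << (N & 31);
  return (Flags.fetch_or(Bit) & Bit) != 0;  // OR sets the bit, the AND tests the same bit
}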
32246
32247void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32248 IRBuilder<> Builder(AI);
32249 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32250 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32251 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32252 switch (AI->getOperation()) {
32253 default:
32254 llvm_unreachable("Unknown atomic operation");
32255 case AtomicRMWInst::Or:
32256 IID_C = Intrinsic::x86_atomic_bts;
32257 IID_I = Intrinsic::x86_atomic_bts_rm;
32258 break;
32259 case AtomicRMWInst::Xor:
32260 IID_C = Intrinsic::x86_atomic_btc;
32261 IID_I = Intrinsic::x86_atomic_btc_rm;
32262 break;
32263 case AtomicRMWInst::And:
32264 IID_C = Intrinsic::x86_atomic_btr;
32265 IID_I = Intrinsic::x86_atomic_btr_rm;
32266 break;
32267 }
32268 Instruction *I = AI->user_back();
32269 LLVMContext &Ctx = AI->getContext();
32270 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32271 Type::getInt8PtrTy(Ctx));
32272 Function *BitTest = nullptr;
32273 Value *Result = nullptr;
32274 auto BitTested = FindSingleBitChange(AI->getValOperand());
32275 assert(BitTested.first != nullptr);
32276
32277 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32278 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32279
32280 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32281
32282 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32283 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32284 } else {
32285 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32286
32287 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32288
32289 Value *SI = BitTested.first;
32290 assert(SI != nullptr);
32291
32292 // BT{S|R|C} on a memory operand doesn't take the bit position modulo the
32293 // operand width, so we need to mask it.
32294 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32295 Value *BitPos =
32296 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
32297 // Todo(1): In many cases it may be provable that SI is less than
32298 // ShiftBits, in which case this mask is unnecessary.
32299 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32300 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32301 // favor of just a raw BT{S|R|C}.
32302
32303 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32304 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32305
32306 // If the result is only used for zero/non-zero status then we don't need to
32307 // shift the value back. Otherwise do so.
32308 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32309 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32310 if (ICmp->isEquality()) {
32311 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32312 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32313 if (C0 || C1) {
32314 assert(C0 == nullptr || C1 == nullptr);
32315 if ((C0 ? C0 : C1)->isZero())
32316 continue;
32317 }
32318 }
32319 }
32320 Result = Builder.CreateShl(Result, BitPos);
32321 break;
32322 }
32323 }
32324
32325 I->replaceAllUsesWith(Result);
32326 I->eraseFromParent();
32327 AI->eraseFromParent();
32328}
32329
32330static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32331 using namespace llvm::PatternMatch;
32332 if (!AI->hasOneUse())
32333 return false;
32334
32335 Value *Op = AI->getOperand(1);
32336 ICmpInst::Predicate Pred;
32337 Instruction *I = AI->user_back();
32338 AtomicRMWInst::BinOp Opc = AI->getOperation();
32339 if (Opc == AtomicRMWInst::Add) {
32340 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32341 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32342 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32343 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32344 return Pred == CmpInst::ICMP_SLT;
32345 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32346 return Pred == CmpInst::ICMP_SGT;
32347 }
32348 return false;
32349 }
32350 if (Opc == AtomicRMWInst::Sub) {
32351 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32352 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32353 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32354 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32355 return Pred == CmpInst::ICMP_SLT;
32356 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32357 return Pred == CmpInst::ICMP_SGT;
32358 }
32359 return false;
32360 }
32361 if ((Opc == AtomicRMWInst::Or &&
32362 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32363 (Opc == AtomicRMWInst::And &&
32364 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32365 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32366 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32367 Pred == CmpInst::ICMP_SLT;
32368 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32369 return Pred == CmpInst::ICMP_SGT;
32370 return false;
32371 }
32372 if (Opc == AtomicRMWInst::Xor) {
32373 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32374 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32375 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32376 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32377 return Pred == CmpInst::ICMP_SLT;
32378 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32379 return Pred == CmpInst::ICMP_SGT;
32380 }
32381 return false;
32382 }
32383
32384 return false;
32385}
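// Illustrative sketch: a source-level shape that matches the criteria above for
// the Sub case - the only use of the atomicrmw result is a compare whose answer
// is available from the flags of a lock-prefixed sub. A sketch of the shape only.
#include <atomic>
#include <cstdint>

bool releaseRef(std::atomic<uint32_t> &RefCount) {
  // fetch_sub(1) == 1 <=> the decremented value is zero.
  return RefCount.fetch_sub(1) == 1;
}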
32386
32387void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32388 AtomicRMWInst *AI) const {
32389 IRBuilder<> Builder(AI);
32390 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32391 Instruction *TempI = nullptr;
32392 LLVMContext &Ctx = AI->getContext();
32393 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32394 if (!ICI) {
32395 TempI = AI->user_back();
32396 assert(TempI->hasOneUse() && "Must have one use");
32397 ICI = cast<ICmpInst>(TempI->user_back());
32398 }
32399 X86::CondCode CC = X86::COND_INVALID;
32400 ICmpInst::Predicate Pred = ICI->getPredicate();
32401 switch (Pred) {
32402 default:
32403 llvm_unreachable("Not supported Pred");
32404 case CmpInst::ICMP_EQ:
32405 CC = X86::COND_E;
32406 break;
32407 case CmpInst::ICMP_NE:
32408 CC = X86::COND_NE;
32409 break;
32410 case CmpInst::ICMP_SLT:
32411 CC = X86::COND_S;
32412 break;
32413 case CmpInst::ICMP_SGT:
32414 CC = X86::COND_NS;
32415 break;
32416 }
32417 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32418 switch (AI->getOperation()) {
32419 default:
32420 llvm_unreachable("Unknown atomic operation");
32421 case AtomicRMWInst::Add:
32422 IID = Intrinsic::x86_atomic_add_cc;
32423 break;
32424 case AtomicRMWInst::Sub:
32425 IID = Intrinsic::x86_atomic_sub_cc;
32426 break;
32427 case AtomicRMWInst::Or:
32428 IID = Intrinsic::x86_atomic_or_cc;
32429 break;
32430 case AtomicRMWInst::And:
32431 IID = Intrinsic::x86_atomic_and_cc;
32432 break;
32433 case AtomicRMWInst::Xor:
32434 IID = Intrinsic::x86_atomic_xor_cc;
32435 break;
32436 }
32437 Function *CmpArith =
32438 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32439 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32440 Type::getInt8PtrTy(Ctx));
32441 Value *Call = Builder.CreateCall(
32442 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32443 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32444 ICI->replaceAllUsesWith(Result);
32445 ICI->eraseFromParent();
32446 if (TempI)
32447 TempI->eraseFromParent();
32448 AI->eraseFromParent();
32449}
32450
32451TargetLowering::AtomicExpansionKind
32452X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32453 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32454 Type *MemType = AI->getType();
32455
32456 // If the operand is too big, we must see if cmpxchg8/16b is available
32457 // and default to library calls otherwise.
32458 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32459 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32460 : AtomicExpansionKind::None;
32461 }
32462
32463 AtomicRMWInst::BinOp Op = AI->getOperation();
32464 switch (Op) {
32465 case AtomicRMWInst::Xchg:
32466 return AtomicExpansionKind::None;
32467 case AtomicRMWInst::Add:
32468 case AtomicRMWInst::Sub:
32469 if (shouldExpandCmpArithRMWInIR(AI))
32470 return AtomicExpansionKind::CmpArithIntrinsic;
32471 // It's better to use xadd, xsub or xchg for these in other cases.
32472 return AtomicExpansionKind::None;
32473 case AtomicRMWInst::Or:
32474 case AtomicRMWInst::And:
32475 case AtomicRMWInst::Xor:
32476 if (shouldExpandCmpArithRMWInIR(AI))
32477 return AtomicExpansionKind::CmpArithIntrinsic;
32478 return shouldExpandLogicAtomicRMWInIR(AI);
32479 case AtomicRMWInst::Nand:
32480 case AtomicRMWInst::Max:
32481 case AtomicRMWInst::Min:
32482 case AtomicRMWInst::UMax:
32483 case AtomicRMWInst::UMin:
32484 case AtomicRMWInst::FAdd:
32485 case AtomicRMWInst::FSub:
32486 case AtomicRMWInst::FMax:
32487 case AtomicRMWInst::FMin:
32488 case AtomicRMWInst::UIncWrap:
32489 case AtomicRMWInst::UDecWrap:
32490 default:
32491 // These always require a non-trivial set of data operations on x86. We must
32492 // use a cmpxchg loop.
32493 return AtomicExpansionKind::CmpXChg;
32494 }
32495}
32496
32497LoadInst *
32498X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32499 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32500 Type *MemType = AI->getType();
32501 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32502 // there is no benefit in turning such RMWs into loads, and it is actually
32503 // harmful as it introduces an mfence.
32504 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32505 return nullptr;
32506
32507 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32508 // lowering available in lowerAtomicArith.
32509 // TODO: push more cases through this path.
32510 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32511 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32512 AI->use_empty())
32513 return nullptr;
32514
32515 IRBuilder<> Builder(AI);
32516 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32517 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32518 auto SSID = AI->getSyncScopeID();
32519 // We must restrict the ordering to avoid generating loads with Release or
32520 // ReleaseAcquire orderings.
32521 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32522
32523 // Before the load we need a fence. Here is an example lifted from
32524 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32525 // is required:
32526 // Thread 0:
32527 // x.store(1, relaxed);
32528 // r1 = y.fetch_add(0, release);
32529 // Thread 1:
32530 // y.fetch_add(42, acquire);
32531 // r2 = x.load(relaxed);
32532 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32533 // lowered to just a load without a fence. A mfence flushes the store buffer,
32534 // making the optimization clearly correct.
32535 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
32536 // clear otherwise; we might be able to be more aggressive on relaxed idempotent
32537 // rmw. In practice, they do not look useful, so we don't try to be
32538 // especially clever.
32539 if (SSID == SyncScope::SingleThread)
32540 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32541 // the IR level, so we must wrap it in an intrinsic.
32542 return nullptr;
32543
32544 if (!Subtarget.hasMFence())
32545 // FIXME: it might make sense to use a locked operation here but on a
32546 // different cache-line to prevent cache-line bouncing. In practice it
32547 // is probably a small win, and x86 processors without mfence are rare
32548 // enough that we do not bother.
32549 return nullptr;
32550
32551 Function *MFence =
32552 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32553 Builder.CreateCall(MFence, {});
32554
32555 // Finally we can emit the atomic load.
32556 LoadInst *Loaded = Builder.CreateAlignedLoad(
32557 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32558 Loaded->setAtomic(Order, SSID);
32559 AI->replaceAllUsesWith(Loaded);
32560 AI->eraseFromParent();
32561 return Loaded;
32562}
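// Illustrative sketch: the kind of idempotent atomicrmw this hook targets - the
// stored value is unchanged, so an mfence followed by a plain atomic load keeps
// the required ordering. A sketch of the shape only.
#include <atomic>
#include <cstdint>

uint32_t readWithRMWOrdering(std::atomic<uint32_t> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);  // idempotent rmw -> fence + load
}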
32563
32564bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32565 if (!SI.isUnordered())
32566 return false;
32567 return ExperimentalUnorderedISEL;
32568}
32569bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32570 if (!LI.isUnordered())
32571 return false;
32572 return ExperimentalUnorderedISEL;
32573}
32574
32575
32576/// Emit a locked operation on a stack location which does not change any
32577/// memory location, but does involve a lock prefix. Location is chosen to be
32578/// a) very likely accessed only by a single thread to minimize cache traffic,
32579/// and b) definitely dereferenceable. Returns the new Chain result.
32580static SDValue emitLockedStackOp(SelectionDAG &DAG,
32581 const X86Subtarget &Subtarget, SDValue Chain,
32582 const SDLoc &DL) {
32583 // Implementation notes:
32584 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32585 // operations issued by the current processor. As such, the location
32586 // referenced is not relevant for the ordering properties of the instruction.
32587 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32588 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32589 // 2) Using an immediate operand appears to be the best encoding choice
32590 // here since it doesn't require an extra register.
32591 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32592 // is small enough it might just be measurement noise.)
32593 // 4) When choosing offsets, there are several contributing factors:
32594 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32595 // line aligned stack object to improve this case.)
32596 // b) To minimize our chances of introducing a false dependence, we prefer
32597 // to offset the stack usage from TOS slightly.
32598 // c) To minimize concerns about cross thread stack usage - in particular,
32599 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32600 // captures state in the TOS frame and accesses it from many threads -
32601 // we want to use an offset such that the offset is in a distinct cache
32602 // line from the TOS frame.
32603 //
32604 // For a general discussion of the tradeoffs and benchmark results, see:
32605 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32606
32607 auto &MF = DAG.getMachineFunction();
32608 auto &TFL = *Subtarget.getFrameLowering();
32609 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32610
32611 if (Subtarget.is64Bit()) {
32612 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32613 SDValue Ops[] = {
32614 DAG.getRegister(X86::RSP, MVT::i64), // Base
32615 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32616 DAG.getRegister(0, MVT::i64), // Index
32617 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32618 DAG.getRegister(0, MVT::i16), // Segment.
32619 Zero,
32620 Chain};
32621 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32622 MVT::Other, Ops);
32623 return SDValue(Res, 1);
32624 }
32625
32626 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32627 SDValue Ops[] = {
32628 DAG.getRegister(X86::ESP, MVT::i32), // Base
32629 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32630 DAG.getRegister(0, MVT::i32), // Index
32631 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32632 DAG.getRegister(0, MVT::i16), // Segment.
32633 Zero,
32634 Chain
32635 };
32636 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32637 MVT::Other, Ops);
32638 return SDValue(Res, 1);
32639}
32640
32641static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32642 SelectionDAG &DAG) {
32643 SDLoc dl(Op);
32644 AtomicOrdering FenceOrdering =
32645 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32646 SyncScope::ID FenceSSID =
32647 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32648
32649 // The only fence that needs an instruction is a sequentially-consistent
32650 // cross-thread fence.
32651 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32652 FenceSSID == SyncScope::System) {
32653 if (Subtarget.hasMFence())
32654 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32655
32656 SDValue Chain = Op.getOperand(0);
32657 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32658 }
32659
32660 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32661 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32662}
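// Illustrative sketch: source-level fences and the branch of the lowering above
// each is expected to take; the exact instruction still depends on the subtarget.
#include <atomic>

void fenceExamples() {
  // Sequentially-consistent system fence: MFENCE, or the locked stack OR above
  // on subtargets without MFENCE.
  std::atomic_thread_fence(std::memory_order_seq_cst);
  // Acquire/release fences only constrain the compiler on x86 and take the
  // MEMBARRIER (no-op) path.
  std::atomic_thread_fence(std::memory_order_acquire);
}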
32663
32664static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32665 SelectionDAG &DAG) {
32666 MVT T = Op.getSimpleValueType();
32667 SDLoc DL(Op);
32668 unsigned Reg = 0;
32669 unsigned size = 0;
32670 switch(T.SimpleTy) {
32671 default: llvm_unreachable("Invalid value type!");
32672 case MVT::i8: Reg = X86::AL; size = 1; break;
32673 case MVT::i16: Reg = X86::AX; size = 2; break;
32674 case MVT::i32: Reg = X86::EAX; size = 4; break;
32675 case MVT::i64:
32676 assert(Subtarget.is64Bit() && "Node not type legal!");
32677 Reg = X86::RAX; size = 8;
32678 break;
32679 }
32680 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32681 Op.getOperand(2), SDValue());
32682 SDValue Ops[] = { cpIn.getValue(0),
32683 Op.getOperand(1),
32684 Op.getOperand(3),
32685 DAG.getTargetConstant(size, DL, MVT::i8),
32686 cpIn.getValue(1) };
32687 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32688 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32689 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32690 Ops, T, MMO);
32691
32692 SDValue cpOut =
32693 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32694 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32695 MVT::i32, cpOut.getValue(2));
32696 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32697
32698 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32699 cpOut, Success, EFLAGS.getValue(1));
32700}
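// Illustrative sketch: the kind of construct that reaches LowerCMP_SWAP - the
// expected value travels through the width's accumulator register (AL/AX/EAX/RAX)
// and success comes back as a flag. A sketch of the shape only.
#include <atomic>
#include <cstdint>

bool tryClaim(std::atomic<uint32_t> &Slot) {
  uint32_t Expected = 0;                              // copied into EAX for the i32 case
  return Slot.compare_exchange_strong(Expected, 1u);  // lock cmpxchg; ZF reports success
}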
32701
32702// Create MOVMSKB, taking into account whether we need to split for AVX1.
32703static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32704 const X86Subtarget &Subtarget) {
32705 MVT InVT = V.getSimpleValueType();
32706
32707 if (InVT == MVT::v64i8) {
32708 SDValue Lo, Hi;
32709 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32710 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32711 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32712 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32713 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32714 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32715 DAG.getConstant(32, DL, MVT::i8));
32716 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32717 }
32718 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32719 SDValue Lo, Hi;
32720 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32721 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32722 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32723 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32724 DAG.getConstant(16, DL, MVT::i8));
32725 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32726 }
32727
32728 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32729}
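// Worked example of the splitting above (illustration, assuming an AVX1-only
// target so v32i8 has no single MOVMSK): the low 16 bytes produce a 16-bit
// mask in bits [15:0] and the high 16 bytes produce a mask shifted into bits
// [31:16], so the OR yields the same i32 a native 256-bit MOVMSKB would give.
// The v64i8 case repeats this with a 32-bit shift into bits [63:32] of an i64.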
32730
32731static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32732 SelectionDAG &DAG) {
32733 SDValue Src = Op.getOperand(0);
32734 MVT SrcVT = Src.getSimpleValueType();
32735 MVT DstVT = Op.getSimpleValueType();
32736
32737 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32738 // half to v32i1 and concatenating the result.
32739 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32740 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32741 assert(Subtarget.hasBWI() && "Expected BWI target");
32742 SDLoc dl(Op);
32743 SDValue Lo, Hi;
32744 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32745 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32746 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32747 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32748 }
32749
32750 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32751 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32752 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32753 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32754 SDLoc DL(Op);
32755 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32756 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32757 return DAG.getZExtOrTrunc(V, DL, DstVT);
32758 }
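// Example of the MOVMSK path above (sketch): a v16i1 mask such as
// <1,0,1,1,...> is sign-extended to v16i8 lanes of 0xFF/0x00, PMOVMSKB then
// packs the sign bits into an i32 whose low 16 bits are the original mask
// bits, and the final zext/trunc produces the scalar integer destination.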
32759
32760 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32761 SrcVT == MVT::i64) && "Unexpected VT!");
32762
32763 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32764 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32765 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32766 // This conversion needs to be expanded.
32767 return SDValue();
32768
32769 SDLoc dl(Op);
32770 if (SrcVT.isVector()) {
32771 // Widen the input vector in the case of MVT::v2i32.
32772 // Example: from MVT::v2i32 to MVT::v4i32.
32773 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32774 SrcVT.getVectorNumElements() * 2);
32775 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32776 DAG.getUNDEF(SrcVT));
32777 } else {
32778 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32779 "Unexpected source type in LowerBITCAST");
32780 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32781 }
32782
32783 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32784 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32785
32786 if (DstVT == MVT::x86mmx)
32787 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32788
32789 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32790 DAG.getIntPtrConstant(0, dl));
32791}
32792
32793/// Compute the horizontal sum of bytes in V for the elements of VT.
32794///
32795/// Requires V to be a byte vector and VT to be an integer vector type with
32796/// wider elements than V's type. The width of the elements of VT determines
32797/// how many bytes of V are summed horizontally to produce each element of the
32798/// result.
32799static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32800 const X86Subtarget &Subtarget,
32801 SelectionDAG &DAG) {
32802 SDLoc DL(V);
32803 MVT ByteVecVT = V.getSimpleValueType();
32804 MVT EltVT = VT.getVectorElementType();
32805 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32806 "Expected value to have byte element type.");
32807 assert(EltVT != MVT::i8 &&
32808 "Horizontal byte sum only makes sense for wider elements!");
32809 unsigned VecSize = VT.getSizeInBits();
32810 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32811
32812 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32813 // i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
32814 if (EltVT == MVT::i64) {
32815 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32816 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32817 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32818 return DAG.getBitcast(VT, V);
32819 }
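// Example for the i64 case above: each byte of V already holds a 0-8 pop
// count, and PSADBW against zero sums the 8 bytes of every 64-bit lane, so
// each i64 lane ends up holding the pop count of the corresponding original
// i64 element.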
32820
32821 if (EltVT == MVT::i32) {
32822 // We unpack the low half and high half into i32s interleaved with zeros so
32823 // that we can use PSADBW to horizontally sum them. The most useful part of
32824 // this is that it lines up the results of two PSADBW instructions to be
32825 // two v2i64 vectors which concatenated are the 4 population counts. We can
32826 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32827 SDValue Zeros = DAG.getConstant(0, DL, VT);
32828 SDValue V32 = DAG.getBitcast(VT, V);
32829 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32830 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32831
32832 // Do the horizontal sums into two v2i64s.
32833 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32834 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32835 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32836 DAG.getBitcast(ByteVecVT, Low), Zeros);
32837 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32838 DAG.getBitcast(ByteVecVT, High), Zeros);
32839
32840 // Merge them together.
32841 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32842 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32843 DAG.getBitcast(ShortVecVT, Low),
32844 DAG.getBitcast(ShortVecVT, High));
32845
32846 return DAG.getBitcast(VT, V);
32847 }
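// Data-flow sketch for the i32 case above, for a v4i32 input {e0,e1,e2,e3}
// whose bytes already hold per-byte pop counts: Low = {e0,0,e1,0} and
// High = {e2,0,e3,0}; PSADBW turns them into v2i64 {popcnt(e0),popcnt(e1)} and
// {popcnt(e2),popcnt(e3)}; PACKUS of the two (viewed as i16s) then packs the
// four small sums back into a single v4i32 {popcnt(e0)..popcnt(e3)}.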
32848
32849 // The only element type left is i16.
32850 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32851
32852 // To obtain the pop count for each i16 element, start from the pop counts of
32853 // the i8 elements: shift the i16s left by 8, sum as i8s, and then shift the
32854 // i16s right by 8. It is important to shift as i16s because an i8 vector
32855 // shift isn't directly supported.
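// Worked example (one i16 element whose low byte holds pop count pL and high
// byte holds pH): shifting left by 8 gives {low: 0, high: pL}; the byte-wise
// add gives {low: pL, high: pH + pL}; the final i16 shift right by 8 leaves
// pH + pL, the pop count of the whole 16-bit element. No byte-wise carries
// can occur because each count is at most 8.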
32856 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32857 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32858 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32859 DAG.getBitcast(ByteVecVT, V));
32860 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32861}
32862
32863static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32864 const X86Subtarget &Subtarget,
32865 SelectionDAG &DAG) {
32866 MVT VT = Op.getSimpleValueType();
32867 MVT EltVT = VT.getVectorElementType();
32868 int NumElts = VT.getVectorNumElements();
32869 (void)EltVT;
32870 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32871
32872 // Implement a lookup table in register by using an algorithm based on:
32873 // http://wm.ite.pl/articles/sse-popcount.html
32874 //
32875 // The general idea is that every nibble of each byte in the input vector is
32876 // an index into an in-register, pre-computed pop count table. We split the
32877 // input vector into two new ones: (1) a vector with only the shifted-right
32878 // higher nibbles of each byte and (2) a vector with the lower nibbles (the
32879 // higher ones masked out) of each byte. PSHUFB is used separately with both
32880 // to index the in-register table. Next, both are added and the result is an
32881 // i8 vector where each element contains the pop count for its input byte.
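// For instance, for input byte 0xE5 (binary 1110'0101): the high nibble 0xE
// indexes LUT[14] = 3 and the low nibble 0x5 indexes LUT[5] = 2, so the two
// PSHUFB results add up to 5, which is indeed popcount(0xE5).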
32882 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32883 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32884 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32885 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32886
32887 SmallVector<SDValue, 64> LUTVec;
32888 for (int i = 0; i < NumElts; ++i)
32889 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32890 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32891 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32892
32893 // High nibbles
32894 SDValue FourV = DAG.getConstant(4, DL, VT);
32895 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32896
32897 // Low nibbles
32898 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32899
32900 // The input vector is used as the shuffle mask that indexes elements into the
32901 // LUT. After counting the low and high nibbles, add the two vectors to obtain
32902 // the final pop count per i8 element.
32903 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32904 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32905 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32906}
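// A minimal scalar sketch of the same nibble-LUT technique, for reference only
// (this helper is illustrative and not part of the lowering; it assumes
// nothing beyond standard C++):
[[maybe_unused]] static unsigned popcount8ViaNibbleLUT(unsigned char Byte) {
  static const unsigned char LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4};
  // Sum of the pre-computed counts of the high and low nibbles.
  return LUT[Byte >> 4] + LUT[Byte & 0x0F];
}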
32907
32908// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32909// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32910static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32911 SelectionDAG &DAG) {
32912 MVT VT = Op.getSimpleValueType();
32913 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32914 "Unknown CTPOP type to handle");
32915 SDLoc DL(Op.getNode());
32916 SDValue Op0 = Op.getOperand(0);
32917
32918 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32919 if (Subtarget.hasVPOPCNTDQ()) {
32920 unsigned NumElems = VT.getVectorNumElements();
32921 assert((VT.getVectorElementType() == MVT::i8 ||
32922 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32923 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32924 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32925 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32926 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32927 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32928 }
32929 }
32930
32931 // Decompose 256-bit ops into smaller 128-bit ops.
32932 if (VT.is256BitVector() && !Subtarget.hasInt256())
32933 return splitVectorIntUnary(Op, DAG);
32934
32935 // Decompose 512-bit ops into smaller 256-bit ops.
32936 if (VT.is512BitVector() && !Subtarget.hasBWI())
32937 return splitVectorIntUnary(Op, DAG);
32938
32939 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32940 if (VT.getScalarType() != MVT::i8) {
32941 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32942 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32943 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32944 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32945 }
32946
32947 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32948 if (!Subtarget.hasSSSE3())
32949 return SDValue();
32950
32951 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32952}
32953
32954static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32955 SelectionDAG &DAG) {
32956 assert(Op.getSimpleValueType().isVector() &&
32957 "We only do custom lowering for vector population count.");
32958 return LowerVectorCTPOP(Op, Subtarget, DAG);
32959}
32960
32961static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32962 MVT VT = Op.getSimpleValueType();
32963 SDValue In = Op.getOperand(0);
32964 SDLoc DL(Op);
32965
32966 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32967 // perform the BITREVERSE.
32968 if (!VT.isVector()) {
32969 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32970 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32971 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32972 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32973 DAG.getIntPtrConstant(0, DL));
32974 }
32975
32976 int NumElts = VT.getVectorNumElements();
32977 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32978
32979 // Decompose 256-bit ops into smaller 128-bit ops.
32980 if (VT.is256BitVector())
32981 return splitVectorIntUnary(Op, DAG);
32982
32983 assert(VT.is128BitVector() &&
32984 "Only 128-bit vector bitreverse lowering supported.");
32985
32986 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32987 // perform the BSWAP in the shuffle.
32988 // It's best to shuffle using the second operand, as this implicitly allows
32989 // memory folding for multiple vectors.
32990 SmallVector<SDValue, 16> MaskElts;
32991 for (int i = 0; i != NumElts; ++i) {
32992 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32993 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32994 int PermuteByte = SourceByte | (2 << 5);
32995 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32996 }
32997 }
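// Example of the mask built above (assuming a v4i32 input): for element 0 the
// pushed control bytes are 0x53, 0x52, 0x51, 0x50, i.e. the second operand's
// bytes 19..16 in reverse order with the 0x40 (2 << 5) "reverse bits" op set,
// so VPPERM both byte-swaps each element and bit-reverses every byte.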
32998
32999 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33000 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33001 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33002 Res, Mask);
33003 return DAG.getBitcast(VT, Res);
33004}
33005
33006static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33007 SelectionDAG &DAG) {
33008 MVT VT = Op.getSimpleValueType();
33009
33010 if (Subtarget.hasXOP() && !VT.is512BitVector())
33011 return LowerBITREVERSE_XOP(Op, DAG);
33012
33013 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33014
33015 SDValue In = Op.getOperand(0);
33016 SDLoc DL(Op);
33017
33018 assert(VT.getScalarType() == MVT::i8 &&
33019 "Only byte vector BITREVERSE supported");
33020
33021 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33022 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33023 return splitVectorIntUnary(Op, DAG);
33024
33025 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33026 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33027 return splitVectorIntUnary(Op, DAG);
33028
33029 unsigned NumElts = VT.getVectorNumElements();
33030
33031 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33032 if (Subtarget.hasGFNI()) {
33033 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33034 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33035 Matrix = DAG.getBitcast(VT, Matrix);
33036 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33037 DAG.getTargetConstant(0, DL, MVT::i8));
33038 }
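// The 0x8040201008040201 constant above is the standard bit-reversal matrix
// for GF2P8AFFINEQB: with immediate 0 the GF(2) affine transform maps every
// byte to its bit-reversed value, e.g. 0x01 -> 0x80 and 0xB1 -> 0x8D.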
33039
33040 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
33041 // nibbles, and a PSHUFB lookup finds the bit-reverse of each 0-15 value
33042 // (moved to the other nibble).
33043 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33044 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33045 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33046
33047 const int LoLUT[16] = {
33048 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33049 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33050 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33051 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33052 const int HiLUT[16] = {
33053 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33054 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33055 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33056 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
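// Worked example: for input byte 0xB1 the low nibble 0x1 looks up
// LoLUT[1] = 0x80 (its reverse placed in the high nibble) and the high nibble
// 0xB looks up HiLUT[11] = 0x0D (its reverse placed in the low nibble); the
// final OR gives 0x8D, the bit-reverse of 0xB1.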
33057
33058 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33059 for (unsigned i = 0; i < NumElts; ++i) {
33060 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33061 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33062 }
33063
33064 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33065 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33066 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33067 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33068 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33069}
33070
33071static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33072 SelectionDAG &DAG) {
33073 SDLoc DL(Op);
33074 SDValue X = Op.getOperand(0);
33075 MVT VT = Op.getSimpleValueType();
33076
33077 // Special case: if the input fits in 8 bits, we can use a single 8-bit TEST.
33078 if (VT == MVT::i8 ||
33079 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33080 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33081 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33082 DAG.getConstant(0, DL, MVT::i8));
33083 // Copy the inverse of the parity flag into a register with setcc.
33084 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33085 // Extend to the original type.
33086 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33087 }
33088
33089 // If we have POPCNT, use the default expansion.
33090 if (Subtarget.hasPOPCNT())
33091 return SDValue();
33092
33093 if (VT == MVT::i64) {
33094 // Xor the high and low 32 bits together using a 32-bit operation.
33095 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33096 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33097 DAG.getConstant(32, DL, MVT::i8)));
33098 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33099 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33100 }
33101
33102 if (VT != MVT::i16) {
33103 // Xor the high and low 16-bits together using a 32-bit operation.
33104 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33105 DAG.getConstant(16, DL, MVT::i8));
33106 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33107 } else {
33108 // If the input is 16-bits, we need to extend to use an i32 shift below.
33109 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33110 }
33111
33112 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
33113 // This should allow an h-reg to be used to save a shift.
33114 SDValue Hi = DAG.getNode(
33115 ISD::TRUNCATE, DL, MVT::i8,
33116 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33117 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33118 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33119 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33120
33121 // Copy the inverse of the parity flag into a register with setcc.
33122 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33123 // Extend to the original type.
33124 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33125}
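// Scalar sketch of the same parity folding, for reference only (illustrative
// helper, not part of the lowering; the lowering stops at the 8-bit xor and
// reads the inverted parity flag with SETNP instead of folding further):
[[maybe_unused]] static unsigned parity32ByFolding(unsigned X) {
  X ^= X >> 16;                                 // fold 32 bits into 16
  X ^= X >> 8;                                  // fold 16 bits into the low byte
  unsigned char B = static_cast<unsigned char>(X);
  B = static_cast<unsigned char>(B ^ (B >> 4)); // keep folding in plain C++
  B = static_cast<unsigned char>(B ^ (B >> 2));
  B = static_cast<unsigned char>(B ^ (B >> 1));
  return B & 1u;                                // 1 if the original X had odd parity
}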
33126
33127static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33128 const X86Subtarget &Subtarget) {
33129 unsigned NewOpc = 0;
33130 switch (N->getOpcode()) {
33131 case ISD::ATOMIC_LOAD_ADD:
33132 NewOpc = X86ISD::LADD;
33133 break;
33134 case ISD::ATOMIC_LOAD_SUB:
33135 NewOpc = X86ISD::LSUB;
33136 break;
33137 case ISD::ATOMIC_LOAD_OR:
33138 NewOpc = X86ISD::LOR;
33139 break;
33140 case ISD::ATOMIC_LOAD_XOR:
33141 NewOpc = X86ISD::LXOR;
33142 break;
33143 case ISD::ATOMIC_LOAD_AND:
33144 NewOpc = X86ISD::LAND;
33145 break;
33146 default:
33147 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33148 }
33149
33150 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33151
33152 return DAG.getMemIntrinsicNode(
33153 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33154 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33155 /*MemVT=*/N->getSimpleValueType(0), MMO);
33156}
33157
33158/// Lower atomic_load_ops into LOCK-prefixed operations.
33159static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33160 const X86Subtarget &Subtarget) {
33161 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33162 SDValue Chain = N->getOperand(0);
33163 SDValue LHS = N->getOperand(1);
33164 SDValue RHS = N->getOperand(2);
33165 unsigned Opc = N->getOpcode();
33166 MVT VT = N->getSimpleValueType(0);
33167 SDLoc DL(N);
33168
33169 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33170 // can only be lowered when the result is unused. They should have already
33171 // been transformed into a cmpxchg loop in AtomicExpand.
33172 if (N->hasAnyUseOfValue(0)) {
33173 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33174 // select LXADD if LOCK_SUB can't be selected.
33175 if (Opc == ISD::ATOMIC_LOAD_SUB) {
33176 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33177 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
33178 RHS, AN->getMemOperand());
33179 }
33180 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33181 "Used AtomicRMW ops other than Add should have been expanded!");
33182 return N;
33183 }
33184
33185 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33186 // The core idea here is that since the memory location isn't actually
33187 // changing, all we need is a lowering for the *ordering* impacts of the
33188 // atomicrmw. As such, we can choose a different operation and memory
33189 // location to minimize the impact on other code.
33190 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33191 // On X86, the only ordering which actually requires an instruction is
33192 // seq_cst that isn't SingleThread; everything else just needs to be
33193 // preserved during codegen and then dropped. Note that we expect (but don't
33194 // assume) that orderings other than seq_cst and acq_rel have been
33195 // canonicalized to a store or load.
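// For example (sketch): "atomicrmw or ptr %p, i32 0 seq_cst" at system scope
// is expected to become a locked no-op against a stack slot (roughly
// "lock orl $0, <offset>(%rsp)"), never touching %p, while the same idiom with
// a weaker ordering collapses to a plain compiler barrier.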
33196 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33197 AN->getSyncScopeID() == SyncScope::System) {
33198 // Prefer a locked operation against a stack location to minimize cache
33199 // traffic. This assumes that stack locations are very likely to be
33200 // accessed only by the owning thread.
33201 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33202 assert(!N->hasAnyUseOfValue(0));
33203 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33204 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33205 DAG.getUNDEF(VT), NewChain);
33206 }
33207 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33208 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33209 assert(!N->hasAnyUseOfValue(0));
33210 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33211 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33212 DAG.getUNDEF(VT), NewChain);
33213 }
33214
33215 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33216 // RAUW the chain, but don't worry about the result, as it's unused.
33217 assert(!N->hasAnyUseOfValue(0));
33218 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33219 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33220 DAG.getUNDEF(VT), LockOp.getValue(1));
33221}
33222
33223static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33224 const X86Subtarget &Subtarget) {
33225 auto *Node = cast<AtomicSDNode>(Op.getNode());
33226 SDLoc dl(Node);
33227 EVT VT = Node->getMemoryVT();
33228
33229 bool IsSeqCst =
33230 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33231 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33232
33233 // If this store is not sequentially consistent and the type is legal
33234 // we can just keep it.
33235 if (!IsSeqCst && IsTypeLegal)
33236 return Op;
33237
33238 if (VT == MVT::i64 && !IsTypeLegal) {
33239 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33240 // is enabled.
33241 bool NoImplicitFloatOps =
33242 DAG.getMachineFunction().getFunction().hasFnAttribute(
33243 Attribute::NoImplicitFloat);
33244 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33245 SDValue Chain;
33246 if (Subtarget.hasSSE1()) {
33247 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33248 Node->getOperand(2));
33249 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33250 SclToVec = DAG.getBitcast(StVT, SclToVec);
33251 SDVTList Tys = DAG.getVTList(MVT::Other);
33252 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33253 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33254 MVT::i64, Node->getMemOperand());
33255 } else if (Subtarget.hasX87()) {
33256 // First load this into an 80-bit X87 register using a stack temporary.
33257 // This will put the whole integer into the significand.
33258 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33259 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33260 MachinePointerInfo MPI =
33261 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33262 Chain =
33263 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33264 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33265 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33266 SDValue LdOps[] = {Chain, StackPtr};
33267 SDValue Value = DAG.getMemIntrinsicNode(
33268 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33269 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33270 Chain = Value.getValue(1);
33271
33272 // Now use an FIST to do the atomic store.
33273 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33274 Chain =
33275 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33276 StoreOps, MVT::i64, Node->getMemOperand());
33277 }
33278
33279 if (Chain) {
33280 // If this is a sequentially consistent store, also emit an appropriate
33281 // barrier.
33282 if (IsSeqCst)
33283 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33284
33285 return Chain;
33286 }
33287 }
33288 }
33289
33290 // Convert seq_cst store -> xchg
33291 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33292 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
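// For example (sketch): on x86-64 a type-legal "store atomic i64 ... seq_cst"
// reaches this point and becomes an XCHG with a memory operand, whose implicit
// LOCK semantics provide the required full barrier, so no separate MFENCE is
// emitted.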
33293 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33294 Node->getMemoryVT(),
33295 Node->getOperand(0),
33296 Node->getOperand(1), Node->getOperand(2),
33297 Node->getMemOperand());
33298 return Swap.getValue(1);
33299}
33300
33301static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
33302 SDNode *N = Op.getNode();
33303 MVT VT = N->getSimpleValueType(0);
33304 unsigned Opc = Op.getOpcode();
33305
33306 // Let legalize expand this if it isn't a legal type yet.
33307 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33308 return SDValue();
33309
33310 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33311 SDLoc DL(N);
33312
33313 // Set the carry flag.
33314 SDValue Carry = Op.getOperand(2);
33315 EVT CarryVT = Carry.getValueType();
33316 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33317 Carry, DAG.getAllOnesConstant(DL, CarryVT));
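// Example of the trick above: if the incoming carry is 1, adding all-ones
// (e.g. 1 + 0xFF for an i8 carry) wraps to 0 and sets CF; if it is 0, no wrap
// occurs and CF stays clear, so CF now mirrors the boolean carry input.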
33318
33319 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
33320 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33321 Op.getOperand(0), Op.getOperand(1),
33322 Carry.getValue(1));
33323
33324 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33325 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33326 Sum.getValue(1), DL, DAG);
33327 if (N->getValueType(1) == MVT::i1)
33328 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33329
33330 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33331}
33332
33333static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33334 SelectionDAG &DAG) {
33335 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33336
33337 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33338 // which returns the values as { float, float } (in XMM0) or
33339 // { double, double } (which is returned in XMM0, XMM1).
33340 SDLoc dl(Op);
33341 SDValue Arg = Op.getOperand(0);
33342 EVT ArgVT = Arg.getValueType();
33343 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33344
33345 TargetLowering::ArgListTy Args;
33346 TargetLowering::ArgListEntry Entry;
33347
33348 Entry.Node = Arg;
33349 Entry.Ty = ArgTy;
33350 Entry.IsSExt = false;
33351 Entry.IsZExt = false;
33352 Args.push_back(Entry);
33353
33354 bool isF64 = ArgVT == MVT::f64;
33355 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33356 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33357 // the results are returned via SRet in memory.
33358 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33359 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33360 const char *LibcallName = TLI.getLibcallName(LC);
33361 SDValue Callee =
33362 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33363
33364 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33365 : (Type *)FixedVectorType::get(ArgTy, 4);
33366
33367 TargetLowering::CallLoweringInfo CLI(DAG);
33368 CLI.setDebugLoc(dl)
33369 .setChain(DAG.getEntryNode())
33370 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33371
33372 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33373
33374 if (isF64)
33375 // Returned in xmm0 and xmm1.
33376 return CallResult.first;
33377
33378 // Returned in bits 0:31 and 32:64 xmm0.
33379 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33380 CallResult.first, DAG.getIntPtrConstant(0, dl));
33381 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33382 CallResult.first, DAG.getIntPtrConstant(1, dl));
33383 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33384 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33385}
33386
33387/// Widen a vector input to a vector of NVT. The
33388/// input vector must have the same element type as NVT.
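/// For example, widening a v4i32 input to v16i32 places the input in the low
/// four lanes and fills the remaining lanes with undef (or zeroes when
/// FillWithZeroes is requested).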
33389static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33390 bool FillWithZeroes = false) {
33391 // Check if InOp already has the right width.
33392 MVT InVT = InOp.getSimpleValueType();
33393 if (InVT == NVT)
33394 return InOp;
33395
33396 if (InOp.isUndef())
33397 return DAG.getUNDEF(NVT);
33398
33399 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33400 "input and widen element type must match");
33401
33402 unsigned InNumElts = InVT.getVectorNumElements();
33403 unsigned WidenNumElts = NVT.getVectorNumElements();
33404 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33405 "Unexpected request for vector widening");
33406
33407 SDLoc dl(InOp);
33408 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33409 InOp.getNumOperands() == 2) {
33410 SDValue N1 = InOp.getOperand(1);
33411 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33412 N1.isUndef()) {
33413 InOp = InOp.getOperand(0);
33414 InVT = InOp.getSimpleValueType();
33415 InNumElts = InVT.getVectorNumElements();
33416 }
33417 }
33418 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33419 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33420 SmallVector<SDValue, 16> Ops;
33421 for (unsigned i = 0; i < InNumElts; ++i)
33422 Ops.push_back(InOp.getOperand(i));
33423
33424 EVT EltVT = InOp.getOperand(0).getValueType();
33425
33426 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33427 DAG.getUNDEF(EltVT);
33428 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33429 Ops.push_back(FillVal);
33430 return DAG.getBuildVector(NVT, dl, Ops);
33431 }
33432 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33433 DAG.getUNDEF(NVT);
33434 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33435 InOp, DAG.getIntPtrConstant(0, dl));
33436}
33437
33438static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33439 SelectionDAG &DAG) {
33440 assert(Subtarget.hasAVX512() &&
33441 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33442
33443 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33444 SDValue Src = N->getValue();
33445 MVT VT = Src.getSimpleValueType();
33446 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33447 SDLoc dl(Op);
33448
33449 SDValue Scale = N->getScale();
33450 SDValue Index = N->getIndex();
33451 SDValue Mask = N->getMask();
33452 SDValue Chain = N->getChain();
33453 SDValue BasePtr = N->getBasePtr();
33454
33455 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33456 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33457 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33458 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33460 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33461 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33462 SDVTList VTs = DAG.getVTList(MVT::Other);
33463 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33464 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33465 N->getMemoryVT(), N->getMemOperand());
33466 }
33467 return SDValue();
33468 }
33469
33470 MVT IndexVT = Index.getSimpleValueType();
33471
33472 // If the index is v2i32, we're being called by type legalization and we
33473 // should just let the default handling take care of it.
33474 if (IndexVT == MVT::v2i32)
33475 return SDValue();
33476
33477 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33478 // we need to widen until one is.
33479 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33480 !Index.getSimpleValueType().is512BitVector()) {
33481 // Determine how much we need to widen by to get a 512-bit type.
33482 unsigned Factor = std::min(512/VT.getSizeInBits(),
33483 512/IndexVT.getSizeInBits());
33484 unsigned NumElts = VT.getVectorNumElements() * Factor;
33485
33486 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33487 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33488 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33489
33490 Src = ExtendToType(Src, VT, DAG);
33491 Index = ExtendToType(Index, IndexVT, DAG);
33492 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33493 }
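// Example of the widening above (assumed AVX-512F target without VLX): a
// v8f32 scatter with a v8i32 index computes Factor = 2 and is widened to
// v16f32 data, a v16i32 index and a v16i1 mask whose upper half is zero, so
// only the original 8 lanes are ever stored.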
33494
33495 SDVTList VTs = DAG.getVTList(MVT::Other);
33496 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33497 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33498 N->getMemoryVT(), N->getMemOperand());
33499}
33500
33501static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33502 SelectionDAG &DAG) {
33503
33504 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33505 MVT VT = Op.getSimpleValueType();
33506 MVT ScalarVT = VT.getScalarType();
33507 SDValue Mask = N->getMask();
33508 MVT MaskVT = Mask.getSimpleValueType();
33509 SDValue PassThru = N->getPassThru();
33510 SDLoc dl(Op);
33511
33512 // Handle AVX masked loads which don't support passthru other than 0.
33513 if (MaskVT.getVectorElementType() != MVT::i1) {
33514 // We also allow undef in the isel pattern.
33515 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33516 return Op;
33517
33518 SDValue NewLoad = DAG.getMaskedLoad(
33519 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33520 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33521 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33522 N->isExpandingLoad());
33523 // Emit a blend.
33524 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33525 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33526 }
33527
33528 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33529 "Expanding masked load is supported on AVX-512 target only!");
33530
33531 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33532 "Expanding masked load is supported for 32 and 64-bit types only!");
33533
33534 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33535 "Cannot lower masked load op.");
33536
33537 assert((ScalarVT.getSizeInBits() >= 32 ||
33538 (Subtarget.hasBWI() &&
33539 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33540 "Unsupported masked load op.");
33541
33542 // This operation is legal for targets with VLX; without VLX the vector
33543 // should be widened to 512 bits.
33544 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33545 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33546 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33547
33548 // Mask element has to be i1.
33549 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33550 "Unexpected mask type");
33551
33552 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33553
33554 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33555 SDValue NewLoad = DAG.getMaskedLoad(
33556 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33557 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33558 N->getExtensionType(), N->isExpandingLoad());
33559
33560 SDValue Extract =
33561 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33562 DAG.getIntPtrConstant(0, dl));
33563 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33564 return DAG.getMergeValues(RetOps, dl);
33565}
33566
33567static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33568 SelectionDAG &DAG) {
33569 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33570 SDValue DataToStore = N->getValue();
33571 MVT VT = DataToStore.getSimpleValueType();
33572 MVT ScalarVT = VT.getScalarType();
33573 SDValue Mask = N->getMask();
33574 SDLoc dl(Op);
33575
33576 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33577 "Compressing masked store is supported on AVX-512 target only!");
33578
33579 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33580 "Compressing masked store is supported for 32 and 64-bit types only!");
33581
33582 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33583 "Cannot lower masked store op.");
33584
33585 assert((ScalarVT.getSizeInBits() >= 32 ||
33586 (Subtarget.hasBWI() &&
33587 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33588 "Unsupported masked store op.");
33589
33590 // This operation is legal for targets with VLX, but without
33591 // VLX the vector should be widened to 512 bits.
33592 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33593 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33594
33595 // Mask element has to be i1.
33596 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33597 "Unexpected mask type");
33598
33599 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33600
33601 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33602 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33603 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33604 N->getOffset(), Mask, N->getMemoryVT(),
33605 N->getMemOperand(), N->getAddressingMode(),
33606 N->isTruncatingStore(), N->isCompressingStore());
33607}
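
The widening above computes how many lanes a 512-bit vector holds for the store's scalar type. A minimal standalone sketch of that arithmetic (hypothetical helper names, not the LLVM API):

#include <cassert>
#include <cstdio>

// Mirrors NumEltsInWideVec = 512 / VT.getScalarSizeInBits().
unsigned wideElementCount(unsigned ScalarSizeInBits) {
  assert(ScalarSizeInBits != 0 && 512 % ScalarSizeInBits == 0);
  return 512 / ScalarSizeInBits;
}

int main() {
  std::printf("%u\n", wideElementCount(32)); // 16 lanes: v16i32 / v16f32
  std::printf("%u\n", wideElementCount(64)); // 8 lanes:  v8i64  / v8f64
}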
33608
33609static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33610 SelectionDAG &DAG) {
33611 assert(Subtarget.hasAVX2() &&
33612 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33613
33614 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33615 SDLoc dl(Op);
33616 MVT VT = Op.getSimpleValueType();
33617 SDValue Index = N->getIndex();
33618 SDValue Mask = N->getMask();
33619 SDValue PassThru = N->getPassThru();
33620 MVT IndexVT = Index.getSimpleValueType();
33621
33622 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33623
33624 // If the index is v2i32, we're being called by type legalization.
33625 if (IndexVT == MVT::v2i32)
33626 return SDValue();
33627
33628 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33629 // we need to widen until one is.
33630 MVT OrigVT = VT;
33631 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33632 !IndexVT.is512BitVector()) {
33633 // Determine how much we need to widen by to get a 512-bit type.
33634 unsigned Factor = std::min(512/VT.getSizeInBits(),
33635 512/IndexVT.getSizeInBits());
33636
33637 unsigned NumElts = VT.getVectorNumElements() * Factor;
33638
33639 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33640 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33641 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33642
33643 PassThru = ExtendToType(PassThru, VT, DAG);
33644 Index = ExtendToType(Index, IndexVT, DAG);
33645 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33646 }
33647
33648 // Break dependency on the data register.
33649 if (PassThru.isUndef())
33650 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33651
33652 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33653 N->getScale() };
33654 SDValue NewGather = DAG.getMemIntrinsicNode(
33655 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33656 N->getMemOperand());
33657 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33658 NewGather, DAG.getIntPtrConstant(0, dl));
33659 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33660}
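
The factor computed above is the smaller of the two ratios 512/VT and 512/IndexVT, so whichever of the data or index vector reaches 512 bits first stops the widening. A small sketch of that computation (hypothetical helper, not the LLVM API):

#include <algorithm>
#include <cstdio>

// Mirrors: Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits());
unsigned widenFactor(unsigned DataBits, unsigned IndexBits) {
  return std::min(512 / DataBits, 512 / IndexBits);
}

int main() {
  // 256-bit data with 512-bit indices: factor 1 (index is already 512 bits).
  // 128-bit data with 128-bit indices: factor 4 (both widen to 512 bits).
  std::printf("%u %u\n", widenFactor(256, 512), widenFactor(128, 128));
}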
33661
33662static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33663 SDLoc dl(Op);
33664 SDValue Src = Op.getOperand(0);
33665 MVT DstVT = Op.getSimpleValueType();
33666
33667 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33668 unsigned SrcAS = N->getSrcAddressSpace();
33669
33670 assert(SrcAS != N->getDestAddressSpace() &&
33671 "addrspacecast must be between different address spaces");
33672
33673 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33674 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33675 } else if (DstVT == MVT::i64) {
33676 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33677 } else if (DstVT == MVT::i32) {
33678 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33679 } else {
33680 report_fatal_error("Bad address space in addrspacecast");
33681 }
33682 return Op;
33683}
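
The three branches above map the x86 pointer address spaces onto zero-extension, sign-extension, or truncation. A scalar illustration of the same case split; the enum values and helper names below are illustrative assumptions, not the LLVM definitions:

#include <cstdint>

// Assumption: these match X86AS::PTR32_SPTR / PTR32_UPTR / PTR64 in LLVM.
enum AddrSpace : unsigned { PTR32_SPTR = 270, PTR32_UPTR = 271, PTR64 = 272 };

uint64_t castPtrTo64(uint32_t Ptr32, unsigned SrcAS) {
  if (SrcAS == PTR32_UPTR)
    return static_cast<uint64_t>(Ptr32);                 // zero-extend
  return static_cast<uint64_t>(
      static_cast<int64_t>(static_cast<int32_t>(Ptr32))); // sign-extend
}

uint32_t castPtrTo32(uint64_t Ptr64) {
  return static_cast<uint32_t>(Ptr64);                   // truncate
}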
33684
33685SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33686 SelectionDAG &DAG) const {
33687 // TODO: Eventually, the lowering of these nodes should be informed by or
33688 // deferred to the GC strategy for the function in which they appear. For
33689 // now, however, they must be lowered to something. Since they are logically
33690 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33691 // require special handling for these nodes), lower them as literal NOOPs for
33692 // the time being.
33693 SmallVector<SDValue, 2> Ops;
33694 Ops.push_back(Op.getOperand(0));
33695 if (Op->getGluedNode())
33696 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33697
33698 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33699 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33700}
33701
33702// Custom split CVTPS2PH with wide types.
33703static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33704 SDLoc dl(Op);
33705 EVT VT = Op.getValueType();
33706 SDValue Lo, Hi;
33707 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33708 EVT LoVT, HiVT;
33709 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33710 SDValue RC = Op.getOperand(1);
33711 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33712 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33713 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33714}
33715
33716static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33717 unsigned OpNo) {
33718 const APInt Operand(32, OpNo);
33719 std::string OpNoStr = llvm::toString(Operand, 10, false);
33720 std::string Str(" $");
33721
33722 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33723 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33724
33725 auto I = StringRef::npos;
33726 for (auto &AsmStr : AsmStrs) {
33727 // Match the OpNo string exactly so that we do not match a mere
33728 // sub-string, e.g. "$12" contains "$1".
33729 if (AsmStr.endswith(OpNoStr1))
33730 I = AsmStr.size() - OpNoStr1.size();
33731
33732 // Get the index of operand in AsmStr.
33733 if (I == StringRef::npos)
33734 I = AsmStr.find(OpNoStr1 + ",");
33735 if (I == StringRef::npos)
33736 I = AsmStr.find(OpNoStr2);
33737
33738 if (I == StringRef::npos)
33739 continue;
33740
33741 assert(I > 0 && "Unexpected inline asm string!");
33742 // Remove the operand string and label (if it exists).
33743 // For example:
33744 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33745 // ==>
33746 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33747 // ==>
33748 // "call dword ptr "
33749 auto TmpStr = AsmStr.substr(0, I);
33750 I = TmpStr.rfind(':');
33751 if (I == StringRef::npos)
33752 return TmpStr;
33753
33754 assert(I < TmpStr.size() && "Unexpected inline asm string!");
33755 auto Asm = TmpStr.drop_front(I + 1);
33756 return Asm;
33757 }
33758
33759 return StringRef();
33760}
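
The loop above trims an inline-asm string down to the mnemonic that consumes operand OpNo, e.g. ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}" becomes "call dword ptr". A rough standalone sketch of the same trimming using plain std::string rather than StringRef (a simplification, not the function above):

#include <string>

std::string instrStrForOperand(const std::string &Asm, unsigned OpNo) {
  std::string Plain = " $" + std::to_string(OpNo);        // e.g. " $1"
  std::string WithMod = " ${" + std::to_string(OpNo) + ":"; // e.g. " ${1:P}"

  size_t I = std::string::npos;
  if (Asm.size() >= Plain.size() &&
      Asm.compare(Asm.size() - Plain.size(), Plain.size(), Plain) == 0)
    I = Asm.size() - Plain.size();   // " $N" at the very end of the string
  if (I == std::string::npos)
    I = Asm.find(Plain + ",");       // " $N," mid-string (avoids "$12" for "$1")
  if (I == std::string::npos)
    I = Asm.find(WithMod);           // " ${N:" with a modifier
  if (I == std::string::npos)
    return "";

  std::string Tmp = Asm.substr(0, I); // drop the operand string
  size_t Colon = Tmp.rfind(':');      // drop a leading "<label>:" if present
  return Colon == std::string::npos ? Tmp : Tmp.substr(Colon + 1);
}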
33761
33762bool X86TargetLowering::isInlineAsmTargetBranch(
33763 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33764 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33765
33766 if (InstrStr.contains("call"))
33767 return true;
33768
33769 return false;
33770}
33771
33772/// Provide custom lowering hooks for some operations.
33773SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33774 switch (Op.getOpcode()) {
33775 default: llvm_unreachable("Should not custom lower this!");
33776 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33777 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33778 return LowerCMP_SWAP(Op, Subtarget, DAG);
33779 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33780 case ISD::ATOMIC_LOAD_ADD:
33781 case ISD::ATOMIC_LOAD_SUB:
33782 case ISD::ATOMIC_LOAD_OR:
33783 case ISD::ATOMIC_LOAD_XOR:
33784 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33785 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33786 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33787 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33788 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33789 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33790 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33791 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33792 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33793 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33794 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33795 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33796 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33797 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33798 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33799 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33800 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33801 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33802 case ISD::SHL_PARTS:
33803 case ISD::SRA_PARTS:
33804 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33805 case ISD::FSHL:
33806 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33807 case ISD::STRICT_SINT_TO_FP:
33808 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33809 case ISD::STRICT_UINT_TO_FP:
33810 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33811 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33812 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33813 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33814 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33815 case ISD::ZERO_EXTEND_VECTOR_INREG:
33816 case ISD::SIGN_EXTEND_VECTOR_INREG:
33817 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33818 case ISD::FP_TO_SINT:
33819 case ISD::STRICT_FP_TO_SINT:
33820 case ISD::FP_TO_UINT:
33821 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33822 case ISD::FP_TO_SINT_SAT:
33823 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33824 case ISD::FP_EXTEND:
33825 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33826 case ISD::FP_ROUND:
33827 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33828 case ISD::FP16_TO_FP:
33829 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33830 case ISD::FP_TO_FP16:
33831 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33832 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33833 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33834 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33835 case ISD::FADD:
33836 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33837 case ISD::FROUND: return LowerFROUND(Op, DAG);
33838 case ISD::FABS:
33839 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33840 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33841 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33842 case ISD::LRINT:
33843 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33844 case ISD::SETCC:
33845 case ISD::STRICT_FSETCC:
33846 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33847 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33848 case ISD::SELECT: return LowerSELECT(Op, DAG);
33849 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33850 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33851 case ISD::VASTART: return LowerVASTART(Op, DAG);
33852 case ISD::VAARG: return LowerVAARG(Op, DAG);
33853 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33854 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33855 case ISD::INTRINSIC_VOID:
33856 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33857 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33858 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33859 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33860 case ISD::FRAME_TO_ARGS_OFFSET:
33861 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33862 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33863 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33864 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33865 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33866 case ISD::EH_SJLJ_SETUP_DISPATCH:
33867 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33868 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33869 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33870 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33871 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33872 case ISD::CTLZ:
33873 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33874 case ISD::CTTZ:
33875 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33876 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33877 case ISD::MULHS:
33878 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33879 case ISD::ROTL:
33880 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33881 case ISD::SRA:
33882 case ISD::SRL:
33883 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33884 case ISD::SADDO:
33885 case ISD::UADDO:
33886 case ISD::SSUBO:
33887 case ISD::USUBO: return LowerXALUO(Op, DAG);
33888 case ISD::SMULO:
33889 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33890 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33891 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33892 case ISD::SADDO_CARRY:
33893 case ISD::SSUBO_CARRY:
33894 case ISD::ADDCARRY:
33895 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33896 case ISD::ADD:
33897 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33898 case ISD::UADDSAT:
33899 case ISD::SADDSAT:
33900 case ISD::USUBSAT:
33901 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33902 case ISD::SMAX:
33903 case ISD::SMIN:
33904 case ISD::UMAX:
33905 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33906 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33907 case ISD::ABDS:
33908 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33909 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33910 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33911 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33912 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33913 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33914 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33915 case ISD::GC_TRANSITION_START:
33916 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33917 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33918 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33919 }
33920}
33921
33922/// Replace a node with an illegal result type with a new node built out of
33923/// custom code.
33924void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33925 SmallVectorImpl<SDValue>&Results,
33926 SelectionDAG &DAG) const {
33927 SDLoc dl(N);
33928 switch (N->getOpcode()) {
33929 default:
33930#ifndef NDEBUG
33931 dbgs() << "ReplaceNodeResults: ";
33932 N->dump(&DAG);
33933#endif
33934 llvm_unreachable("Do not know how to custom type legalize this operation!");
33935 case X86ISD::CVTPH2PS: {
33936 EVT VT = N->getValueType(0);
33937 SDValue Lo, Hi;
33938 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33939 EVT LoVT, HiVT;
33940 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33941 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33942 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33943 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33944 Results.push_back(Res);
33945 return;
33946 }
33947 case X86ISD::STRICT_CVTPH2PS: {
33948 EVT VT = N->getValueType(0);
33949 SDValue Lo, Hi;
33950 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33951 EVT LoVT, HiVT;
33952 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33953 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33954 {N->getOperand(0), Lo});
33955 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33956 {N->getOperand(0), Hi});
33957 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33958 Lo.getValue(1), Hi.getValue(1));
33959 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33960 Results.push_back(Res);
33961 Results.push_back(Chain);
33962 return;
33963 }
33964 case X86ISD::CVTPS2PH:
33965 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33966 return;
33967 case ISD::CTPOP: {
33968 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33969 // Use a v2i64 if possible.
33970 bool NoImplicitFloatOps =
33971 DAG.getMachineFunction().getFunction().hasFnAttribute(
33972 Attribute::NoImplicitFloat);
33973 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33974 SDValue Wide =
33975 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33976 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33977 // The bit count fits in 32 bits, so extract it as i32 and then zero
33978 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33979 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33980 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33981 DAG.getIntPtrConstant(0, dl));
33982 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33983 Results.push_back(Wide);
33984 }
33985 return;
33986 }
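
The extract-as-i32-then-zero-extend step above relies on the fact that a 64-bit population count never exceeds 64 and therefore always fits in 32 bits. A scalar illustration of that invariant (C++20, not the DAG lowering itself):

#include <bit>
#include <cstdint>

// Popcount of a 64-bit value is at most 64, so keeping only the low 32 bits
// of the result and zero-extending back to 64 bits loses nothing.
uint64_t popcount64ViaLow32(uint64_t X) {
  uint32_t Low = static_cast<uint32_t>(std::popcount(X)); // <= 64
  return static_cast<uint64_t>(Low);                      // zero-extend
}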
33987 case ISD::MUL: {
33988 EVT VT = N->getValueType(0);
33989 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33990 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33991 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33992 // elements are needed.
33993 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33994 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33995 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33996 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33997 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33998 unsigned NumConcats = 16 / VT.getVectorNumElements();
33999 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34000 ConcatOps[0] = Res;
34001 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34002 Results.push_back(Res);
34003 return;
34004 }
34005 case ISD::SMULO:
34006 case ISD::UMULO: {
34007 EVT VT = N->getValueType(0);
34008 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34009 VT == MVT::v2i32 && "Unexpected VT!");
34010 bool IsSigned = N->getOpcode() == ISD::SMULO;
34011 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34012 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34013 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34014 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34015 // Extract the high 32 bits from each result using PSHUFD.
34016 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34017 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34018 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34019 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34020 DAG.getIntPtrConstant(0, dl));
34021
34022 // Truncate the low bits of the result. This will become PSHUFD.
34023 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34024
34025 SDValue HiCmp;
34026 if (IsSigned) {
34027 // SMULO overflows if the high bits don't match the sign of the low.
34028 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34029 } else {
34030 // UMULO overflows if the high bits are non-zero.
34031 HiCmp = DAG.getConstant(0, dl, VT);
34032 }
34033 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34034
34035 // Widen the result by padding with undef.
34036 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34037 DAG.getUNDEF(VT));
34038 Results.push_back(Res);
34039 Results.push_back(Ovf);
34040 return;
34041 }
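
The overflow check above compares the high 32 bits of each 64-bit product against either zero (unsigned) or the broadcast sign of the low half (signed). A scalar analogue of the same test (hypothetical helpers, not the vector lowering):

#include <cstdint>

// Signed overflow: the high half must equal the sign extension of the low half.
bool smulo32(int32_t A, int32_t B, int32_t &Lo) {
  int64_t Full = static_cast<int64_t>(A) * B;
  Lo = static_cast<int32_t>(Full);
  int32_t Hi = static_cast<int32_t>(Full >> 32);
  return Hi != (Lo >> 31); // compare against the low half's sign, SRA by 31
}

// Unsigned overflow: any nonzero bit in the high half means the product
// did not fit in 32 bits.
bool umulo32(uint32_t A, uint32_t B, uint32_t &Lo) {
  uint64_t Full = static_cast<uint64_t>(A) * B;
  Lo = static_cast<uint32_t>(Full);
  return (Full >> 32) != 0;
}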
34042 case X86ISD::VPMADDWD: {
34043 // Legalize types for X86ISD::VPMADDWD by widening.
34044 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34045
34046 EVT VT = N->getValueType(0);
34047 EVT InVT = N->getOperand(0).getValueType();
34048 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34049 "Expected a VT that divides into 128 bits.");
34050 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34051 "Unexpected type action!");
34052 unsigned NumConcat = 128 / InVT.getSizeInBits();
34053
34054 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34055 InVT.getVectorElementType(),
34056 NumConcat * InVT.getVectorNumElements());
34057 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34058 VT.getVectorElementType(),
34059 NumConcat * VT.getVectorNumElements());
34060
34061 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34062 Ops[0] = N->getOperand(0);
34063 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34064 Ops[0] = N->getOperand(1);
34065 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34066
34067 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34068 Results.push_back(Res);
34069 return;
34070 }
34071 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34072 case X86ISD::FMINC:
34073 case X86ISD::FMIN:
34074 case X86ISD::FMAXC:
34075 case X86ISD::FMAX: {
34076 EVT VT = N->getValueType(0);
34077 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34078 SDValue UNDEF = DAG.getUNDEF(VT);
34079 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34080 N->getOperand(0), UNDEF);
34081 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34082 N->getOperand(1), UNDEF);
34083 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34084 return;
34085 }
34086 case ISD::SDIV:
34087 case ISD::UDIV:
34088 case ISD::SREM:
34089 case ISD::UREM: {
34090 EVT VT = N->getValueType(0);
34091 if (VT.isVector()) {
34092 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34093 "Unexpected type action!");
34094 // If this RHS is a constant splat vector we can widen this and let
34095 // division/remainder by constant optimize it.
34096 // TODO: Can we do something for non-splat?
34097 APInt SplatVal;
34098 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34099 unsigned NumConcats = 128 / VT.getSizeInBits();
34100 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34101 Ops0[0] = N->getOperand(0);
34102 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34103 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34104 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34105 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34106 Results.push_back(Res);
34107 }
34108 return;
34109 }
34110
34111 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34112 Results.push_back(V);
34113 return;
34114 }
34115 case ISD::TRUNCATE: {
34116 MVT VT = N->getSimpleValueType(0);
34117 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34118 return;
34119
34120 // The generic legalizer will try to widen the input type to the same
34121 // number of elements as the widened result type. But this isn't always
34122 // the best thing so do some custom legalization to avoid some cases.
34123 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34124 SDValue In = N->getOperand(0);
34125 EVT InVT = In.getValueType();
34126
34127 unsigned InBits = InVT.getSizeInBits();
34128 if (128 % InBits == 0) {
34129 // 128-bit and smaller inputs should avoid the truncate altogether and
34130 // just use a build_vector that will become a shuffle.
34131 // TODO: Widen and use a shuffle directly?
34132 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34133 EVT EltVT = VT.getVectorElementType();
34134 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34135 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34136 // Use the original element count so we don't do more scalar opts than
34137 // necessary.
34138 unsigned MinElts = VT.getVectorNumElements();
34139 for (unsigned i=0; i < MinElts; ++i) {
34140 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34141 DAG.getIntPtrConstant(i, dl));
34142 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34143 }
34144 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34145 return;
34146 }
34147 // With AVX512 there are some cases that can use a target specific
34148 // truncate node to go from 256/512 to less than 128 with zeros in the
34149 // upper elements of the 128 bit result.
34150 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34151 // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
34152 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34153 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34154 return;
34155 }
34156 // There's one case we can widen to 512 bits and use VTRUNC.
34157 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34158 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34159 DAG.getUNDEF(MVT::v4i64));
34160 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34161 return;
34162 }
34163 }
34164 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34165 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34166 isTypeLegal(MVT::v4i64)) {
34167 // The input needs to be split and the output needs to be widened. Let's
34168 // use two VTRUNCs, and shuffle their results together into the wider type.
34169 SDValue Lo, Hi;
34170 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34171
34172 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34173 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34174 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34175 { 0, 1, 2, 3, 16, 17, 18, 19,
34176 -1, -1, -1, -1, -1, -1, -1, -1 });
34177 Results.push_back(Res);
34178 return;
34179 }
34180
34181 return;
34182 }
34183 case ISD::ANY_EXTEND:
34184 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34185 // It's intended to custom handle the input type.
34186 assert(N->getValueType(0) == MVT::v8i8 &&
34187 "Do not know how to legalize this Node");
34188 return;
34189 case ISD::SIGN_EXTEND:
34190 case ISD::ZERO_EXTEND: {
34191 EVT VT = N->getValueType(0);
34192 SDValue In = N->getOperand(0);
34193 EVT InVT = In.getValueType();
34194 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34195 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34196 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34197 "Unexpected type action!");
34198 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34199 // Custom split this so we can extend i8/i16->i32 invec. This is better
34200 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34201 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
34202 // we allow the sra from the extend to i32 to be shared by the split.
34203 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34204
34205 // Fill a vector with sign bits for each element.
34206 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34207 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34208
34209 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34210 // to v2i64.
34211 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34212 {0, 4, 1, 5});
34213 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34214 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34215 {2, 6, 3, 7});
34216 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34217
34218 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34219 Results.push_back(Res);
34220 return;
34221 }
34222
34223 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34224 if (!InVT.is128BitVector()) {
34225 // Not a 128 bit vector, but maybe type legalization will promote
34226 // it to 128 bits.
34227 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34228 return;
34229 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34230 if (!InVT.is128BitVector())
34231 return;
34232
34233 // Promote the input to 128 bits. Type legalization will turn this into
34234 // zext_inreg/sext_inreg.
34235 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34236 }
34237
34238 // Perform custom splitting instead of the two stage extend we would get
34239 // by default.
34240 EVT LoVT, HiVT;
34241 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34242 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34243
34244 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34245
34246 // We need to shift the input over by half the number of elements.
34247 unsigned NumElts = InVT.getVectorNumElements();
34248 unsigned HalfNumElts = NumElts / 2;
34249 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34250 for (unsigned i = 0; i != HalfNumElts; ++i)
34251 ShufMask[i] = i + HalfNumElts;
34252
34253 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34254 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34255
34256 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34257 Results.push_back(Res);
34258 }
34259 return;
34260 }
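
In the sign-extend split above, the upper 32 bits of each i64 lane come from a compare-against-zero mask, which is exactly the lane's sign extension. A per-lane scalar analogue (assuming little-endian lane layout; not LLVM code):

#include <cstdint>

// The mask is all-ones exactly when the value is negative, i.e. it is the
// sign extension of the low half.
int64_t signExtendViaMask(int32_t V) {
  uint32_t SignMask = (0 > V) ? 0xFFFFFFFFu : 0u;   // SETGT(0, V) per lane
  uint64_t Wide = (static_cast<uint64_t>(SignMask) << 32) |
                  static_cast<uint32_t>(V);
  return static_cast<int64_t>(Wide);
}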
34261 case ISD::FP_TO_SINT:
34262 case ISD::STRICT_FP_TO_SINT:
34263 case ISD::FP_TO_UINT:
34264 case ISD::STRICT_FP_TO_UINT: {
34265 bool IsStrict = N->isStrictFPOpcode();
34266 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34267 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34268 EVT VT = N->getValueType(0);
34269 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34270 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34271 EVT SrcVT = Src.getValueType();
34272
34273 SDValue Res;
34274 if (isSoftFP16(SrcVT)) {
34275 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34276 if (IsStrict) {
34277 Res =
34278 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34279 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34280 {NVT, MVT::Other}, {Chain, Src})});
34281 Chain = Res.getValue(1);
34282 } else {
34283 Res = DAG.getNode(N->getOpcode(), dl, VT,
34284 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34285 }
34286 Results.push_back(Res);
34287 if (IsStrict)
34288 Results.push_back(Chain);
34289
34290 return;
34291 }
34292
34293 if (VT.isVector() && Subtarget.hasFP16() &&
34294 SrcVT.getVectorElementType() == MVT::f16) {
34295 EVT EleVT = VT.getVectorElementType();
34296 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34297
34298 if (SrcVT != MVT::v8f16) {
34299 SDValue Tmp =
34300 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34301 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34302 Ops[0] = Src;
34303 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34304 }
34305
34306 if (IsStrict) {
34307 unsigned Opc =
34308 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34309 Res =
34310 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34311 Chain = Res.getValue(1);
34312 } else {
34313 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34314 Res = DAG.getNode(Opc, dl, ResVT, Src);
34315 }
34316
34317 // TODO: Need to add exception check code for strict FP.
34318 if (EleVT.getSizeInBits() < 16) {
34319 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34320 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34321
34322 // Now widen to 128 bits.
34323 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34324 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34325 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34326 ConcatOps[0] = Res;
34327 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34328 }
34329
34330 Results.push_back(Res);
34331 if (IsStrict)
34332 Results.push_back(Chain);
34333
34334 return;
34335 }
34336
34337 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34338 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34339 "Unexpected type action!");
34340
34341 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34342 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34343 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34344 VT.getVectorNumElements());
34345 SDValue Res;
34346 SDValue Chain;
34347 if (IsStrict) {
34348 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34349 {N->getOperand(0), Src});
34350 Chain = Res.getValue(1);
34351 } else
34352 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34353
34354 // Preserve what we know about the size of the original result. If the
34355 // result is v2i32, we have to manually widen the assert.
34356 if (PromoteVT == MVT::v2i32)
34357 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34358 DAG.getUNDEF(MVT::v2i32));
34359
34360 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34361 Res.getValueType(), Res,
34362 DAG.getValueType(VT.getVectorElementType()));
34363
34364 if (PromoteVT == MVT::v2i32)
34365 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34366 DAG.getIntPtrConstant(0, dl));
34367
34368 // Truncate back to the original width.
34369 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34370
34371 // Now widen to 128 bits.
34372 unsigned NumConcats = 128 / VT.getSizeInBits();
34373 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34374 VT.getVectorNumElements() * NumConcats);
34375 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34376 ConcatOps[0] = Res;
34377 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34378 Results.push_back(Res);
34379 if (IsStrict)
34380 Results.push_back(Chain);
34381 return;
34382 }
34383
34384
34385 if (VT == MVT::v2i32) {
34386 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34387 "Strict unsigned conversion requires AVX512");
34388 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34389 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34390 "Unexpected type action!");
34391 if (Src.getValueType() == MVT::v2f64) {
34392 if (!IsSigned && !Subtarget.hasAVX512()) {
34393 SDValue Res =
34394 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34395 Results.push_back(Res);
34396 return;
34397 }
34398
34399 unsigned Opc;
34400 if (IsStrict)
34401 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34402 else
34403 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34404
34405 // If we have VLX we can emit a target specific FP_TO_UINT node.
34406 if (!IsSigned && !Subtarget.hasVLX()) {
34407 // Otherwise we can defer to the generic legalizer which will widen
34408 // the input as well. This will be further widened during op
34409 // legalization to v8i32<-v8f64.
34410 // For strict nodes we'll need to widen ourselves.
34411 // FIXME: Fix the type legalizer to safely widen strict nodes?
34412 if (!IsStrict)
34413 return;
34414 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34415 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34416 Opc = N->getOpcode();
34417 }
34418 SDValue Res;
34419 SDValue Chain;
34420 if (IsStrict) {
34421 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34422 {N->getOperand(0), Src});
34423 Chain = Res.getValue(1);
34424 } else {
34425 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34426 }
34427 Results.push_back(Res);
34428 if (IsStrict)
34429 Results.push_back(Chain);
34430 return;
34431 }
34432
34433 // Custom widen strict v2f32->v2i32 by padding with zeros.
34434 // FIXME: Should generic type legalizer do this?
34435 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34436 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34437 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34438 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34439 {N->getOperand(0), Src});
34440 Results.push_back(Res);
34441 Results.push_back(Res.getValue(1));
34442 return;
34443 }
34444
34445 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34446 // so early out here.
34447 return;
34448 }
34449
34450 assert(!VT.isVector() && "Vectors should have been handled above!");
34451
34452 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34453 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34454 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34455 assert(!Subtarget.is64Bit() && "i64 should be legal");
34456 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34457 // If we use a 128-bit result we might need to use a target specific node.
34458 unsigned SrcElts =
34459 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34460 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34461 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34462 unsigned Opc = N->getOpcode();
34463 if (NumElts != SrcElts) {
34464 if (IsStrict)
34465 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34466 else
34467 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34468 }
34469
34470 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34471 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34472 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34473 ZeroIdx);
34474 SDValue Chain;
34475 if (IsStrict) {
34476 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34477 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34478 Chain = Res.getValue(1);
34479 } else
34480 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34481 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34482 Results.push_back(Res);
34483 if (IsStrict)
34484 Results.push_back(Chain);
34485 return;
34486 }
34487
34488 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34489 SDValue Chain;
34490 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34491 Results.push_back(V);
34492 if (IsStrict)
34493 Results.push_back(Chain);
34494 return;
34495 }
34496
34497 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34498 Results.push_back(V);
34499 if (IsStrict)
34500 Results.push_back(Chain);
34501 }
34502 return;
34503 }
34504 case ISD::LRINT:
34505 case ISD::LLRINT: {
34506 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34507 Results.push_back(V);
34508 return;
34509 }
34510
34511 case ISD::SINT_TO_FP:
34512 case ISD::STRICT_SINT_TO_FP:
34513 case ISD::UINT_TO_FP:
34514 case ISD::STRICT_UINT_TO_FP: {
34515 bool IsStrict = N->isStrictFPOpcode();
34516 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34517 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34518 EVT VT = N->getValueType(0);
34519 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34520 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34521 Subtarget.hasVLX()) {
34522 if (Src.getValueType().getVectorElementType() == MVT::i16)
34523 return;
34524
34525 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34526 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34527 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34528 : DAG.getUNDEF(MVT::v2i32));
34529 if (IsStrict) {
34530 unsigned Opc =
34531 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34532 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34533 {N->getOperand(0), Src});
34534 Results.push_back(Res);
34535 Results.push_back(Res.getValue(1));
34536 } else {
34537 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34538 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34539 }
34540 return;
34541 }
34542 if (VT != MVT::v2f32)
34543 return;
34544 EVT SrcVT = Src.getValueType();
34545 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34546 if (IsStrict) {
34547 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34548 : X86ISD::STRICT_CVTUI2P;
34549 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34550 {N->getOperand(0), Src});
34551 Results.push_back(Res);
34552 Results.push_back(Res.getValue(1));
34553 } else {
34554 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34555 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34556 }
34557 return;
34558 }
34559 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34560 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34561 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34562 SDValue One = DAG.getConstant(1, dl, SrcVT);
34563 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34564 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34565 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34566 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34567 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34568 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34569 for (int i = 0; i != 2; ++i) {
34570 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34571 SignSrc, DAG.getIntPtrConstant(i, dl));
34572 if (IsStrict)
34573 SignCvts[i] =
34574 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34575 {N->getOperand(0), Elt});
34576 else
34577 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34578 };
34579 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34580 SDValue Slow, Chain;
34581 if (IsStrict) {
34582 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34583 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34584 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34585 {Chain, SignCvt, SignCvt});
34586 Chain = Slow.getValue(1);
34587 } else {
34588 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34589 }
34590 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34591 IsNeg =
34592 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34593 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34594 Results.push_back(Cvt);
34595 if (IsStrict)
34596 Results.push_back(Chain);
34597 return;
34598 }
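
Without AVX-512 there is no unsigned v2i64 conversion, so the block above halves inputs whose sign bit is set (ORing the lost bit back in so rounding stays correct), converts them as signed, and doubles the result. A scalar sketch of that path (an illustration under those assumptions, not the DAG code):

#include <cstdint>

float u64ToF32(uint64_t X) {
  if (static_cast<int64_t>(X) >= 0)
    return static_cast<float>(static_cast<int64_t>(X)); // fast signed path
  uint64_t Halved = (X >> 1) | (X & 1);                 // keep the rounding bit
  float F = static_cast<float>(static_cast<int64_t>(Halved));
  return F + F;                                         // double back up
}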
34599
34600 if (SrcVT != MVT::v2i32)
34601 return;
34602
34603 if (IsSigned || Subtarget.hasAVX512()) {
34604 if (!IsStrict)
34605 return;
34606
34607 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34608 // FIXME: Should generic type legalizer do this?
34609 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34610 DAG.getConstant(0, dl, MVT::v2i32));
34611 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34612 {N->getOperand(0), Src});
34613 Results.push_back(Res);
34614 Results.push_back(Res.getValue(1));
34615 return;
34616 }
34617
34618 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34619 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34620 SDValue VBias = DAG.getConstantFP(
34621 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34622 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34623 DAG.getBitcast(MVT::v2i64, VBias));
34624 Or = DAG.getBitcast(MVT::v2f64, Or);
34625 if (IsStrict) {
34626 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34627 {N->getOperand(0), Or, VBias});
34628 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34629 {MVT::v4f32, MVT::Other},
34630 {Sub.getValue(1), Sub});
34631 Results.push_back(Res);
34632 Results.push_back(Res.getValue(1));
34633 } else {
34634 // TODO: Are there any fast-math-flags to propagate here?
34635 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34636 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34637 }
34638 return;
34639 }
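
The 0x4330000000000000 constant used above is the double 2^52: ORing a 32-bit unsigned value into its low mantissa bits and then subtracting 2^52 yields that value exactly as a double, which is then narrowed to f32. A scalar illustration of the bias trick (not LLVM code):

#include <cstdint>
#include <cstring>

float u32ToF32ViaBias(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 with X in the mantissa
  double D;
  std::memcpy(&D, &Bits, sizeof(D));         // reinterpret as double
  return static_cast<float>(D - 0x1p52);     // subtract 2^52, round to f32
}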
34640 case ISD::STRICT_FP_ROUND:
34641 case ISD::FP_ROUND: {
34642 bool IsStrict = N->isStrictFPOpcode();
34643 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34644 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34645 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34646 EVT SrcVT = Src.getValueType();
34647 EVT VT = N->getValueType(0);
34648 SDValue V;
34649 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34650 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34651 : DAG.getUNDEF(MVT::v2f32);
34652 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34653 }
34654 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34655      assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34656 if (SrcVT.getVectorElementType() != MVT::f32)
34657 return;
34658
34659 if (IsStrict)
34660 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34661 {Chain, Src, Rnd});
34662 else
34663 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34664
34665 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34666 if (IsStrict)
34667 Results.push_back(V.getValue(1));
34668 return;
34669 }
34670 if (!isTypeLegal(Src.getValueType()))
34671 return;
34672 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34673 if (IsStrict)
34674 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34675 {Chain, Src});
34676 else
34677 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34678 Results.push_back(V);
34679 if (IsStrict)
34680 Results.push_back(V.getValue(1));
34681 return;
34682 }
34683 case ISD::FP_EXTEND:
34684 case ISD::STRICT_FP_EXTEND: {
34685 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34686 // No other ValueType for FP_EXTEND should reach this point.
34687    assert(N->getValueType(0) == MVT::v2f32 &&
34688           "Do not know how to legalize this Node");
34689 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34690 return;
34691 bool IsStrict = N->isStrictFPOpcode();
34692 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34693 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34694 : DAG.getUNDEF(MVT::v2f16);
34695 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34696 if (IsStrict)
34697 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34698 {N->getOperand(0), V});
34699 else
34700 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34701 Results.push_back(V);
34702 if (IsStrict)
34703 Results.push_back(V.getValue(1));
34704 return;
34705 }
34706 case ISD::INTRINSIC_W_CHAIN: {
34707 unsigned IntNo = N->getConstantOperandVal(1);
34708 switch (IntNo) {
34709    default : llvm_unreachable("Do not know how to custom type "
34710                               "legalize this intrinsic operation!");
34711 case Intrinsic::x86_rdtsc:
34712 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34713 Results);
34714 case Intrinsic::x86_rdtscp:
34715 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34716 Results);
34717 case Intrinsic::x86_rdpmc:
34718 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34719 Results);
34720 return;
34721 case Intrinsic::x86_rdpru:
34722 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34723 Results);
34724 return;
34725 case Intrinsic::x86_xgetbv:
34726 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34727 Results);
34728 return;
34729 }
34730 }
34731 case ISD::READCYCLECOUNTER: {
34732 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34733 }
34734 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34735 EVT T = N->getValueType(0);
34736    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34737 bool Regs64bit = T == MVT::i128;
34738    assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34739           "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34740 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34741 SDValue cpInL, cpInH;
34742 std::tie(cpInL, cpInH) =
34743 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34744 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34745 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34746 cpInH =
34747 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34748 cpInH, cpInL.getValue(1));
34749 SDValue swapInL, swapInH;
34750 std::tie(swapInL, swapInH) =
34751 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34752 swapInH =
34753 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34754 swapInH, cpInH.getValue(1));
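    // Register convention used by CMPXCHG8B/CMPXCHG16B: the expected value
    // goes in EDX:EAX (RDX:RAX) and the replacement in ECX:EBX (RCX:RBX); on
    // success ZF is set, otherwise the current memory value is returned in
    // EDX:EAX (RDX:RAX). The copies above follow that layout.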
34755
34756 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34757 // until later. So we keep the RBX input in a vreg and use a custom
34758 // inserter.
34759    // Since RBX will be a reserved register, the register allocator will not
34760    // ensure that its value is properly saved and restored around this
34761    // live range.
34762 SDValue Result;
34763 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34764 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34765 if (Regs64bit) {
34766 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34767 swapInH.getValue(1)};
34768 Result =
34769 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34770 } else {
34771 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34772 swapInH.getValue(1));
34773 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34774 swapInL.getValue(1)};
34775 Result =
34776 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34777 }
34778
34779 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34780 Regs64bit ? X86::RAX : X86::EAX,
34781 HalfT, Result.getValue(1));
34782 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34783 Regs64bit ? X86::RDX : X86::EDX,
34784 HalfT, cpOutL.getValue(2));
34785 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34786
34787 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34788 MVT::i32, cpOutH.getValue(2));
34789 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34790 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34791
34792 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34793 Results.push_back(Success);
34794 Results.push_back(EFLAGS.getValue(1));
34795 return;
34796 }
34797 case ISD::ATOMIC_LOAD: {
34798    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34799 bool NoImplicitFloatOps =
34800 DAG.getMachineFunction().getFunction().hasFnAttribute(
34801 Attribute::NoImplicitFloat);
34802 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34803 auto *Node = cast<AtomicSDNode>(N);
34804 if (Subtarget.hasSSE1()) {
34805 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34806 // Then extract the lower 64-bits.
34807 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34808 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34809 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34810 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34811 MVT::i64, Node->getMemOperand());
34812 if (Subtarget.hasSSE2()) {
34813 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34814 DAG.getIntPtrConstant(0, dl));
34815 Results.push_back(Res);
34816 Results.push_back(Ld.getValue(1));
34817 return;
34818 }
34819 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34820 // then casts to i64. This avoids a 128-bit stack temporary being
34821 // created by type legalization if we were to cast v4f32->v2i64.
34822 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34823 DAG.getIntPtrConstant(0, dl));
34824 Res = DAG.getBitcast(MVT::i64, Res);
34825 Results.push_back(Res);
34826 Results.push_back(Ld.getValue(1));
34827 return;
34828 }
34829 if (Subtarget.hasX87()) {
34830 // First load this into an 80-bit X87 register. This will put the whole
34831 // integer into the significand.
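        // The x87 fp80 format has a 64-bit explicit significand, so every
        // i64 value is representable exactly and the FILD/FIST round trip
        // below is lossless.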
34832 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34833 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34834 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34835 dl, Tys, Ops, MVT::i64,
34836 Node->getMemOperand());
34837 SDValue Chain = Result.getValue(1);
34838
34839 // Now store the X87 register to a stack temporary and convert to i64.
34840 // This store is not atomic and doesn't need to be.
34841 // FIXME: We don't need a stack temporary if the result of the load
34842 // is already being stored. We could just directly store there.
34843 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34844 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34845 MachinePointerInfo MPI =
34846 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34847 SDValue StoreOps[] = { Chain, Result, StackPtr };
34848 Chain = DAG.getMemIntrinsicNode(
34849 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34850 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34851
34852 // Finally load the value back from the stack temporary and return it.
34853 // This load is not atomic and doesn't need to be.
34854 // This load will be further type legalized.
34855 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34856 Results.push_back(Result);
34857 Results.push_back(Result.getValue(1));
34858 return;
34859 }
34860 }
34861 // TODO: Use MOVLPS when SSE1 is available?
34862 // Delegate to generic TypeLegalization. Situations we can really handle
34863 // should have already been dealt with by AtomicExpandPass.cpp.
34864 break;
34865 }
34866 case ISD::ATOMIC_SWAP:
34867 case ISD::ATOMIC_LOAD_ADD:
34868 case ISD::ATOMIC_LOAD_SUB:
34869 case ISD::ATOMIC_LOAD_AND:
34870 case ISD::ATOMIC_LOAD_OR:
34871 case ISD::ATOMIC_LOAD_XOR:
34872 case ISD::ATOMIC_LOAD_NAND:
34873 case ISD::ATOMIC_LOAD_MIN:
34874 case ISD::ATOMIC_LOAD_MAX:
34875 case ISD::ATOMIC_LOAD_UMIN:
34876 case ISD::ATOMIC_LOAD_UMAX:
34877 // Delegate to generic TypeLegalization. Situations we can really handle
34878 // should have already been dealt with by AtomicExpandPass.cpp.
34879 break;
34880
34881 case ISD::BITCAST: {
34882    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34883 EVT DstVT = N->getValueType(0);
34884 EVT SrcVT = N->getOperand(0).getValueType();
34885
34886    // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
34887 // we can split using the k-register rather than memory.
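    // With AVX512BW each v32i1 half lives in a mask register and is typically
    // selected as a KMOVD to a 32-bit GPR, so no stack temporary is needed.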
34888 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34889      assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34890 SDValue Lo, Hi;
34891 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34892 Lo = DAG.getBitcast(MVT::i32, Lo);
34893 Hi = DAG.getBitcast(MVT::i32, Hi);
34894 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34895 Results.push_back(Res);
34896 return;
34897 }
34898
34899 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34900 // FIXME: Use v4f32 for SSE1?
34901      assert(Subtarget.hasSSE2() && "Requires SSE2");
34902      assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34903             "Unexpected type action!");
34904 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34905 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34906 N->getOperand(0));
34907 Res = DAG.getBitcast(WideVT, Res);
34908 Results.push_back(Res);
34909 return;
34910 }
34911
34912 return;
34913 }
34914 case ISD::MGATHER: {
34915 EVT VT = N->getValueType(0);
34916 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34917 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34918 auto *Gather = cast<MaskedGatherSDNode>(N);
34919 SDValue Index = Gather->getIndex();
34920 if (Index.getValueType() != MVT::v2i64)
34921 return;
34922      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34923             "Unexpected type action!");
34924 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34925 SDValue Mask = Gather->getMask();
34926      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34927 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34928 Gather->getPassThru(),
34929 DAG.getUNDEF(VT));
34930 if (!Subtarget.hasVLX()) {
34931 // We need to widen the mask, but the instruction will only use 2
34932 // of its elements. So we can use undef.
34933 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34934 DAG.getUNDEF(MVT::v2i1));
34935 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34936 }
34937 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34938 Gather->getBasePtr(), Index, Gather->getScale() };
34939 SDValue Res = DAG.getMemIntrinsicNode(
34940 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34941 Gather->getMemoryVT(), Gather->getMemOperand());
34942 Results.push_back(Res);
34943 Results.push_back(Res.getValue(1));
34944 return;
34945 }
34946 return;
34947 }
34948 case ISD::LOAD: {
34949 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34950    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34951 // cast since type legalization will try to use an i64 load.
34952 MVT VT = N->getSimpleValueType(0);
34953    assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34954    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34955           "Unexpected type action!");
34956 if (!ISD::isNON_EXTLoad(N))
34957 return;
34958 auto *Ld = cast<LoadSDNode>(N);
34959 if (Subtarget.hasSSE2()) {
34960 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34961 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34962 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34963 Ld->getMemOperand()->getFlags());
34964 SDValue Chain = Res.getValue(1);
34965 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34966 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34967 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34968 Res = DAG.getBitcast(WideVT, Res);
34969 Results.push_back(Res);
34970 Results.push_back(Chain);
34971 return;
34972 }
34973    assert(Subtarget.hasSSE1() && "Expected SSE");
34974 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34975 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34976 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34977 MVT::i64, Ld->getMemOperand());
34978 Results.push_back(Res);
34979 Results.push_back(Res.getValue(1));
34980 return;
34981 }
34982 case ISD::ADDRSPACECAST: {
34983 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34984 Results.push_back(V);
34985 return;
34986 }
34987 case ISD::BITREVERSE: {
34988    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34989    assert(Subtarget.hasXOP() && "Expected XOP");
34990 // We can use VPPERM by copying to a vector register and back. We'll need
34991 // to move the scalar in two i32 pieces.
34992 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34993 return;
34994 }
34995 case ISD::EXTRACT_VECTOR_ELT: {
34996 // f16 = extract vXf16 %vec, i64 %idx
34997    assert(N->getSimpleValueType(0) == MVT::f16 &&
34998           "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34999    assert(Subtarget.hasFP16() && "Expected FP16");
35000 SDValue VecOp = N->getOperand(0);
35001 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35002 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35003 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35004 N->getOperand(1));
35005 Split = DAG.getBitcast(MVT::f16, Split);
35006 Results.push_back(Split);
35007 return;
35008 }
35009 }
35010}
35011
35012const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35013 switch ((X86ISD::NodeType)Opcode) {
35014 case X86ISD::FIRST_NUMBER: break;
35015#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35016 NODE_NAME_CASE(BSF)
35017 NODE_NAME_CASE(BSR)
35018 NODE_NAME_CASE(FSHL)
35019 NODE_NAME_CASE(FSHR)
35020 NODE_NAME_CASE(FAND)
35021 NODE_NAME_CASE(FANDN)
35022 NODE_NAME_CASE(FOR)
35023 NODE_NAME_CASE(FXOR)
35024 NODE_NAME_CASE(FILD)
35025 NODE_NAME_CASE(FIST)
35026 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35027 NODE_NAME_CASE(FLD)
35028 NODE_NAME_CASE(FST)
35029 NODE_NAME_CASE(CALL)
35030 NODE_NAME_CASE(CALL_RVMARKER)
35031 NODE_NAME_CASE(BT)
35032 NODE_NAME_CASE(CMP)
35033 NODE_NAME_CASE(FCMP)
35034 NODE_NAME_CASE(STRICT_FCMP)
35035 NODE_NAME_CASE(STRICT_FCMPS)
35036 NODE_NAME_CASE(COMI)
35037 NODE_NAME_CASE(UCOMI)
35038 NODE_NAME_CASE(CMPM)
35039 NODE_NAME_CASE(CMPMM)
35040 NODE_NAME_CASE(STRICT_CMPM)
35041 NODE_NAME_CASE(CMPMM_SAE)
35042 NODE_NAME_CASE(SETCC)
35043 NODE_NAME_CASE(SETCC_CARRY)
35044 NODE_NAME_CASE(FSETCC)
35045 NODE_NAME_CASE(FSETCCM)
35046 NODE_NAME_CASE(FSETCCM_SAE)
35047 NODE_NAME_CASE(CMOV)
35048 NODE_NAME_CASE(BRCOND)
35049 NODE_NAME_CASE(RET_GLUE)
35050 NODE_NAME_CASE(IRET)
35051 NODE_NAME_CASE(REP_STOS)
35052 NODE_NAME_CASE(REP_MOVS)
35053 NODE_NAME_CASE(GlobalBaseReg)
35054 NODE_NAME_CASE(Wrapper)
35055 NODE_NAME_CASE(WrapperRIP)
35056 NODE_NAME_CASE(MOVQ2DQ)
35057 NODE_NAME_CASE(MOVDQ2Q)
35058 NODE_NAME_CASE(MMX_MOVD2W)
35059 NODE_NAME_CASE(MMX_MOVW2D)
35060 NODE_NAME_CASE(PEXTRB)
35061 NODE_NAME_CASE(PEXTRW)
35062 NODE_NAME_CASE(INSERTPS)
35063 NODE_NAME_CASE(PINSRB)
35064 NODE_NAME_CASE(PINSRW)
35065 NODE_NAME_CASE(PSHUFB)
35066 NODE_NAME_CASE(ANDNP)
35067 NODE_NAME_CASE(BLENDI)
35068 NODE_NAME_CASE(BLENDV)
35069 NODE_NAME_CASE(HADD)
35070 NODE_NAME_CASE(HSUB)
35071 NODE_NAME_CASE(FHADD)
35072 NODE_NAME_CASE(FHSUB)
35073 NODE_NAME_CASE(CONFLICT)
35074 NODE_NAME_CASE(FMAX)
35075 NODE_NAME_CASE(FMAXS)
35076 NODE_NAME_CASE(FMAX_SAE)
35077 NODE_NAME_CASE(FMAXS_SAE)
35078 NODE_NAME_CASE(FMIN)
35079 NODE_NAME_CASE(FMINS)
35080 NODE_NAME_CASE(FMIN_SAE)
35081 NODE_NAME_CASE(FMINS_SAE)
35082 NODE_NAME_CASE(FMAXC)
35083 NODE_NAME_CASE(FMINC)
35084 NODE_NAME_CASE(FRSQRT)
35085 NODE_NAME_CASE(FRCP)
35086 NODE_NAME_CASE(EXTRQI)
35087 NODE_NAME_CASE(INSERTQI)
35088 NODE_NAME_CASE(TLSADDR)
35089 NODE_NAME_CASE(TLSBASEADDR)
35090 NODE_NAME_CASE(TLSCALL)
35091 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35092 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35093 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35094 NODE_NAME_CASE(EH_RETURN)
35095 NODE_NAME_CASE(TC_RETURN)
35096 NODE_NAME_CASE(FNSTCW16m)
35097 NODE_NAME_CASE(FLDCW16m)
35098 NODE_NAME_CASE(LCMPXCHG_DAG)
35099 NODE_NAME_CASE(LCMPXCHG8_DAG)
35100 NODE_NAME_CASE(LCMPXCHG16_DAG)
35101 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35102 NODE_NAME_CASE(LADD)
35103 NODE_NAME_CASE(LSUB)
35104 NODE_NAME_CASE(LOR)
35105 NODE_NAME_CASE(LXOR)
35106 NODE_NAME_CASE(LAND)
35107 NODE_NAME_CASE(LBTS)
35108 NODE_NAME_CASE(LBTC)
35109 NODE_NAME_CASE(LBTR)
35110 NODE_NAME_CASE(LBTS_RM)
35111 NODE_NAME_CASE(LBTC_RM)
35112 NODE_NAME_CASE(LBTR_RM)
35113 NODE_NAME_CASE(AADD)
35114 NODE_NAME_CASE(AOR)
35115 NODE_NAME_CASE(AXOR)
35116 NODE_NAME_CASE(AAND)
35117 NODE_NAME_CASE(VZEXT_MOVL)
35118 NODE_NAME_CASE(VZEXT_LOAD)
35119 NODE_NAME_CASE(VEXTRACT_STORE)
35120 NODE_NAME_CASE(VTRUNC)
35121 NODE_NAME_CASE(VTRUNCS)
35122 NODE_NAME_CASE(VTRUNCUS)
35123 NODE_NAME_CASE(VMTRUNC)
35124 NODE_NAME_CASE(VMTRUNCS)
35125 NODE_NAME_CASE(VMTRUNCUS)
35126 NODE_NAME_CASE(VTRUNCSTORES)
35127 NODE_NAME_CASE(VTRUNCSTOREUS)
35128 NODE_NAME_CASE(VMTRUNCSTORES)
35129 NODE_NAME_CASE(VMTRUNCSTOREUS)
35130 NODE_NAME_CASE(VFPEXT)
35131 NODE_NAME_CASE(STRICT_VFPEXT)
35132 NODE_NAME_CASE(VFPEXT_SAE)
35133 NODE_NAME_CASE(VFPEXTS)
35134 NODE_NAME_CASE(VFPEXTS_SAE)
35135 NODE_NAME_CASE(VFPROUND)
35136 NODE_NAME_CASE(STRICT_VFPROUND)
35137 NODE_NAME_CASE(VMFPROUND)
35138 NODE_NAME_CASE(VFPROUND_RND)
35139 NODE_NAME_CASE(VFPROUNDS)
35140 NODE_NAME_CASE(VFPROUNDS_RND)
35141 NODE_NAME_CASE(VSHLDQ)
35142 NODE_NAME_CASE(VSRLDQ)
35143 NODE_NAME_CASE(VSHL)
35144 NODE_NAME_CASE(VSRL)
35145 NODE_NAME_CASE(VSRA)
35146 NODE_NAME_CASE(VSHLI)
35147 NODE_NAME_CASE(VSRLI)
35148 NODE_NAME_CASE(VSRAI)
35149 NODE_NAME_CASE(VSHLV)
35150 NODE_NAME_CASE(VSRLV)
35151 NODE_NAME_CASE(VSRAV)
35152 NODE_NAME_CASE(VROTLI)
35153 NODE_NAME_CASE(VROTRI)
35154 NODE_NAME_CASE(VPPERM)
35155 NODE_NAME_CASE(CMPP)
35156 NODE_NAME_CASE(STRICT_CMPP)
35157 NODE_NAME_CASE(PCMPEQ)
35158 NODE_NAME_CASE(PCMPGT)
35159 NODE_NAME_CASE(PHMINPOS)
35160 NODE_NAME_CASE(ADD)
35161 NODE_NAME_CASE(SUB)
35162 NODE_NAME_CASE(ADC)
35163 NODE_NAME_CASE(SBB)
35164 NODE_NAME_CASE(SMUL)
35165 NODE_NAME_CASE(UMUL)
35166 NODE_NAME_CASE(OR)
35167 NODE_NAME_CASE(XOR)
35168 NODE_NAME_CASE(AND)
35169 NODE_NAME_CASE(BEXTR)
35170 NODE_NAME_CASE(BEXTRI)
35171 NODE_NAME_CASE(BZHI)
35172 NODE_NAME_CASE(PDEP)
35173 NODE_NAME_CASE(PEXT)
35174 NODE_NAME_CASE(MUL_IMM)
35175 NODE_NAME_CASE(MOVMSK)
35176 NODE_NAME_CASE(PTEST)
35177 NODE_NAME_CASE(TESTP)
35178 NODE_NAME_CASE(KORTEST)
35179 NODE_NAME_CASE(KTEST)
35180 NODE_NAME_CASE(KADD)
35181 NODE_NAME_CASE(KSHIFTL)
35182 NODE_NAME_CASE(KSHIFTR)
35183 NODE_NAME_CASE(PACKSS)
35184 NODE_NAME_CASE(PACKUS)
35185 NODE_NAME_CASE(PALIGNR)
35186 NODE_NAME_CASE(VALIGN)
35187 NODE_NAME_CASE(VSHLD)
35188 NODE_NAME_CASE(VSHRD)
35189 NODE_NAME_CASE(VSHLDV)
35190 NODE_NAME_CASE(VSHRDV)
35191 NODE_NAME_CASE(PSHUFD)
35192 NODE_NAME_CASE(PSHUFHW)
35193 NODE_NAME_CASE(PSHUFLW)
35194 NODE_NAME_CASE(SHUFP)
35195 NODE_NAME_CASE(SHUF128)
35196 NODE_NAME_CASE(MOVLHPS)
35197 NODE_NAME_CASE(MOVHLPS)
35198 NODE_NAME_CASE(MOVDDUP)
35199 NODE_NAME_CASE(MOVSHDUP)
35200 NODE_NAME_CASE(MOVSLDUP)
35201 NODE_NAME_CASE(MOVSD)
35202 NODE_NAME_CASE(MOVSS)
35203 NODE_NAME_CASE(MOVSH)
35204 NODE_NAME_CASE(UNPCKL)
35205 NODE_NAME_CASE(UNPCKH)
35206 NODE_NAME_CASE(VBROADCAST)
35207 NODE_NAME_CASE(VBROADCAST_LOAD)
35208 NODE_NAME_CASE(VBROADCASTM)
35209 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35210 NODE_NAME_CASE(VPERMILPV)
35211 NODE_NAME_CASE(VPERMILPI)
35212 NODE_NAME_CASE(VPERM2X128)
35213 NODE_NAME_CASE(VPERMV)
35214 NODE_NAME_CASE(VPERMV3)
35215 NODE_NAME_CASE(VPERMI)
35216 NODE_NAME_CASE(VPTERNLOG)
35217 NODE_NAME_CASE(VFIXUPIMM)
35218 NODE_NAME_CASE(VFIXUPIMM_SAE)
35219 NODE_NAME_CASE(VFIXUPIMMS)
35220 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35221 NODE_NAME_CASE(VRANGE)
35222 NODE_NAME_CASE(VRANGE_SAE)
35223 NODE_NAME_CASE(VRANGES)
35224 NODE_NAME_CASE(VRANGES_SAE)
35225 NODE_NAME_CASE(PMULUDQ)
35226 NODE_NAME_CASE(PMULDQ)
35227 NODE_NAME_CASE(PSADBW)
35228 NODE_NAME_CASE(DBPSADBW)
35229 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35230 NODE_NAME_CASE(VAARG_64)
35231 NODE_NAME_CASE(VAARG_X32)
35232 NODE_NAME_CASE(DYN_ALLOCA)
35233 NODE_NAME_CASE(MFENCE)
35234 NODE_NAME_CASE(SEG_ALLOCA)
35235 NODE_NAME_CASE(PROBED_ALLOCA)
35236 NODE_NAME_CASE(RDRAND)
35237 NODE_NAME_CASE(RDSEED)
35238 NODE_NAME_CASE(RDPKRU)
35239 NODE_NAME_CASE(WRPKRU)
35240 NODE_NAME_CASE(VPMADDUBSW)
35241 NODE_NAME_CASE(VPMADDWD)
35242 NODE_NAME_CASE(VPSHA)
35243 NODE_NAME_CASE(VPSHL)
35244 NODE_NAME_CASE(VPCOM)
35245 NODE_NAME_CASE(VPCOMU)
35246 NODE_NAME_CASE(VPERMIL2)
35247 NODE_NAME_CASE(FMSUB)
35248 NODE_NAME_CASE(STRICT_FMSUB)
35249 NODE_NAME_CASE(FNMADD)
35250 NODE_NAME_CASE(STRICT_FNMADD)
35251 NODE_NAME_CASE(FNMSUB)
35252 NODE_NAME_CASE(STRICT_FNMSUB)
35253 NODE_NAME_CASE(FMADDSUB)
35254 NODE_NAME_CASE(FMSUBADD)
35255 NODE_NAME_CASE(FMADD_RND)
35256 NODE_NAME_CASE(FNMADD_RND)
35257 NODE_NAME_CASE(FMSUB_RND)
35258 NODE_NAME_CASE(FNMSUB_RND)
35259 NODE_NAME_CASE(FMADDSUB_RND)
35260 NODE_NAME_CASE(FMSUBADD_RND)
35261 NODE_NAME_CASE(VFMADDC)
35262 NODE_NAME_CASE(VFMADDC_RND)
35263 NODE_NAME_CASE(VFCMADDC)
35264 NODE_NAME_CASE(VFCMADDC_RND)
35265 NODE_NAME_CASE(VFMULC)
35266 NODE_NAME_CASE(VFMULC_RND)
35267 NODE_NAME_CASE(VFCMULC)
35268 NODE_NAME_CASE(VFCMULC_RND)
35269 NODE_NAME_CASE(VFMULCSH)
35270 NODE_NAME_CASE(VFMULCSH_RND)
35271 NODE_NAME_CASE(VFCMULCSH)
35272 NODE_NAME_CASE(VFCMULCSH_RND)
35273 NODE_NAME_CASE(VFMADDCSH)
35274 NODE_NAME_CASE(VFMADDCSH_RND)
35275 NODE_NAME_CASE(VFCMADDCSH)
35276 NODE_NAME_CASE(VFCMADDCSH_RND)
35277 NODE_NAME_CASE(VPMADD52H)
35278 NODE_NAME_CASE(VPMADD52L)
35279 NODE_NAME_CASE(VRNDSCALE)
35280 NODE_NAME_CASE(STRICT_VRNDSCALE)
35281 NODE_NAME_CASE(VRNDSCALE_SAE)
35282 NODE_NAME_CASE(VRNDSCALES)
35283 NODE_NAME_CASE(VRNDSCALES_SAE)
35284 NODE_NAME_CASE(VREDUCE)
35285 NODE_NAME_CASE(VREDUCE_SAE)
35286 NODE_NAME_CASE(VREDUCES)
35287 NODE_NAME_CASE(VREDUCES_SAE)
35288 NODE_NAME_CASE(VGETMANT)
35289 NODE_NAME_CASE(VGETMANT_SAE)
35290 NODE_NAME_CASE(VGETMANTS)
35291 NODE_NAME_CASE(VGETMANTS_SAE)
35292 NODE_NAME_CASE(PCMPESTR)
35293 NODE_NAME_CASE(PCMPISTR)
35294 NODE_NAME_CASE(XTEST)
35295 NODE_NAME_CASE(COMPRESS)
35296 NODE_NAME_CASE(EXPAND)
35297 NODE_NAME_CASE(SELECTS)
35298 NODE_NAME_CASE(ADDSUB)
35299 NODE_NAME_CASE(RCP14)
35300 NODE_NAME_CASE(RCP14S)
35301 NODE_NAME_CASE(RCP28)
35302 NODE_NAME_CASE(RCP28_SAE)
35303 NODE_NAME_CASE(RCP28S)
35304 NODE_NAME_CASE(RCP28S_SAE)
35305 NODE_NAME_CASE(EXP2)
35306 NODE_NAME_CASE(EXP2_SAE)
35307 NODE_NAME_CASE(RSQRT14)
35308 NODE_NAME_CASE(RSQRT14S)
35309 NODE_NAME_CASE(RSQRT28)
35310 NODE_NAME_CASE(RSQRT28_SAE)
35311 NODE_NAME_CASE(RSQRT28S)
35312 NODE_NAME_CASE(RSQRT28S_SAE)
35313 NODE_NAME_CASE(FADD_RND)
35314 NODE_NAME_CASE(FADDS)
35315 NODE_NAME_CASE(FADDS_RND)
35316 NODE_NAME_CASE(FSUB_RND)
35317 NODE_NAME_CASE(FSUBS)
35318 NODE_NAME_CASE(FSUBS_RND)
35319 NODE_NAME_CASE(FMUL_RND)
35320 NODE_NAME_CASE(FMULS)
35321 NODE_NAME_CASE(FMULS_RND)
35322 NODE_NAME_CASE(FDIV_RND)
35323 NODE_NAME_CASE(FDIVS)
35324 NODE_NAME_CASE(FDIVS_RND)
35325 NODE_NAME_CASE(FSQRT_RND)
35326 NODE_NAME_CASE(FSQRTS)
35327 NODE_NAME_CASE(FSQRTS_RND)
35328 NODE_NAME_CASE(FGETEXP)
35329 NODE_NAME_CASE(FGETEXP_SAE)
35330 NODE_NAME_CASE(FGETEXPS)
35331 NODE_NAME_CASE(FGETEXPS_SAE)
35332 NODE_NAME_CASE(SCALEF)
35333 NODE_NAME_CASE(SCALEF_RND)
35334 NODE_NAME_CASE(SCALEFS)
35335 NODE_NAME_CASE(SCALEFS_RND)
35336 NODE_NAME_CASE(MULHRS)
35337 NODE_NAME_CASE(SINT_TO_FP_RND)
35338 NODE_NAME_CASE(UINT_TO_FP_RND)
35339 NODE_NAME_CASE(CVTTP2SI)
35340 NODE_NAME_CASE(CVTTP2UI)
35341 NODE_NAME_CASE(STRICT_CVTTP2SI)
35342 NODE_NAME_CASE(STRICT_CVTTP2UI)
35343 NODE_NAME_CASE(MCVTTP2SI)
35344 NODE_NAME_CASE(MCVTTP2UI)
35345 NODE_NAME_CASE(CVTTP2SI_SAE)
35346 NODE_NAME_CASE(CVTTP2UI_SAE)
35347 NODE_NAME_CASE(CVTTS2SI)
35348 NODE_NAME_CASE(CVTTS2UI)
35349 NODE_NAME_CASE(CVTTS2SI_SAE)
35350 NODE_NAME_CASE(CVTTS2UI_SAE)
35351 NODE_NAME_CASE(CVTSI2P)
35352 NODE_NAME_CASE(CVTUI2P)
35353 NODE_NAME_CASE(STRICT_CVTSI2P)
35354 NODE_NAME_CASE(STRICT_CVTUI2P)
35355 NODE_NAME_CASE(MCVTSI2P)
35356 NODE_NAME_CASE(MCVTUI2P)
35357 NODE_NAME_CASE(VFPCLASS)
35358 NODE_NAME_CASE(VFPCLASSS)
35359 NODE_NAME_CASE(MULTISHIFT)
35360 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35361 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35362 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35363 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35364 NODE_NAME_CASE(CVTPS2PH)
35365 NODE_NAME_CASE(STRICT_CVTPS2PH)
35366 NODE_NAME_CASE(CVTPS2PH_SAE)
35367 NODE_NAME_CASE(MCVTPS2PH)
35368 NODE_NAME_CASE(MCVTPS2PH_SAE)
35369 NODE_NAME_CASE(CVTPH2PS)
35370 NODE_NAME_CASE(STRICT_CVTPH2PS)
35371 NODE_NAME_CASE(CVTPH2PS_SAE)
35372 NODE_NAME_CASE(CVTP2SI)
35373 NODE_NAME_CASE(CVTP2UI)
35374 NODE_NAME_CASE(MCVTP2SI)
35375 NODE_NAME_CASE(MCVTP2UI)
35376 NODE_NAME_CASE(CVTP2SI_RND)
35377 NODE_NAME_CASE(CVTP2UI_RND)
35378 NODE_NAME_CASE(CVTS2SI)
35379 NODE_NAME_CASE(CVTS2UI)
35380 NODE_NAME_CASE(CVTS2SI_RND)
35381 NODE_NAME_CASE(CVTS2UI_RND)
35382 NODE_NAME_CASE(CVTNE2PS2BF16)
35383 NODE_NAME_CASE(CVTNEPS2BF16)
35384 NODE_NAME_CASE(MCVTNEPS2BF16)
35385 NODE_NAME_CASE(DPBF16PS)
35386 NODE_NAME_CASE(LWPINS)
35387 NODE_NAME_CASE(MGATHER)
35388 NODE_NAME_CASE(MSCATTER)
35389 NODE_NAME_CASE(VPDPBUSD)
35390 NODE_NAME_CASE(VPDPBUSDS)
35391 NODE_NAME_CASE(VPDPWSSD)
35392 NODE_NAME_CASE(VPDPWSSDS)
35393 NODE_NAME_CASE(VPSHUFBITQMB)
35394 NODE_NAME_CASE(GF2P8MULB)
35395 NODE_NAME_CASE(GF2P8AFFINEQB)
35396 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35397 NODE_NAME_CASE(NT_CALL)
35398 NODE_NAME_CASE(NT_BRIND)
35399 NODE_NAME_CASE(UMWAIT)
35400 NODE_NAME_CASE(TPAUSE)
35401 NODE_NAME_CASE(ENQCMD)
35402 NODE_NAME_CASE(ENQCMDS)
35403 NODE_NAME_CASE(VP2INTERSECT)
35404 NODE_NAME_CASE(VPDPBSUD)
35405 NODE_NAME_CASE(VPDPBSUDS)
35406 NODE_NAME_CASE(VPDPBUUD)
35407 NODE_NAME_CASE(VPDPBUUDS)
35408 NODE_NAME_CASE(VPDPBSSD)
35409 NODE_NAME_CASE(VPDPBSSDS)
35410 NODE_NAME_CASE(AESENC128KL)
35411 NODE_NAME_CASE(AESDEC128KL)
35412 NODE_NAME_CASE(AESENC256KL)
35413 NODE_NAME_CASE(AESDEC256KL)
35414 NODE_NAME_CASE(AESENCWIDE128KL)
35415 NODE_NAME_CASE(AESDECWIDE128KL)
35416 NODE_NAME_CASE(AESENCWIDE256KL)
35417 NODE_NAME_CASE(AESDECWIDE256KL)
35418 NODE_NAME_CASE(CMPCCXADD)
35419 NODE_NAME_CASE(TESTUI)
35420 NODE_NAME_CASE(FP80_ADD)
35421 NODE_NAME_CASE(STRICT_FP80_ADD)
35422 }
35423 return nullptr;
35424#undef NODE_NAME_CASE
35425}
35426
35427/// Return true if the addressing mode represented by AM is legal for this
35428/// target, for a load/store of the specified type.
35429bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35430 const AddrMode &AM, Type *Ty,
35431 unsigned AS,
35432 Instruction *I) const {
35433 // X86 supports extremely general addressing modes.
35434 CodeModel::Model M = getTargetMachine().getCodeModel();
35435
35436 // X86 allows a sign-extended 32-bit immediate field as a displacement.
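  // (For example, [reg + reg*scale + imm32] is directly encodable, but a full
  // 64-bit displacement is not, outside of a few special absolute-move forms.)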
35437 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35438 return false;
35439
35440 if (AM.BaseGV) {
35441 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35442
35443 // If a reference to this global requires an extra load, we can't fold it.
35444 if (isGlobalStubReference(GVFlags))
35445 return false;
35446
35447 // If BaseGV requires a register for the PIC base, we cannot also have a
35448 // BaseReg specified.
35449 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35450 return false;
35451
35452 // If lower 4G is not available, then we must use rip-relative addressing.
35453 if ((M != CodeModel::Small || isPositionIndependent()) &&
35454 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35455 return false;
35456 }
35457
35458 switch (AM.Scale) {
35459 case 0:
35460 case 1:
35461 case 2:
35462 case 4:
35463 case 8:
35464 // These scales always work.
35465 break;
35466 case 3:
35467 case 5:
35468 case 9:
35469 // These scales are formed with basereg+scalereg. Only accept if there is
35470 // no basereg yet.
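    // For example, x*9 can be encoded as "lea (%rax,%rax,8), %rcx", which
    // uses the base-register slot for the second copy of x.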
35471 if (AM.HasBaseReg)
35472 return false;
35473 break;
35474 default: // Other stuff never works.
35475 return false;
35476 }
35477
35478 return true;
35479}
35480
35481bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35482 unsigned Bits = Ty->getScalarSizeInBits();
35483
35484 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35485 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35486 if (Subtarget.hasXOP() &&
35487 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35488 return false;
35489
35490 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35491 // shifts just as cheap as scalar ones.
35492 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35493 return false;
35494
35495 // AVX512BW has shifts such as vpsllvw.
35496 if (Subtarget.hasBWI() && Bits == 16)
35497 return false;
35498
35499 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35500 // fully general vector.
35501 return true;
35502}
35503
35504bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35505 switch (Opcode) {
35506 // These are non-commutative binops.
35507 // TODO: Add more X86ISD opcodes once we have test coverage.
35508 case X86ISD::ANDNP:
35509 case X86ISD::PCMPGT:
35510 case X86ISD::FMAX:
35511 case X86ISD::FMIN:
35512 case X86ISD::FANDN:
35513 case X86ISD::VPSHA:
35514 case X86ISD::VPSHL:
35515 case X86ISD::VSHLV:
35516 case X86ISD::VSRLV:
35517 case X86ISD::VSRAV:
35518 return true;
35519 }
35520
35521 return TargetLoweringBase::isBinOp(Opcode);
35522}
35523
35524bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35525 switch (Opcode) {
35526 // TODO: Add more X86ISD opcodes once we have test coverage.
35527 case X86ISD::PCMPEQ:
35528 case X86ISD::PMULDQ:
35529 case X86ISD::PMULUDQ:
35530 case X86ISD::FMAXC:
35531 case X86ISD::FMINC:
35532 case X86ISD::FAND:
35533 case X86ISD::FOR:
35534 case X86ISD::FXOR:
35535 return true;
35536 }
35537
35538 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35539}
35540
35541bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35542 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35543 return false;
35544 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35545 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35546 return NumBits1 > NumBits2;
35547}
35548
35549bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35550 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35551 return false;
35552
35553 if (!isTypeLegal(EVT::getEVT(Ty1)))
35554 return false;
35555
35556  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35557
35558 // Assuming the caller doesn't have a zeroext or signext return parameter,
35559 // truncation all the way down to i1 is valid.
35560 return true;
35561}
35562
35563bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35564 return isInt<32>(Imm);
35565}
35566
35567bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35568 // Can also use sub to handle negated immediates.
35569 return isInt<32>(Imm);
35570}
35571
35572bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35573 return isInt<32>(Imm);
35574}
35575
35576bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35577 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35578 return false;
35579 unsigned NumBits1 = VT1.getSizeInBits();
35580 unsigned NumBits2 = VT2.getSizeInBits();
35581 return NumBits1 > NumBits2;
35582}
35583
35584bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35585 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
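  // For example, "movl %esi, %eax" already clears bits 63:32 of RAX, so a
  // following i32->i64 zext needs no extra instruction.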
35586 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35587}
35588
35589bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35590 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35591 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35592}
35593
35594bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35595 EVT VT1 = Val.getValueType();
35596 if (isZExtFree(VT1, VT2))
35597 return true;
35598
35599 if (Val.getOpcode() != ISD::LOAD)
35600 return false;
35601
35602 if (!VT1.isSimple() || !VT1.isInteger() ||
35603 !VT2.isSimple() || !VT2.isInteger())
35604 return false;
35605
35606 switch (VT1.getSimpleVT().SimpleTy) {
35607 default: break;
35608 case MVT::i8:
35609 case MVT::i16:
35610 case MVT::i32:
35611 // X86 has 8, 16, and 32-bit zero-extending loads.
35612 return true;
35613 }
35614
35615 return false;
35616}
35617
35618bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35619 SmallVectorImpl<Use *> &Ops) const {
35620 using namespace llvm::PatternMatch;
35621
35622 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35623 if (!VTy)
35624 return false;
35625
35626 if (I->getOpcode() == Instruction::Mul &&
35627 VTy->getElementType()->isIntegerTy(64)) {
35628 for (auto &Op : I->operands()) {
35629 // Make sure we are not already sinking this operand
35630 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35631 continue;
35632
35633 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35634 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
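      // Illustrative element-wise IR shapes (value names hypothetical):
      //   sext_inreg: ashr (shl %x, 32), 32 -> forms PMULDQ (SSE4.1)
      //   zext_inreg: and %x, 0xffffffff    -> forms PMULUDQ (SSE2)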
35635 if (Subtarget.hasSSE41() &&
35636 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35637 m_SpecificInt(32)))) {
35638 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35639 Ops.push_back(&Op);
35640 } else if (Subtarget.hasSSE2() &&
35641 match(Op.get(),
35642                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35643 Ops.push_back(&Op);
35644 }
35645 }
35646
35647 return !Ops.empty();
35648 }
35649
35650 // A uniform shift amount in a vector shift or funnel shift may be much
35651 // cheaper than a generic variable vector shift, so make that pattern visible
35652 // to SDAG by sinking the shuffle instruction next to the shift.
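  // For example, a shift of <4 x i32> by a splatted amount can be selected as
  // PSLLD with the scalar count in an XMM register rather than a fully
  // variable VPSLLVD.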
35653 int ShiftAmountOpNum = -1;
35654 if (I->isShift())
35655 ShiftAmountOpNum = 1;
35656 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35657 if (II->getIntrinsicID() == Intrinsic::fshl ||
35658 II->getIntrinsicID() == Intrinsic::fshr)
35659 ShiftAmountOpNum = 2;
35660 }
35661
35662 if (ShiftAmountOpNum == -1)
35663 return false;
35664
35665 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35666 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35667 isVectorShiftByScalarCheap(I->getType())) {
35668 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35669 return true;
35670 }
35671
35672 return false;
35673}
35674
35675bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35676 if (!Subtarget.is64Bit())
35677 return false;
35678 return TargetLowering::shouldConvertPhiType(From, To);
35679}
35680
35681bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35682 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35683 return false;
35684
35685 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35686
35687 // There is no extending load for vXi1.
35688 if (SrcVT.getScalarType() == MVT::i1)
35689 return false;
35690
35691 return true;
35692}
35693
35694bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35695 EVT VT) const {
35696 if (!Subtarget.hasAnyFMA())
35697 return false;
35698
35699 VT = VT.getScalarType();
35700
35701 if (!VT.isSimple())
35702 return false;
35703
35704 switch (VT.getSimpleVT().SimpleTy) {
35705 case MVT::f16:
35706 return Subtarget.hasFP16();
35707 case MVT::f32:
35708 case MVT::f64:
35709 return true;
35710 default:
35711 break;
35712 }
35713
35714 return false;
35715}
35716
35717bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35718 // i16 instructions are longer (0x66 prefix) and potentially slower.
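  // For example, "add $1, %ax" needs a 0x66 operand-size prefix and its 16-bit
  // immediate can trigger length-changing-prefix stalls on some Intel cores.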
35719 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35720}
35721
35722bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35723 EVT VT) const {
35724 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35725 // benefit. The transform may also be profitable for scalar code.
35726 if (!Subtarget.hasAVX512())
35727 return false;
35728 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35729 return false;
35730 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35731 return false;
35732
35733 return true;
35734}
35735
35736/// Targets can use this to indicate that they only support *some*
35737/// VECTOR_SHUFFLE operations, those with specific masks.
35738/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35739/// are assumed to be legal.
35740bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35741 if (!VT.isSimple())
35742 return false;
35743
35744 // Not for i1 vectors
35745 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35746 return false;
35747
35748 // Very little shuffling can be done for 64-bit vectors right now.
35749 if (VT.getSimpleVT().getSizeInBits() == 64)
35750 return false;
35751
35752 // We only care that the types being shuffled are legal. The lowering can
35753 // handle any possible shuffle mask that results.
35754 return isTypeLegal(VT.getSimpleVT());
35755}
35756
35757bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35758 EVT VT) const {
35759 // Don't convert an 'and' into a shuffle that we don't directly support.
35760 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35761 if (!Subtarget.hasAVX2())
35762 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35763 return false;
35764
35765 // Just delegate to the generic legality, clear masks aren't special.
35766 return isShuffleMaskLegal(Mask, VT);
35767}
35768
35769bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35770 // If the subtarget is using thunks, we need to not generate jump tables.
35771 if (Subtarget.useIndirectThunkBranches())
35772 return false;
35773
35774  // Otherwise, fall back on the generic logic.
35775 return TargetLowering::areJTsAllowed(Fn);
35776}
35777
35778MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35779 EVT ConditionVT) const {
35780 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35781 // zero-extensions.
35782 if (ConditionVT.getSizeInBits() < 32)
35783 return MVT::i32;
35784 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35785 ConditionVT);
35786}
35787
35788//===----------------------------------------------------------------------===//
35789// X86 Scheduler Hooks
35790//===----------------------------------------------------------------------===//
35791
35792// Returns true if EFLAGS is consumed after this iterator in the rest of the
35793// basic block or any successors of the basic block.
35794static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35795 MachineBasicBlock *BB) {
35796 // Scan forward through BB for a use/def of EFLAGS.
35797 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35798 if (mi.readsRegister(X86::EFLAGS))
35799 return true;
35800 // If we found a def, we can stop searching.
35801 if (mi.definesRegister(X86::EFLAGS))
35802 return false;
35803 }
35804
35805 // If we hit the end of the block, check whether EFLAGS is live into a
35806 // successor.
35807 for (MachineBasicBlock *Succ : BB->successors())
35808 if (Succ->isLiveIn(X86::EFLAGS))
35809 return true;
35810
35811 return false;
35812}
35813
35814/// Utility function to emit xbegin specifying the start of an RTM region.
35815static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35816 const TargetInstrInfo *TII) {
35817 const DebugLoc &DL = MI.getDebugLoc();
35818
35819 const BasicBlock *BB = MBB->getBasicBlock();
35820 MachineFunction::iterator I = ++MBB->getIterator();
35821
35822 // For the v = xbegin(), we generate
35823 //
35824 // thisMBB:
35825 // xbegin sinkMBB
35826 //
35827 // mainMBB:
35828 // s0 = -1
35829 //
35830 // fallBB:
35831 // eax = # XABORT_DEF
35832 // s1 = eax
35833 //
35834 // sinkMBB:
35835 // v = phi(s0/mainBB, s1/fallBB)
35836
35837 MachineBasicBlock *thisMBB = MBB;
35838 MachineFunction *MF = MBB->getParent();
35839 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35840 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35841 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35842 MF->insert(I, mainMBB);
35843 MF->insert(I, fallMBB);
35844 MF->insert(I, sinkMBB);
35845
35846 if (isEFLAGSLiveAfter(MI, MBB)) {
35847 mainMBB->addLiveIn(X86::EFLAGS);
35848 fallMBB->addLiveIn(X86::EFLAGS);
35849 sinkMBB->addLiveIn(X86::EFLAGS);
35850 }
35851
35852 // Transfer the remainder of BB and its successor edges to sinkMBB.
35853 sinkMBB->splice(sinkMBB->begin(), MBB,
35854 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35855 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35856
35857 MachineRegisterInfo &MRI = MF->getRegInfo();
35858 Register DstReg = MI.getOperand(0).getReg();
35859 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35860 Register mainDstReg = MRI.createVirtualRegister(RC);
35861 Register fallDstReg = MRI.createVirtualRegister(RC);
35862
35863 // thisMBB:
35864 // xbegin fallMBB
35865 // # fallthrough to mainMBB
35866  // # abort goes to fallMBB
35867 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35868 thisMBB->addSuccessor(mainMBB);
35869 thisMBB->addSuccessor(fallMBB);
35870
35871 // mainMBB:
35872 // mainDstReg := -1
35873 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35874 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35875 mainMBB->addSuccessor(sinkMBB);
35876
35877 // fallMBB:
35878 // ; pseudo instruction to model hardware's definition from XABORT
35879 // EAX := XABORT_DEF
35880 // fallDstReg := EAX
35881 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35882 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35883 .addReg(X86::EAX);
35884 fallMBB->addSuccessor(sinkMBB);
35885
35886 // sinkMBB:
35887 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35888 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35889 .addReg(mainDstReg).addMBB(mainMBB)
35890 .addReg(fallDstReg).addMBB(fallMBB);
35891
35892 MI.eraseFromParent();
35893 return sinkMBB;
35894}
35895
35896MachineBasicBlock *
35897X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35898 MachineBasicBlock *MBB) const {
35899 // Emit va_arg instruction on X86-64.
35900
35901 // Operands to this pseudo-instruction:
35902 // 0 ) Output : destination address (reg)
35903 // 1-5) Input : va_list address (addr, i64mem)
35904 // 6 ) ArgSize : Size (in bytes) of vararg type
35905 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35906 // 8 ) Align : Alignment of type
35907 // 9 ) EFLAGS (implicit-def)
35908
35909  assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35910 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35911
35912 Register DestReg = MI.getOperand(0).getReg();
35913 MachineOperand &Base = MI.getOperand(1);
35914 MachineOperand &Scale = MI.getOperand(2);
35915 MachineOperand &Index = MI.getOperand(3);
35916 MachineOperand &Disp = MI.getOperand(4);
35917 MachineOperand &Segment = MI.getOperand(5);
35918 unsigned ArgSize = MI.getOperand(6).getImm();
35919 unsigned ArgMode = MI.getOperand(7).getImm();
35920 Align Alignment = Align(MI.getOperand(8).getImm());
35921
35922 MachineFunction *MF = MBB->getParent();
35923
35924 // Memory Reference
35925  assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35926
35927 MachineMemOperand *OldMMO = MI.memoperands().front();
35928
35929 // Clone the MMO into two separate MMOs for loading and storing
35930 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35931 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35932 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35933 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35934
35935 // Machine Information
35936 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35937 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35938 const TargetRegisterClass *AddrRegClass =
35939 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35940 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35941 const DebugLoc &DL = MI.getDebugLoc();
35942
35943 // struct va_list {
35944 // i32 gp_offset
35945 // i32 fp_offset
35946 // i64 overflow_area (address)
35947 // i64 reg_save_area (address)
35948 // }
35949 // sizeof(va_list) = 24
35950 // alignment(va_list) = 8
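  // In the SysV x86-64 ABI, gp_offset walks the six 8-byte GP register slots
  // (0..48) and fp_offset the eight 16-byte XMM slots (48..176); that is
  // where MaxOffset below comes from.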
35951
35952 unsigned TotalNumIntRegs = 6;
35953 unsigned TotalNumXMMRegs = 8;
35954 bool UseGPOffset = (ArgMode == 1);
35955 bool UseFPOffset = (ArgMode == 2);
35956 unsigned MaxOffset = TotalNumIntRegs * 8 +
35957 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35958
35959 /* Align ArgSize to a multiple of 8 */
35960 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
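  // e.g. ArgSize 12 rounds up to ArgSizeA8 16, ArgSize 8 stays 8.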
35961 bool NeedsAlign = (Alignment > 8);
35962
35963 MachineBasicBlock *thisMBB = MBB;
35964 MachineBasicBlock *overflowMBB;
35965 MachineBasicBlock *offsetMBB;
35966 MachineBasicBlock *endMBB;
35967
35968 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35969 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35970 unsigned OffsetReg = 0;
35971
35972 if (!UseGPOffset && !UseFPOffset) {
35973 // If we only pull from the overflow region, we don't create a branch.
35974 // We don't need to alter control flow.
35975 OffsetDestReg = 0; // unused
35976 OverflowDestReg = DestReg;
35977
35978 offsetMBB = nullptr;
35979 overflowMBB = thisMBB;
35980 endMBB = thisMBB;
35981 } else {
35982 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35983 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35984 // If not, pull from overflow_area. (branch to overflowMBB)
35985 //
35986 // thisMBB
35987 // | .
35988 // | .
35989 // offsetMBB overflowMBB
35990 // | .
35991 // | .
35992 // endMBB
35993
35994 // Registers for the PHI in endMBB
35995 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35996 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35997
35998 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35999 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36000 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36001 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36002
36003 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36004
36005 // Insert the new basic blocks
36006 MF->insert(MBBIter, offsetMBB);
36007 MF->insert(MBBIter, overflowMBB);
36008 MF->insert(MBBIter, endMBB);
36009
36010 // Transfer the remainder of MBB and its successor edges to endMBB.
36011 endMBB->splice(endMBB->begin(), thisMBB,
36012 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36013 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36014
36015 // Make offsetMBB and overflowMBB successors of thisMBB
36016 thisMBB->addSuccessor(offsetMBB);
36017 thisMBB->addSuccessor(overflowMBB);
36018
36019 // endMBB is a successor of both offsetMBB and overflowMBB
36020 offsetMBB->addSuccessor(endMBB);
36021 overflowMBB->addSuccessor(endMBB);
36022
36023 // Load the offset value into a register
36024 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36025 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36026 .add(Base)
36027 .add(Scale)
36028 .add(Index)
36029 .addDisp(Disp, UseFPOffset ? 4 : 0)
36030 .add(Segment)
36031 .setMemRefs(LoadOnlyMMO);
36032
36033 // Check if there is enough room left to pull this argument.
36034 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36035 .addReg(OffsetReg)
36036 .addImm(MaxOffset + 8 - ArgSizeA8);
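// Because OffsetReg, ArgSizeA8 and MaxOffset are all multiples of 8, the
// check "OffsetReg >= MaxOffset + 8 - ArgSizeA8" used here is equivalent to
// "OffsetReg + ArgSizeA8 > MaxOffset", i.e. the argument would no longer fit
// in the register save area.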
36037
36038 // Branch to "overflowMBB" if offset >= max
36039 // Fall through to "offsetMBB" otherwise
36040 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36041 .addMBB(overflowMBB).addImm(X86::COND_AE);
36042 }
36043
36044 // In offsetMBB, emit code to use the reg_save_area.
36045 if (offsetMBB) {
36046 assert(OffsetReg != 0);
36047
36048 // Read the reg_save_area address.
36049 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36050 BuildMI(
36051 offsetMBB, DL,
36052 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36053 RegSaveReg)
36054 .add(Base)
36055 .add(Scale)
36056 .add(Index)
36057 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36058 .add(Segment)
36059 .setMemRefs(LoadOnlyMMO);
36060
36061 if (Subtarget.isTarget64BitLP64()) {
36062 // Zero-extend the offset
36063 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36064 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36065 .addImm(0)
36066 .addReg(OffsetReg)
36067 .addImm(X86::sub_32bit);
36068
36069 // Add the offset to the reg_save_area to get the final address.
36070 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36071 .addReg(OffsetReg64)
36072 .addReg(RegSaveReg);
36073 } else {
36074 // Add the offset to the reg_save_area to get the final address.
36075 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36076 .addReg(OffsetReg)
36077 .addReg(RegSaveReg);
36078 }
36079
36080 // Compute the offset for the next argument
36081 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36082 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36083 .addReg(OffsetReg)
36084 .addImm(UseFPOffset ? 16 : 8);
36085
36086 // Store it back into the va_list.
36087 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36088 .add(Base)
36089 .add(Scale)
36090 .add(Index)
36091 .addDisp(Disp, UseFPOffset ? 4 : 0)
36092 .add(Segment)
36093 .addReg(NextOffsetReg)
36094 .setMemRefs(StoreOnlyMMO);
36095
36096 // Jump to endMBB
36097 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36098 .addMBB(endMBB);
36099 }
36100
36101 //
36102 // Emit code to use overflow area
36103 //
36104
36105 // Load the overflow_area address into a register.
36106 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36107 BuildMI(overflowMBB, DL,
36108 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36109 OverflowAddrReg)
36110 .add(Base)
36111 .add(Scale)
36112 .add(Index)
36113 .addDisp(Disp, 8)
36114 .add(Segment)
36115 .setMemRefs(LoadOnlyMMO);
36116
36117 // If we need to align it, do so. Otherwise, just copy the address
36118 // to OverflowDestReg.
36119 if (NeedsAlign) {
36120 // Align the overflow address
36121 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36122
36123 // aligned_addr = (addr + (align-1)) & ~(align-1)
36124 BuildMI(
36125 overflowMBB, DL,
36126 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36127 TmpReg)
36128 .addReg(OverflowAddrReg)
36129 .addImm(Alignment.value() - 1);
36130
36131 BuildMI(
36132 overflowMBB, DL,
36133 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36134 OverflowDestReg)
36135 .addReg(TmpReg)
36136 .addImm(~(uint64_t)(Alignment.value() - 1));
36137 } else {
36138 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36139 .addReg(OverflowAddrReg);
36140 }
36141
36142 // Compute the next overflow address after this argument.
36143 // (the overflow address should be kept 8-byte aligned)
36144 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36145 BuildMI(
36146 overflowMBB, DL,
36147 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36148 NextAddrReg)
36149 .addReg(OverflowDestReg)
36150 .addImm(ArgSizeA8);
36151
36152 // Store the new overflow address.
36153 BuildMI(overflowMBB, DL,
36154 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36155 .add(Base)
36156 .add(Scale)
36157 .add(Index)
36158 .addDisp(Disp, 8)
36159 .add(Segment)
36160 .addReg(NextAddrReg)
36161 .setMemRefs(StoreOnlyMMO);
36162
36163 // If we branched, emit the PHI to the front of endMBB.
36164 if (offsetMBB) {
36165 BuildMI(*endMBB, endMBB->begin(), DL,
36166 TII->get(X86::PHI), DestReg)
36167 .addReg(OffsetDestReg).addMBB(offsetMBB)
36168 .addReg(OverflowDestReg).addMBB(overflowMBB);
36169 }
36170
36171 // Erase the pseudo instruction
36172 MI.eraseFromParent();
36173
36174 return endMBB;
36175}
36176
36177// The EFLAGS operand of SelectItr might be missing a kill marker
36178// because there were multiple uses of EFLAGS, and ISel didn't know
36179// which to mark. Figure out whether SelectItr should have had a
36180// kill marker, and set it if it should. Returns the correct kill
36181// marker value.
36182static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36183 MachineBasicBlock* BB,
36184 const TargetRegisterInfo* TRI) {
36185 if (isEFLAGSLiveAfter(SelectItr, BB))
36186 return false;
36187
36188 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36189 // out. SelectMI should have a kill flag on EFLAGS.
36190 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36191 return true;
36192}
36193
36194// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36195 // together with other CMOV pseudo-opcodes into a single basic block with a
36196 // conditional jump around it.
36197static bool isCMOVPseudo(MachineInstr &MI) {
36198 switch (MI.getOpcode()) {
36199 case X86::CMOV_FR16:
36200 case X86::CMOV_FR16X:
36201 case X86::CMOV_FR32:
36202 case X86::CMOV_FR32X:
36203 case X86::CMOV_FR64:
36204 case X86::CMOV_FR64X:
36205 case X86::CMOV_GR8:
36206 case X86::CMOV_GR16:
36207 case X86::CMOV_GR32:
36208 case X86::CMOV_RFP32:
36209 case X86::CMOV_RFP64:
36210 case X86::CMOV_RFP80:
36211 case X86::CMOV_VR64:
36212 case X86::CMOV_VR128:
36213 case X86::CMOV_VR128X:
36214 case X86::CMOV_VR256:
36215 case X86::CMOV_VR256X:
36216 case X86::CMOV_VR512:
36217 case X86::CMOV_VK1:
36218 case X86::CMOV_VK2:
36219 case X86::CMOV_VK4:
36220 case X86::CMOV_VK8:
36221 case X86::CMOV_VK16:
36222 case X86::CMOV_VK32:
36223 case X86::CMOV_VK64:
36224 return true;
36225
36226 default:
36227 return false;
36228 }
36229}
36230
36231// Helper function, which inserts PHI functions into SinkMBB:
36232// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36233 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36234 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
36235 // the last PHI inserted.
36236static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36237 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36238 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36239 MachineBasicBlock *SinkMBB) {
36240 MachineFunction *MF = TrueMBB->getParent();
36241 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36242 const DebugLoc &DL = MIItBegin->getDebugLoc();
36243
36244 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36245 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36246
36247 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36248
36249 // As we are creating the PHIs, we have to be careful if there is more than
36250 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36251 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36252 // That also means that PHI construction must work forward from earlier to
36254 // later, and that the code must maintain a mapping from each earlier PHI's
36255 // destination register to the registers that went into that PHI.
36255 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36256 MachineInstrBuilder MIB;
36257
36258 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36259 Register DestReg = MIIt->getOperand(0).getReg();
36260 Register Op1Reg = MIIt->getOperand(1).getReg();
36261 Register Op2Reg = MIIt->getOperand(2).getReg();
36262
36263 // If this CMOV we are generating is the opposite condition from
36264 // the jump we generated, then we have to swap the operands for the
36265 // PHI that is going to be generated.
36266 if (MIIt->getOperand(3).getImm() == OppCC)
36267 std::swap(Op1Reg, Op2Reg);
36268
36269 if (RegRewriteTable.contains(Op1Reg))
36270 Op1Reg = RegRewriteTable[Op1Reg].first;
36271
36272 if (RegRewriteTable.contains(Op2Reg))
36273 Op2Reg = RegRewriteTable[Op2Reg].second;
36274
36275 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36276 .addReg(Op1Reg)
36277 .addMBB(FalseMBB)
36278 .addReg(Op2Reg)
36279 .addMBB(TrueMBB);
36280
36281 // Add this PHI to the rewrite table.
36282 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36283 }
36284
36285 return MIB;
36286}
36287
36288 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
36289MachineBasicBlock *
36290X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36291 MachineInstr &SecondCascadedCMOV,
36292 MachineBasicBlock *ThisMBB) const {
36293 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36294 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36295
36296 // We lower cascaded CMOVs such as
36297 //
36298 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36299 //
36300 // to two successive branches.
36301 //
36302 // Without this, we would add a PHI between the two jumps, which ends up
36303 // creating a few copies all around. For instance, for
36304 //
36305 // (sitofp (zext (fcmp une)))
36306 //
36307 // we would generate:
36308 //
36309 // ucomiss %xmm1, %xmm0
36310 // movss <1.0f>, %xmm0
36311 // movaps %xmm0, %xmm1
36312 // jne .LBB5_2
36313 // xorps %xmm1, %xmm1
36314 // .LBB5_2:
36315 // jp .LBB5_4
36316 // movaps %xmm1, %xmm0
36317 // .LBB5_4:
36318 // retq
36319 //
36320 // because this custom-inserter would have generated:
36321 //
36322 // A
36323 // | \
36324 // | B
36325 // | /
36326 // C
36327 // | \
36328 // | D
36329 // | /
36330 // E
36331 //
36332 // A: X = ...; Y = ...
36333 // B: empty
36334 // C: Z = PHI [X, A], [Y, B]
36335 // D: empty
36336 // E: PHI [X, C], [Z, D]
36337 //
36338 // If we lower both CMOVs in a single step, we can instead generate:
36339 //
36340 // A
36341 // | \
36342 // | C
36343 // | /|
36344 // |/ |
36345 // | |
36346 // | D
36347 // | /
36348 // E
36349 //
36350 // A: X = ...; Y = ...
36351 // D: empty
36352 // E: PHI [X, A], [X, C], [Y, D]
36353 //
36354 // Which, in our sitofp/fcmp example, gives us something like:
36355 //
36356 // ucomiss %xmm1, %xmm0
36357 // movss <1.0f>, %xmm0
36358 // jne .LBB5_4
36359 // jp .LBB5_4
36360 // xorps %xmm0, %xmm0
36361 // .LBB5_4:
36362 // retq
36363 //
36364
36365 // We lower cascaded CMOV into two successive branches to the same block.
36366 // EFLAGS is used by both, so mark it as live in the second.
36367 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36368 MachineFunction *F = ThisMBB->getParent();
36369 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36370 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36371 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36372
36373 MachineFunction::iterator It = ++ThisMBB->getIterator();
36374 F->insert(It, FirstInsertedMBB);
36375 F->insert(It, SecondInsertedMBB);
36376 F->insert(It, SinkMBB);
36377
36378 // For a cascaded CMOV, we lower it to two successive branches to
36379 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36380 // the FirstInsertedMBB.
36381 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36382
36383 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36384 // live into the sink and copy blocks.
36385 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36386 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36387 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36388 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36389 SinkMBB->addLiveIn(X86::EFLAGS);
36390 }
36391
36392 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36393 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36394 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36395 ThisMBB->end());
36396 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36397
36398 // Fallthrough block for ThisMBB.
36399 ThisMBB->addSuccessor(FirstInsertedMBB);
36400 // The true block target of the first branch is always SinkMBB.
36401 ThisMBB->addSuccessor(SinkMBB);
36402 // Fallthrough block for FirstInsertedMBB.
36403 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36404 // The true block for the branch of FirstInsertedMBB.
36405 FirstInsertedMBB->addSuccessor(SinkMBB);
36406 // This is fallthrough.
36407 SecondInsertedMBB->addSuccessor(SinkMBB);
36408
36409 // Create the conditional branch instructions.
36410 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36411 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36412
36413 X86::CondCode SecondCC =
36414 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36415 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36416
36417 // SinkMBB:
36418 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36419 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36420 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36421 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36422 MachineInstrBuilder MIB =
36423 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36424 .addReg(Op1Reg)
36425 .addMBB(SecondInsertedMBB)
36426 .addReg(Op2Reg)
36427 .addMBB(ThisMBB);
36428
36429 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36430 // (the True operand of the SELECT_CC/CMOV nodes).
36431 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36432
36433 // Now remove the CMOVs.
36434 FirstCMOV.eraseFromParent();
36435 SecondCascadedCMOV.eraseFromParent();
36436
36437 return SinkMBB;
36438}
36439
36440MachineBasicBlock *
36441X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36442 MachineBasicBlock *ThisMBB) const {
36443 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36444 const DebugLoc &DL = MI.getDebugLoc();
36445
36446 // To "insert" a SELECT_CC instruction, we actually have to insert the
36447 // diamond control-flow pattern. The incoming instruction knows the
36448 // destination vreg to set, the condition code register to branch on, the
36449 // true/false values to select between and a branch opcode to use.
36450
36451 // ThisMBB:
36452 // ...
36453 // TrueVal = ...
36454 // cmpTY ccX, r1, r2
36455 // bCC copy1MBB
36456 // fallthrough --> FalseMBB
36457
36458 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36459 // as described above, by inserting a BB, and then making a PHI at the join
36460 // point to select the true and false operands of the CMOV in the PHI.
36461 //
36462 // The code also handles two different cases of multiple CMOV opcodes
36463 // in a row.
36464 //
36465 // Case 1:
36466 // In this case, there are multiple CMOVs in a row, all of which are based on
36467 // the same condition setting (or the exact opposite condition setting).
36468 // In this case we can lower all the CMOVs using a single inserted BB, and
36469 // then make a number of PHIs at the join point to model the CMOVs. The only
36470 // trickiness here is that in a case like:
36471 //
36472 // t2 = CMOV cond1 t1, f1
36473 // t3 = CMOV cond1 t2, f2
36474 //
36475 // when rewriting this into PHIs, we have to perform some renaming on the
36476 // temps since you cannot have a PHI operand refer to a PHI result earlier
36477 // in the same block. The "simple" but wrong lowering would be:
36478 //
36479 // t2 = PHI t1(BB1), f1(BB2)
36480 // t3 = PHI t2(BB1), f2(BB2)
36481 //
36482 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36483 // renaming is to note that on the path through BB1, t2 is really just a
36484 // copy of t1, and do that renaming, properly generating:
36485 //
36486 // t2 = PHI t1(BB1), f1(BB2)
36487 // t3 = PHI t1(BB1), f2(BB2)
36488 //
36489 // Case 2:
36490 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36491 // function - EmitLoweredCascadedSelect.
36492
36493 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36494 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36495 MachineInstr *LastCMOV = &MI;
36496 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36497
36498 // Check first for case 1, where there are multiple CMOVs with the same
36499 // condition. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36500 // number of jumps the most.
36501
36502 if (isCMOVPseudo(MI)) {
36503 // See if we have a string of CMOVs with the same condition. Skip over
36504 // intervening debug insts.
36505 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36506 (NextMIIt->getOperand(3).getImm() == CC ||
36507 NextMIIt->getOperand(3).getImm() == OppCC)) {
36508 LastCMOV = &*NextMIIt;
36509 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36510 }
36511 }
36512
36513 // This checks for case 2, but only if we didn't already find case 1,
36514 // as indicated by LastCMOV == MI.
36515 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36516 NextMIIt->getOpcode() == MI.getOpcode() &&
36517 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36518 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36519 NextMIIt->getOperand(1).isKill()) {
36520 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36521 }
36522
36523 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36524 MachineFunction *F = ThisMBB->getParent();
36525 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36526 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36527
36528 MachineFunction::iterator It = ++ThisMBB->getIterator();
36529 F->insert(It, FalseMBB);
36530 F->insert(It, SinkMBB);
36531
36532 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36533 // live into the sink and copy blocks.
36534 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36535 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36536 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36537 FalseMBB->addLiveIn(X86::EFLAGS);
36538 SinkMBB->addLiveIn(X86::EFLAGS);
36539 }
36540
36541 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36542 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36543 MachineBasicBlock::iterator(LastCMOV));
36544 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36545 if (MI.isDebugInstr())
36546 SinkMBB->push_back(MI.removeFromParent());
36547
36548 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36549 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36550 std::next(MachineBasicBlock::iterator(LastCMOV)),
36551 ThisMBB->end());
36552 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36553
36554 // Fallthrough block for ThisMBB.
36555 ThisMBB->addSuccessor(FalseMBB);
36556 // The true block target of the first (or only) branch is always SinkMBB.
36557 ThisMBB->addSuccessor(SinkMBB);
36558 // Fallthrough block for FalseMBB.
36559 FalseMBB->addSuccessor(SinkMBB);
36560
36561 // Create the conditional branch instruction.
36562 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36563
36564 // SinkMBB:
36565 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36566 // ...
36567 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36568 MachineBasicBlock::iterator MIItEnd =
36569 std::next(MachineBasicBlock::iterator(LastCMOV));
36570 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36571
36572 // Now remove the CMOV(s).
36573 ThisMBB->erase(MIItBegin, MIItEnd);
36574
36575 return SinkMBB;
36576}
36577
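// Pick the smallest-encoding SUB-immediate opcode for the given immediate.
// For example, getSUBriOpcode(/*IsLP64=*/true, 16) yields X86::SUB64ri8,
// while an immediate of 4096 selects X86::SUB64ri32.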
36578static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36579 if (IsLP64) {
36580 if (isInt<8>(Imm))
36581 return X86::SUB64ri8;
36582 return X86::SUB64ri32;
36583 } else {
36584 if (isInt<8>(Imm))
36585 return X86::SUB32ri8;
36586 return X86::SUB32ri;
36587 }
36588}
36589
36590MachineBasicBlock *
36591X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36592 MachineBasicBlock *MBB) const {
36593 MachineFunction *MF = MBB->getParent();
36594 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36595 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36596 const DebugLoc &DL = MI.getDebugLoc();
36597 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36598
36599 const unsigned ProbeSize = getStackProbeSize(*MF);
36600
36601 MachineRegisterInfo &MRI = MF->getRegInfo();
36602 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36603 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36604 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36605
36606 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36607 MF->insert(MBBIter, testMBB);
36608 MF->insert(MBBIter, blockMBB);
36609 MF->insert(MBBIter, tailMBB);
36610
36611 Register sizeVReg = MI.getOperand(1).getReg();
36612
36613 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36614
36615 Register TmpStackPtr = MRI.createVirtualRegister(
36616 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36617 Register FinalStackPtr = MRI.createVirtualRegister(
36618 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36619
36620 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36621 .addReg(physSPReg);
36622 {
36623 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36624 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36625 .addReg(TmpStackPtr)
36626 .addReg(sizeVReg);
36627 }
36628
36629 // test rsp size
36630
36631 BuildMI(testMBB, DL,
36632 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36633 .addReg(FinalStackPtr)
36634 .addReg(physSPReg);
36635
36636 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36637 .addMBB(tailMBB)
36638 .addImm(X86::COND_GE);
36639 testMBB->addSuccessor(blockMBB);
36640 testMBB->addSuccessor(tailMBB);
36641
36642 // Touch the block, then extend it. This is the opposite order from a static
36643 // probe, where we allocate then touch; doing it this way avoids having to
36644 // probe the tail of the static alloca. Possible scenarios are:
36645 //
36646 // + ---- <- ------------ <- ------------- <- ------------ +
36647 // | |
36648 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36649 // | |
36650 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36651 //
36652 // The property we want to enforce is to never have more than [page alloc] between two probes.
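// A rough C-level sketch of the loop built below (illustrative; the
// comparison is signed, matching the CMP/JCC pair in testMBB):
//
//   char *FinalSP = SP - Size;            // computed above, before testMBB
//   while (FinalSP < SP) {                // testMBB
//     *(volatile char *)SP ^= 0;          // blockMBB: touch the current page
//     SP -= ProbeSize;                    // blockMBB: extend by one probe
//   }
//   Result = FinalSP;                     // tailMBB copies FinalStackPtr out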
36653
36654 const unsigned XORMIOpc =
36655 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36656 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36657 .addImm(0);
36658
36659 BuildMI(blockMBB, DL,
36660 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36661 .addReg(physSPReg)
36662 .addImm(ProbeSize);
36663
36664
36665 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36666 blockMBB->addSuccessor(testMBB);
36667
36668 // Replace original instruction by the expected stack ptr
36669 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36670 .addReg(FinalStackPtr);
36671
36672 tailMBB->splice(tailMBB->end(), MBB,
36673 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36674 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36675 MBB->addSuccessor(testMBB);
36676
36677 // Delete the original pseudo instruction.
36678 MI.eraseFromParent();
36679
36680 // And we're done.
36681 return tailMBB;
36682}
36683
36684MachineBasicBlock *
36685X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36686 MachineBasicBlock *BB) const {
36687 MachineFunction *MF = BB->getParent();
36688 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36689 const DebugLoc &DL = MI.getDebugLoc();
36690 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36691
36692 assert(MF->shouldSplitStack());
36693
36694 const bool Is64Bit = Subtarget.is64Bit();
36695 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36696
36697 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36698 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36699
36700 // BB:
36701 // ... [Till the alloca]
36702 // If stacklet is not large enough, jump to mallocMBB
36703 //
36704 // bumpMBB:
36705 // Allocate by subtracting from RSP
36706 // Jump to continueMBB
36707 //
36708 // mallocMBB:
36709 // Allocate by call to runtime
36710 //
36711 // continueMBB:
36712 // ...
36713 // [rest of original BB]
36714 //
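// A rough C-level sketch of the check (illustrative; TlsOffset is the
// per-target stack-limit slot read through %fs/%gs):
//
//   char *Limit = *(char **)(TLS + TlsOffset);   // stacklet limit
//   char *NewSP = SP - Size;
//   if (Limit > NewSP)                           // not enough room: mallocMBB
//     Ptr = __morestack_allocate_stack_space(Size);
//   else {                                       // enough room: bumpMBB
//     SP = NewSP;
//     Ptr = NewSP;
//   }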
36715
36716 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36717 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36718 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36719
36720 MachineRegisterInfo &MRI = MF->getRegInfo();
36721 const TargetRegisterClass *AddrRegClass =
36722 getRegClassFor(getPointerTy(MF->getDataLayout()));
36723
36724 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36725 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36726 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36727 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36728 sizeVReg = MI.getOperand(1).getReg(),
36729 physSPReg =
36730 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36731
36732 MachineFunction::iterator MBBIter = ++BB->getIterator();
36733
36734 MF->insert(MBBIter, bumpMBB);
36735 MF->insert(MBBIter, mallocMBB);
36736 MF->insert(MBBIter, continueMBB);
36737
36738 continueMBB->splice(continueMBB->begin(), BB,
36739 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36740 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36741
36742 // Add code to the main basic block to check if the stack limit has been hit,
36743 // and if so, jump to mallocMBB otherwise to bumpMBB.
36744 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36745 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36746 .addReg(tmpSPVReg).addReg(sizeVReg);
36747 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36748 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36749 .addReg(SPLimitVReg);
36750 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36751
36752 // bumpMBB simply decreases the stack pointer, since we know the current
36753 // stacklet has enough space.
36754 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36755 .addReg(SPLimitVReg);
36756 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36757 .addReg(SPLimitVReg);
36758 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36759
36760 // Calls into a routine in libgcc to allocate more space from the heap.
36761 const uint32_t *RegMask =
36762 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36763 if (IsLP64) {
36764 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36765 .addReg(sizeVReg);
36766 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36767 .addExternalSymbol("__morestack_allocate_stack_space")
36768 .addRegMask(RegMask)
36769 .addReg(X86::RDI, RegState::Implicit)
36770 .addReg(X86::RAX, RegState::ImplicitDefine);
36771 } else if (Is64Bit) {
36772 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36773 .addReg(sizeVReg);
36774 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36775 .addExternalSymbol("__morestack_allocate_stack_space")
36776 .addRegMask(RegMask)
36777 .addReg(X86::EDI, RegState::Implicit)
36778 .addReg(X86::EAX, RegState::ImplicitDefine);
36779 } else {
36780 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36781 .addImm(12);
36782 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36783 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36784 .addExternalSymbol("__morestack_allocate_stack_space")
36785 .addRegMask(RegMask)
36786 .addReg(X86::EAX, RegState::ImplicitDefine);
36787 }
36788
36789 if (!Is64Bit)
36790 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36791 .addImm(16);
36792
36793 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36794 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36795 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36796
36797 // Set up the CFG correctly.
36798 BB->addSuccessor(bumpMBB);
36799 BB->addSuccessor(mallocMBB);
36800 mallocMBB->addSuccessor(continueMBB);
36801 bumpMBB->addSuccessor(continueMBB);
36802
36803 // Take care of the PHI nodes.
36804 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36805 MI.getOperand(0).getReg())
36806 .addReg(mallocPtrVReg)
36807 .addMBB(mallocMBB)
36808 .addReg(bumpSPPtrVReg)
36809 .addMBB(bumpMBB);
36810
36811 // Delete the original pseudo instruction.
36812 MI.eraseFromParent();
36813
36814 // And we're done.
36815 return continueMBB;
36816}
36817
36818MachineBasicBlock *
36819X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36820 MachineBasicBlock *BB) const {
36821 MachineFunction *MF = BB->getParent();
36822 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36823 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36824 const DebugLoc &DL = MI.getDebugLoc();
36825
36826 assert(!isAsynchronousEHPersonality(
36827 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36828 "SEH does not use catchret!");
36829
36830 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36831 if (!Subtarget.is32Bit())
36832 return BB;
36833
36834 // C++ EH creates a new target block to hold the restore code, and wires up
36835 // the new block to the return destination with a normal JMP_4.
36836 MachineBasicBlock *RestoreMBB =
36837 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36838 assert(BB->succ_size() == 1);
36839 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36840 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36841 BB->addSuccessor(RestoreMBB);
36842 MI.getOperand(0).setMBB(RestoreMBB);
36843
36844 // Marking this as an EH pad but not a funclet entry block causes PEI to
36845 // restore stack pointers in the block.
36846 RestoreMBB->setIsEHPad(true);
36847
36848 auto RestoreMBBI = RestoreMBB->begin();
36849 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36850 return BB;
36851}
36852
36853MachineBasicBlock *
36854X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36855 MachineBasicBlock *BB) const {
36856 // So, here we replace TLSADDR with the sequence:
36857 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36858 // We need this because TLSADDR is lowered into calls
36859 // inside MC; therefore, without the two markers, shrink-wrapping
36860 // may push the prologue/epilogue past them.
36861 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36862 const DebugLoc &DL = MI.getDebugLoc();
36863 MachineFunction &MF = *BB->getParent();
36864
36865 // Emit CALLSEQ_START right before the instruction.
36866 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36867 MachineInstrBuilder CallseqStart =
36868 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36869 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36870
36871 // Emit CALLSEQ_END right after the instruction.
36872 // We don't call erase from parent because we want to keep the
36873 // original instruction around.
36874 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36875 MachineInstrBuilder CallseqEnd =
36876 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36877 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36878
36879 return BB;
36880}
36881
36882MachineBasicBlock *
36883X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36884 MachineBasicBlock *BB) const {
36885 // This is pretty easy. We're taking the value that we received from
36886 // our load from the relocation, sticking it in either RDI (x86-64)
36887 // or EAX and doing an indirect call. The return value will then
36888 // be in the normal return register.
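// For the 64-bit case this amounts to roughly (illustrative AT&T syntax; the
// 32-bit variants use EAX and CALL32m instead):
//
//   movq  _var@TLVP(%rip), %rdi      # MOV64rm of the TLS descriptor into RDI
//   callq *(%rdi)                    # CALL64m through the descriptor's getter
//
// with the result coming back in the usual return register.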
36889 MachineFunction *F = BB->getParent();
36890 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36891 const DebugLoc &DL = MI.getDebugLoc();
36892
36893 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36894 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36895
36896 // Get a register mask for the lowered call.
36897 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36898 // proper register mask.
36899 const uint32_t *RegMask =
36900 Subtarget.is64Bit() ?
36901 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36902 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36903 if (Subtarget.is64Bit()) {
36904 MachineInstrBuilder MIB =
36905 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36906 .addReg(X86::RIP)
36907 .addImm(0)
36908 .addReg(0)
36909 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36910 MI.getOperand(3).getTargetFlags())
36911 .addReg(0);
36912 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36913 addDirectMem(MIB, X86::RDI);
36914 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36915 } else if (!isPositionIndependent()) {
36916 MachineInstrBuilder MIB =
36917 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36918 .addReg(0)
36919 .addImm(0)
36920 .addReg(0)
36921 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36922 MI.getOperand(3).getTargetFlags())
36923 .addReg(0);
36924 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36925 addDirectMem(MIB, X86::EAX);
36926 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36927 } else {
36928 MachineInstrBuilder MIB =
36929 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36930 .addReg(TII->getGlobalBaseReg(F))
36931 .addImm(0)
36932 .addReg(0)
36933 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36934 MI.getOperand(3).getTargetFlags())
36935 .addReg(0);
36936 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36937 addDirectMem(MIB, X86::EAX);
36938 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36939 }
36940
36941 MI.eraseFromParent(); // The pseudo instruction is gone now.
36942 return BB;
36943}
36944
36945static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36946 switch (RPOpc) {
36947 case X86::INDIRECT_THUNK_CALL32:
36948 return X86::CALLpcrel32;
36949 case X86::INDIRECT_THUNK_CALL64:
36950 return X86::CALL64pcrel32;
36951 case X86::INDIRECT_THUNK_TCRETURN32:
36952 return X86::TCRETURNdi;
36953 case X86::INDIRECT_THUNK_TCRETURN64:
36954 return X86::TCRETURNdi64;
36955 }
36956 llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36956)
;
36957}
36958
36959static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36960 unsigned Reg) {
36961 if (Subtarget.useRetpolineExternalThunk()) {
36962 // When using an external thunk for retpolines, we pick names that match the
36963 // names GCC happens to use as well. This helps simplify the implementation
36964 // of the thunks for kernels where they have no easy ability to create
36965 // aliases and are doing non-trivial configuration of the thunk's body. For
36966 // example, the Linux kernel will do boot-time hot patching of the thunk
36967 // bodies and cannot easily export aliases of these to loaded modules.
36968 //
36969 // Note that at any point in the future, we may need to change the semantics
36970 // of how we implement retpolines and at that time will likely change the
36971 // name of the called thunk. Essentially, there is no hard guarantee that
36972 // LLVM will generate calls to specific thunks, we merely make a best-effort
36973 // attempt to help out kernels and other systems where duplicating the
36974 // thunks is costly.
36975 switch (Reg) {
36976 case X86::EAX:
36977 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36978 return "__x86_indirect_thunk_eax";
36979 case X86::ECX:
36980 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36981 return "__x86_indirect_thunk_ecx";
36982 case X86::EDX:
36983 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36984 return "__x86_indirect_thunk_edx";
36985 case X86::EDI:
36986 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36987 return "__x86_indirect_thunk_edi";
36988 case X86::R11:
36989 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36990 return "__x86_indirect_thunk_r11";
36991 }
36992 llvm_unreachable("unexpected reg for external indirect thunk")::llvm::llvm_unreachable_internal("unexpected reg for external indirect thunk"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36992)
;
36993 }
36994
36995 if (Subtarget.useRetpolineIndirectCalls() ||
36996 Subtarget.useRetpolineIndirectBranches()) {
36997 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36998 switch (Reg) {
36999 case X86::EAX:
37000 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37001 return "__llvm_retpoline_eax";
37002 case X86::ECX:
37003 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37004 return "__llvm_retpoline_ecx";
37005 case X86::EDX:
37006 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37007 return "__llvm_retpoline_edx";
37008 case X86::EDI:
37009 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37010 return "__llvm_retpoline_edi";
37011 case X86::R11:
37012 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37013 return "__llvm_retpoline_r11";
37014 }
37015 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37015)
;
37016 }
37017
37018 if (Subtarget.useLVIControlFlowIntegrity()) {
37019 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37020 return "__llvm_lvi_thunk_r11";
37021 }
37022 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37022)
;
37023}
37024
37025MachineBasicBlock *
37026X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37027 MachineBasicBlock *BB) const {
37028 // Copy the virtual register into the R11 physical register and
37029 // call the retpoline thunk.
37030 const DebugLoc &DL = MI.getDebugLoc();
37031 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37032 Register CalleeVReg = MI.getOperand(0).getReg();
37033 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37034
37035 // Find an available scratch register to hold the callee. On 64-bit, we can
37036 // just use R11, but we scan for uses anyway to ensure we don't generate
37037 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37038 // already a register use operand to the call to hold the callee. If none
37039 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37040 // register and ESI is the base pointer to realigned stack frames with VLAs.
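// The end result for the common 64-bit call case looks roughly like
// (illustrative):
//
//   %r11 = COPY %callee_vreg
//   CALL64pcrel32 @__x86_indirect_thunk_r11, implicit killed $r11
//
// i.e. the indirect-call operand is rewritten to an external thunk symbol and
// the chosen scratch register is appended as an implicit killed use.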
37041 SmallVector<unsigned, 3> AvailableRegs;
37042 if (Subtarget.is64Bit())
37043 AvailableRegs.push_back(X86::R11);
37044 else
37045 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37046
37047 // Zero out any registers that are already used.
37048 for (const auto &MO : MI.operands()) {
37049 if (MO.isReg() && MO.isUse())
37050 for (unsigned &Reg : AvailableRegs)
37051 if (Reg == MO.getReg())
37052 Reg = 0;
37053 }
37054
37055 // Choose the first remaining non-zero available register.
37056 unsigned AvailableReg = 0;
37057 for (unsigned MaybeReg : AvailableRegs) {
37058 if (MaybeReg) {
37059 AvailableReg = MaybeReg;
37060 break;
37061 }
37062 }
37063 if (!AvailableReg)
37064 report_fatal_error("calling convention incompatible with retpoline, no "
37065 "available registers");
37066
37067 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37068
37069 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37070 .addReg(CalleeVReg);
37071 MI.getOperand(0).ChangeToES(Symbol);
37072 MI.setDesc(TII->get(Opc));
37073 MachineInstrBuilder(*BB->getParent(), &MI)
37074 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37075 return BB;
37076}
37077
37078 /// SetJmp implies a future control-flow change upon calling the corresponding
37079 /// LongJmp.
37080/// Instead of using the 'return' instruction, the long jump fixes the stack and
37081/// performs an indirect branch. To do so it uses the registers that were stored
37082/// in the jump buffer (when calling SetJmp).
37083/// In case the shadow stack is enabled we need to fix it as well, because some
37084/// return addresses will be skipped.
37085/// The function will save the SSP for future fixing in the function
37086/// emitLongJmpShadowStackFix.
37087/// \sa emitLongJmpShadowStackFix
37088/// \param [in] MI The temporary Machine Instruction for the builtin.
37089/// \param [in] MBB The Machine Basic Block that will be modified.
37090void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37091 MachineBasicBlock *MBB) const {
37092 const DebugLoc &DL = MI.getDebugLoc();
37093 MachineFunction *MF = MBB->getParent();
37094 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37095 MachineRegisterInfo &MRI = MF->getRegInfo();
37096 MachineInstrBuilder MIB;
37097
37098 // Memory Reference.
37099 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37100 MI.memoperands_end());
37101
37102 // Initialize a register with zero.
37103 MVT PVT = getPointerTy(MF->getDataLayout());
37104 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37105 Register ZReg = MRI.createVirtualRegister(PtrRC);
37106 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37107 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37108 .addDef(ZReg)
37109 .addReg(ZReg, RegState::Undef)
37110 .addReg(ZReg, RegState::Undef);
37111
37112 // Read the current SSP Register value to the zeroed register.
37113 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37114 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37115 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37116
37117 // Write the SSP register value to pointer slot 3 of the input memory buffer.
37118 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37119 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37120 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37121 const unsigned MemOpndSlot = 1;
37122 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37123 if (i == X86::AddrDisp)
37124 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37125 else
37126 MIB.add(MI.getOperand(MemOpndSlot + i));
37127 }
37128 MIB.addReg(SSPCopyReg);
37129 MIB.setMemRefs(MMOs);
37130}
37131
37132MachineBasicBlock *
37133X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37134 MachineBasicBlock *MBB) const {
37135 const DebugLoc &DL = MI.getDebugLoc();
37136 MachineFunction *MF = MBB->getParent();
37137 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37138 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37139 MachineRegisterInfo &MRI = MF->getRegInfo();
37140
37141 const BasicBlock *BB = MBB->getBasicBlock();
37142 MachineFunction::iterator I = ++MBB->getIterator();
37143
37144 // Memory Reference
37145 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37146 MI.memoperands_end());
37147
37148 unsigned DstReg;
37149 unsigned MemOpndSlot = 0;
37150
37151 unsigned CurOp = 0;
37152
37153 DstReg = MI.getOperand(CurOp++).getReg();
37154 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37155 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37156 (void)TRI;
37157 Register mainDstReg = MRI.createVirtualRegister(RC);
37158 Register restoreDstReg = MRI.createVirtualRegister(RC);
37159
37160 MemOpndSlot = CurOp;
37161
37162 MVT PVT = getPointerTy(MF->getDataLayout());
37163 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37164 "Invalid Pointer Size!");
37165
37166 // For v = setjmp(buf), we generate
37167 //
37168 // thisMBB:
37169 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37170 // SjLjSetup restoreMBB
37171 //
37172 // mainMBB:
37173 // v_main = 0
37174 //
37175 // sinkMBB:
37176 // v = phi(main, restore)
37177 //
37178 // restoreMBB:
37179 // if base pointer being used, load it from frame
37180 // v_restore = 1
37181
37182 MachineBasicBlock *thisMBB = MBB;
37183 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37184 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37185 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37186 MF->insert(I, mainMBB);
37187 MF->insert(I, sinkMBB);
37188 MF->push_back(restoreMBB);
37189 restoreMBB->setMachineBlockAddressTaken();
37190
37191 MachineInstrBuilder MIB;
37192
37193 // Transfer the remainder of BB and its successor edges to sinkMBB.
37194 sinkMBB->splice(sinkMBB->begin(), MBB,
37195 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37196 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37197
37198 // thisMBB:
37199 unsigned PtrStoreOpc = 0;
37200 unsigned LabelReg = 0;
37201 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37202 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37203 !isPositionIndependent();
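// The jump buffer is addressed in pointer-sized slots: slot 1 (LabelOffset)
// receives the address of restoreMBB below, and, when shadow stacks are in
// use, emitSetJmpShadowStackFix stores the current SSP into slot 3. The
// remaining slots (frame and stack pointer) are filled elsewhere.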
37204
37205 // Prepare IP either in reg or imm.
37206 if (!UseImmLabel) {
37207 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37208 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37209 LabelReg = MRI.createVirtualRegister(PtrRC);
37210 if (Subtarget.is64Bit()) {
37211 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37212 .addReg(X86::RIP)
37213 .addImm(0)
37214 .addReg(0)
37215 .addMBB(restoreMBB)
37216 .addReg(0);
37217 } else {
37218 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37219 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37220 .addReg(XII->getGlobalBaseReg(MF))
37221 .addImm(0)
37222 .addReg(0)
37223 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37224 .addReg(0);
37225 }
37226 } else
37227 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37228 // Store IP
37229 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37230 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37231 if (i == X86::AddrDisp)
37232 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37233 else
37234 MIB.add(MI.getOperand(MemOpndSlot + i));
37235 }
37236 if (!UseImmLabel)
37237 MIB.addReg(LabelReg);
37238 else
37239 MIB.addMBB(restoreMBB);
37240 MIB.setMemRefs(MMOs);
37241
37242 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37243 emitSetJmpShadowStackFix(MI, thisMBB);
37244 }
37245
37246 // Setup
37247 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37248 .addMBB(restoreMBB);
37249
37250 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37251 MIB.addRegMask(RegInfo->getNoPreservedMask());
37252 thisMBB->addSuccessor(mainMBB);
37253 thisMBB->addSuccessor(restoreMBB);
37254
37255 // mainMBB:
37256 // EAX = 0
37257 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37258 mainMBB->addSuccessor(sinkMBB);
37259
37260 // sinkMBB:
37261 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37262 TII->get(X86::PHI), DstReg)
37263 .addReg(mainDstReg).addMBB(mainMBB)
37264 .addReg(restoreDstReg).addMBB(restoreMBB);
37265
37266 // restoreMBB:
37267 if (RegInfo->hasBasePointer(*MF)) {
37268 const bool Uses64BitFramePtr =
37269 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37270 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37271 X86FI->setRestoreBasePointer(MF);
37272 Register FramePtr = RegInfo->getFrameRegister(*MF);
37273 Register BasePtr = RegInfo->getBaseRegister();
37274 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37275 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37276 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37277 .setMIFlag(MachineInstr::FrameSetup);
37278 }
37279 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37280 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37281 restoreMBB->addSuccessor(sinkMBB);
37282
37283 MI.eraseFromParent();
37284 return sinkMBB;
37285}
37286
37287/// Fix the shadow stack using the previously saved SSP pointer.
37288/// \sa emitSetJmpShadowStackFix
37289/// \param [in] MI The temporary Machine Instruction for the builtin.
37290/// \param [in] MBB The Machine Basic Block that will be modified.
37291/// \return The sink MBB that will perform the future indirect branch.
37292MachineBasicBlock *
37293X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37294 MachineBasicBlock *MBB) const {
37295 const DebugLoc &DL = MI.getDebugLoc();
37296 MachineFunction *MF = MBB->getParent();
37297 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37298 MachineRegisterInfo &MRI = MF->getRegInfo();
37299
37300 // Memory Reference
37301 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37302 MI.memoperands_end());
37303
37304 MVT PVT = getPointerTy(MF->getDataLayout());
37305 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37306
37307 // checkSspMBB:
37308 // xor vreg1, vreg1
37309 // rdssp vreg1
37310 // test vreg1, vreg1
37311 // je sinkMBB # Jump if Shadow Stack is not supported
37312 // fallMBB:
37313 // mov buf+24/12(%rip), vreg2
37314 // sub vreg1, vreg2
37315 // jbe sinkMBB # No need to fix the Shadow Stack
37316 // fixShadowMBB:
37317 // shr 3/2, vreg2
37318 // incssp vreg2 # fix the SSP according to the lower 8 bits
37319 // shr 8, vreg2
37320 // je sinkMBB
37321 // fixShadowLoopPrepareMBB:
37322 // shl vreg2
37323 // mov 128, vreg3
37324 // fixShadowLoopMBB:
37325 // incssp vreg3
37326 // dec vreg2
37327 // jne fixShadowLoopMBB # Iterate until you finish fixing
37328 // # the Shadow Stack
37329 // sinkMBB:
37330
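        // Annotation (not part of the original source): the block comment above
        // compresses the delta arithmetic. A minimal sketch, assuming a
        // hypothetical helper incssp_units() standing in for the INCSSP
        // instruction (on 32-bit the shift is 2 instead of 3):
        //
        //   uint64_t Delta = (PrevSSP - CurSSP) >> 3;  // 8-byte slots on x86-64
        //   incssp_units(Delta & 0xFF);                // INCSSP uses 8 bits
        //   for (uint64_t N = (Delta >> 8) << 1; N != 0; --N)
        //     incssp_units(128);                       // 2 x 128 = 256 slots per
        //                                              // (Delta >> 8) unit
        //
        // Each iteration advances the SSP by 128 slots; the counter is the high
        // part of the delta doubled, so the loop covers Delta & ~0xFF exactly.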
37331 MachineFunction::iterator I = ++MBB->getIterator();
37332 const BasicBlock *BB = MBB->getBasicBlock();
37333
37334 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37335 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37336 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37337 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37338 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37339 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37340 MF->insert(I, checkSspMBB);
37341 MF->insert(I, fallMBB);
37342 MF->insert(I, fixShadowMBB);
37343 MF->insert(I, fixShadowLoopPrepareMBB);
37344 MF->insert(I, fixShadowLoopMBB);
37345 MF->insert(I, sinkMBB);
37346
37347 // Transfer the remainder of BB and its successor edges to sinkMBB.
37348 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37349 MBB->end());
37350 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37351
37352 MBB->addSuccessor(checkSspMBB);
37353
37354 // Initialize a register with zero.
37355 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37356 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37357
37358 if (PVT == MVT::i64) {
37359 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37360 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37361 .addImm(0)
37362 .addReg(ZReg)
37363 .addImm(X86::sub_32bit);
37364 ZReg = TmpZReg;
37365 }
37366
37367 // Read the current SSP Register value to the zeroed register.
37368 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37369 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37370 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37371
37372   // Check whether the value read from the SSP register is zero and jump
37373   // directly to the sink.
37374 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37375 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37376 .addReg(SSPCopyReg)
37377 .addReg(SSPCopyReg);
37378 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37379 checkSspMBB->addSuccessor(sinkMBB);
37380 checkSspMBB->addSuccessor(fallMBB);
37381
37382 // Reload the previously saved SSP register value.
37383 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37384 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37385 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37386 MachineInstrBuilder MIB =
37387 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37388 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37389 const MachineOperand &MO = MI.getOperand(i);
37390 if (i == X86::AddrDisp)
37391 MIB.addDisp(MO, SPPOffset);
37392 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37393 // preserve kill flags.
37394 MIB.addReg(MO.getReg());
37395 else
37396 MIB.add(MO);
37397 }
37398 MIB.setMemRefs(MMOs);
37399
37400 // Subtract the current SSP from the previous SSP.
37401 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37402 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37403 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37404 .addReg(PrevSSPReg)
37405 .addReg(SSPCopyReg);
37406
37407 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37408 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37409 fallMBB->addSuccessor(sinkMBB);
37410 fallMBB->addSuccessor(fixShadowMBB);
37411
37412 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37413 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37414 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37415 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37416 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37417 .addReg(SspSubReg)
37418 .addImm(Offset);
37419
37420   // Increase the SSP, looking only at the lower 8 bits of the delta.
37421 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37422 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37423
37424 // Reset the lower 8 bits.
37425 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37426 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37427 .addReg(SspFirstShrReg)
37428 .addImm(8);
37429
37430 // Jump if the result of the shift is zero.
37431 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37432 fixShadowMBB->addSuccessor(sinkMBB);
37433 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37434
37435 // Do a single shift left.
37436 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37437 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37438 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37439 .addReg(SspSecondShrReg);
37440
37441 // Save the value 128 to a register (will be used next with incssp).
37442 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37443 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37444 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37445 .addImm(128);
37446 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37447
37448 // Since incssp only looks at the lower 8 bits, we might need to do several
37449 // iterations of incssp until we finish fixing the shadow stack.
37450 Register DecReg = MRI.createVirtualRegister(PtrRC);
37451 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37452 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37453 .addReg(SspAfterShlReg)
37454 .addMBB(fixShadowLoopPrepareMBB)
37455 .addReg(DecReg)
37456 .addMBB(fixShadowLoopMBB);
37457
37458 // Every iteration we increase the SSP by 128.
37459 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37460
37461 // Every iteration we decrement the counter by 1.
37462 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37463 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37464
37465 // Jump if the counter is not zero yet.
37466 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37467 fixShadowLoopMBB->addSuccessor(sinkMBB);
37468 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37469
37470 return sinkMBB;
37471}
37472
37473MachineBasicBlock *
37474X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37475 MachineBasicBlock *MBB) const {
37476 const DebugLoc &DL = MI.getDebugLoc();
37477 MachineFunction *MF = MBB->getParent();
37478 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37479 MachineRegisterInfo &MRI = MF->getRegInfo();
37480
37481 // Memory Reference
37482 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37483 MI.memoperands_end());
37484
37485 MVT PVT = getPointerTy(MF->getDataLayout());
37486   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37487          "Invalid Pointer Size!");
37488
37489 const TargetRegisterClass *RC =
37490 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37491 Register Tmp = MRI.createVirtualRegister(RC);
37492 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37493 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37494 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37495 Register SP = RegInfo->getStackRegister();
37496
37497 MachineInstrBuilder MIB;
37498
37499 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37500 const int64_t SPOffset = 2 * PVT.getStoreSize();
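  // Annotation (not part of the original source): the offsets below follow the
  // SjLj buffer layout visible in this function and in
  // emitLongJmpShadowStackFix: slot 0 holds the frame pointer, slot 1
  // (LabelOffset) the resume IP, slot 2 (SPOffset) the stack pointer, and
  // slot 3 the previously saved SSP when shadow stacks are in use. Each slot
  // is PVT.getStoreSize() bytes wide.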
37501
37502 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37503 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37504
37505 MachineBasicBlock *thisMBB = MBB;
37506
37507   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37508 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37509 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37510 }
37511
37512 // Reload FP
37513 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37514 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37515 const MachineOperand &MO = MI.getOperand(i);
37516 if (MO.isReg()) // Don't add the whole operand, we don't want to
37517 // preserve kill flags.
37518 MIB.addReg(MO.getReg());
37519 else
37520 MIB.add(MO);
37521 }
37522 MIB.setMemRefs(MMOs);
37523
37524 // Reload IP
37525 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37526 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37527 const MachineOperand &MO = MI.getOperand(i);
37528 if (i == X86::AddrDisp)
37529 MIB.addDisp(MO, LabelOffset);
37530 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37531 // preserve kill flags.
37532 MIB.addReg(MO.getReg());
37533 else
37534 MIB.add(MO);
37535 }
37536 MIB.setMemRefs(MMOs);
37537
37538 // Reload SP
37539 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37540 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37541 if (i == X86::AddrDisp)
37542 MIB.addDisp(MI.getOperand(i), SPOffset);
37543 else
37544 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37545 // the last instruction of the expansion.
37546 }
37547 MIB.setMemRefs(MMOs);
37548
37549 // Jump
37550 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37551
37552 MI.eraseFromParent();
37553 return thisMBB;
37554}
37555
37556void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37557 MachineBasicBlock *MBB,
37558 MachineBasicBlock *DispatchBB,
37559 int FI) const {
37560 const DebugLoc &DL = MI.getDebugLoc();
37561 MachineFunction *MF = MBB->getParent();
37562 MachineRegisterInfo *MRI = &MF->getRegInfo();
37563 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37564
37565 MVT PVT = getPointerTy(MF->getDataLayout());
37566   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37567
37568 unsigned Op = 0;
37569 unsigned VR = 0;
37570
37571 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37572 !isPositionIndependent();
37573
37574 if (UseImmLabel) {
37575 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37576 } else {
37577 const TargetRegisterClass *TRC =
37578 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37579 VR = MRI->createVirtualRegister(TRC);
37580 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37581
37582 if (Subtarget.is64Bit())
37583 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37584 .addReg(X86::RIP)
37585 .addImm(1)
37586 .addReg(0)
37587 .addMBB(DispatchBB)
37588 .addReg(0);
37589 else
37590 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37591 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37592 .addImm(1)
37593 .addReg(0)
37594 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37595 .addReg(0);
37596 }
37597
37598 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37599 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37600 if (UseImmLabel)
37601 MIB.addMBB(DispatchBB);
37602 else
37603 MIB.addReg(VR);
37604}
37605
37606MachineBasicBlock *
37607X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37608 MachineBasicBlock *BB) const {
37609 const DebugLoc &DL = MI.getDebugLoc();
37610 MachineFunction *MF = BB->getParent();
37611 MachineRegisterInfo *MRI = &MF->getRegInfo();
37612 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37613 int FI = MF->getFrameInfo().getFunctionContextIndex();
37614
37615 // Get a mapping of the call site numbers to all of the landing pads they're
37616 // associated with.
37617 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37618 unsigned MaxCSNum = 0;
37619 for (auto &MBB : *MF) {
37620 if (!MBB.isEHPad())
37621 continue;
37622
37623 MCSymbol *Sym = nullptr;
37624 for (const auto &MI : MBB) {
37625 if (MI.isDebugInstr())
37626 continue;
37627
37628       assert(MI.isEHLabel() && "expected EH_LABEL");
37629 Sym = MI.getOperand(0).getMCSymbol();
37630 break;
37631 }
37632
37633 if (!MF->hasCallSiteLandingPad(Sym))
37634 continue;
37635
37636 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37637 CallSiteNumToLPad[CSI].push_back(&MBB);
37638 MaxCSNum = std::max(MaxCSNum, CSI);
37639 }
37640 }
37641
37642 // Get an ordered list of the machine basic blocks for the jump table.
37643 std::vector<MachineBasicBlock *> LPadList;
37644 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37645 LPadList.reserve(CallSiteNumToLPad.size());
37646
37647 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37648 for (auto &LP : CallSiteNumToLPad[CSI]) {
37649 LPadList.push_back(LP);
37650 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37651 }
37652 }
37653
37654   assert(!LPadList.empty() &&
37655          "No landing pad destinations for the dispatch jump table!");
37656
37657 // Create the MBBs for the dispatch code.
37658
37659 // Shove the dispatch's address into the return slot in the function context.
37660 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37661 DispatchBB->setIsEHPad(true);
37662
37663 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37664 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37665 DispatchBB->addSuccessor(TrapBB);
37666
37667 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37668 DispatchBB->addSuccessor(DispContBB);
37669
37670 // Insert MBBs.
37671 MF->push_back(DispatchBB);
37672 MF->push_back(DispContBB);
37673 MF->push_back(TrapBB);
37674
37675 // Insert code into the entry block that creates and registers the function
37676 // context.
37677 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37678
37679 // Create the jump table and associated information
37680 unsigned JTE = getJumpTableEncoding();
37681 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37682 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37683
37684 const X86RegisterInfo &RI = TII->getRegisterInfo();
37685 // Add a register mask with no preserved registers. This results in all
37686 // registers being marked as clobbered.
37687 if (RI.hasBasePointer(*MF)) {
37688 const bool FPIs64Bit =
37689 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37690 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37691 MFI->setRestoreBasePointer(MF);
37692
37693 Register FP = RI.getFrameRegister(*MF);
37694 Register BP = RI.getBaseRegister();
37695 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37696 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37697 MFI->getRestoreBasePointerOffset())
37698 .addRegMask(RI.getNoPreservedMask());
37699 } else {
37700 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37701 .addRegMask(RI.getNoPreservedMask());
37702 }
37703
37704 // IReg is used as an index in a memory operand and therefore can't be SP
37705 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37706 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37707 Subtarget.is64Bit() ? 8 : 4);
37708 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37709 .addReg(IReg)
37710 .addImm(LPadList.size());
37711 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37712
37713 if (Subtarget.is64Bit()) {
37714 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37715 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37716
37717 // leaq .LJTI0_0(%rip), BReg
37718 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37719 .addReg(X86::RIP)
37720 .addImm(1)
37721 .addReg(0)
37722 .addJumpTableIndex(MJTI)
37723 .addReg(0);
37724 // movzx IReg64, IReg
37725 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37726 .addImm(0)
37727 .addReg(IReg)
37728 .addImm(X86::sub_32bit);
37729
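    // Annotation (not part of the original source): the two encodings handled
    // below differ in entry size and addressing. EK_BlockAddress entries are
    // absolute block addresses (8 bytes each on x86-64), so a scaled indirect
    // jump suffices. EK_LabelDifference32 entries are 4-byte offsets relative
    // to the jump table base (the PIC-friendly form), so each entry is loaded,
    // sign-extended, and added back to BReg before the indirect jump.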
37730 switch (JTE) {
37731 case MachineJumpTableInfo::EK_BlockAddress:
37732 // jmpq *(BReg,IReg64,8)
37733 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37734 .addReg(BReg)
37735 .addImm(8)
37736 .addReg(IReg64)
37737 .addImm(0)
37738 .addReg(0);
37739 break;
37740 case MachineJumpTableInfo::EK_LabelDifference32: {
37741 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37742 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37743 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37744
37745 // movl (BReg,IReg64,4), OReg
37746 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37747 .addReg(BReg)
37748 .addImm(4)
37749 .addReg(IReg64)
37750 .addImm(0)
37751 .addReg(0);
37752 // movsx OReg64, OReg
37753 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37754 // addq BReg, OReg64, TReg
37755 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37756 .addReg(OReg64)
37757 .addReg(BReg);
37758 // jmpq *TReg
37759 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37760 break;
37761 }
37762 default:
37763       llvm_unreachable("Unexpected jump table encoding");
37764 }
37765 } else {
37766 // jmpl *.LJTI0_0(,IReg,4)
37767 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37768 .addReg(0)
37769 .addImm(4)
37770 .addReg(IReg)
37771 .addJumpTableIndex(MJTI)
37772 .addReg(0);
37773 }
37774
37775 // Add the jump table entries as successors to the MBB.
37776 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37777 for (auto &LP : LPadList)
37778 if (SeenMBBs.insert(LP).second)
37779 DispContBB->addSuccessor(LP);
37780
37781 // N.B. the order the invoke BBs are processed in doesn't matter here.
37782 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37783 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37784 for (MachineBasicBlock *MBB : InvokeBBs) {
37785 // Remove the landing pad successor from the invoke block and replace it
37786 // with the new dispatch block.
37787 // Keep a copy of Successors since it's modified inside the loop.
37788 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37789 MBB->succ_rend());
37790 // FIXME: Avoid quadratic complexity.
37791 for (auto *MBBS : Successors) {
37792 if (MBBS->isEHPad()) {
37793 MBB->removeSuccessor(MBBS);
37794 MBBLPads.push_back(MBBS);
37795 }
37796 }
37797
37798 MBB->addSuccessor(DispatchBB);
37799
37800 // Find the invoke call and mark all of the callee-saved registers as
37801 // 'implicit defined' so that they're spilled. This prevents code from
37802 // moving instructions to before the EH block, where they will never be
37803 // executed.
37804 for (auto &II : reverse(*MBB)) {
37805 if (!II.isCall())
37806 continue;
37807
37808 DenseMap<unsigned, bool> DefRegs;
37809 for (auto &MOp : II.operands())
37810 if (MOp.isReg())
37811 DefRegs[MOp.getReg()] = true;
37812
37813 MachineInstrBuilder MIB(*MF, &II);
37814 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37815 unsigned Reg = SavedRegs[RegIdx];
37816 if (!DefRegs[Reg])
37817 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37818 }
37819
37820 break;
37821 }
37822 }
37823
37824 // Mark all former landing pads as non-landing pads. The dispatch is the only
37825 // landing pad now.
37826 for (auto &LP : MBBLPads)
37827 LP->setIsEHPad(false);
37828
37829 // The instruction is gone now.
37830 MI.eraseFromParent();
37831 return BB;
37832}
37833
37834MachineBasicBlock *
37835X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37836 MachineBasicBlock *BB) const {
37837 MachineFunction *MF = BB->getParent();
37838 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37839 const DebugLoc &DL = MI.getDebugLoc();
37840
37841 auto TMMImmToTMMReg = [](unsigned Imm) {
37842     assert(Imm < 8 && "Illegal tmm index");
37843 return X86::TMM0 + Imm;
37844 };
37845 switch (MI.getOpcode()) {
37846   default: llvm_unreachable("Unexpected instr type to insert");
37847 case X86::TLS_addr32:
37848 case X86::TLS_addr64:
37849 case X86::TLS_addrX32:
37850 case X86::TLS_base_addr32:
37851 case X86::TLS_base_addr64:
37852 case X86::TLS_base_addrX32:
37853 return EmitLoweredTLSAddr(MI, BB);
37854 case X86::INDIRECT_THUNK_CALL32:
37855 case X86::INDIRECT_THUNK_CALL64:
37856 case X86::INDIRECT_THUNK_TCRETURN32:
37857 case X86::INDIRECT_THUNK_TCRETURN64:
37858 return EmitLoweredIndirectThunk(MI, BB);
37859 case X86::CATCHRET:
37860 return EmitLoweredCatchRet(MI, BB);
37861 case X86::SEG_ALLOCA_32:
37862 case X86::SEG_ALLOCA_64:
37863 return EmitLoweredSegAlloca(MI, BB);
37864 case X86::PROBED_ALLOCA_32:
37865 case X86::PROBED_ALLOCA_64:
37866 return EmitLoweredProbedAlloca(MI, BB);
37867 case X86::TLSCall_32:
37868 case X86::TLSCall_64:
37869 return EmitLoweredTLSCall(MI, BB);
37870 case X86::CMOV_FR16:
37871 case X86::CMOV_FR16X:
37872 case X86::CMOV_FR32:
37873 case X86::CMOV_FR32X:
37874 case X86::CMOV_FR64:
37875 case X86::CMOV_FR64X:
37876 case X86::CMOV_GR8:
37877 case X86::CMOV_GR16:
37878 case X86::CMOV_GR32:
37879 case X86::CMOV_RFP32:
37880 case X86::CMOV_RFP64:
37881 case X86::CMOV_RFP80:
37882 case X86::CMOV_VR64:
37883 case X86::CMOV_VR128:
37884 case X86::CMOV_VR128X:
37885 case X86::CMOV_VR256:
37886 case X86::CMOV_VR256X:
37887 case X86::CMOV_VR512:
37888 case X86::CMOV_VK1:
37889 case X86::CMOV_VK2:
37890 case X86::CMOV_VK4:
37891 case X86::CMOV_VK8:
37892 case X86::CMOV_VK16:
37893 case X86::CMOV_VK32:
37894 case X86::CMOV_VK64:
37895 return EmitLoweredSelect(MI, BB);
37896
37897 case X86::FP80_ADDr:
37898 case X86::FP80_ADDm32: {
37899 // Change the floating point control register to use double extended
37900 // precision when performing the addition.
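    // Annotation (not part of the original source): the x87 control word's
    // precision-control field occupies bits 8-9; 0b11 there selects 64-bit
    // (double extended) precision, which is why 0x300 is OR'ed in below.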
37901 int OrigCWFrameIdx =
37902 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37903 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37904 OrigCWFrameIdx);
37905
37906 // Load the old value of the control word...
37907 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37908 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37909 OrigCWFrameIdx);
37910
37911     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37912     // precision.
37913 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37914 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37915 .addReg(OldCW, RegState::Kill)
37916 .addImm(0x300);
37917
37918 // Extract to 16 bits.
37919 Register NewCW16 =
37920 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37921 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37922 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37923
37924 // Prepare memory for FLDCW.
37925 int NewCWFrameIdx =
37926 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37927 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37928 NewCWFrameIdx)
37929 .addReg(NewCW16, RegState::Kill);
37930
37931 // Reload the modified control word now...
37932 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37933 NewCWFrameIdx);
37934
37935 // Do the addition.
37936 if (MI.getOpcode() == X86::FP80_ADDr) {
37937 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37938 .add(MI.getOperand(0))
37939 .add(MI.getOperand(1))
37940 .add(MI.getOperand(2));
37941 } else {
37942 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37943 .add(MI.getOperand(0))
37944 .add(MI.getOperand(1))
37945 .add(MI.getOperand(2))
37946 .add(MI.getOperand(3))
37947 .add(MI.getOperand(4))
37948 .add(MI.getOperand(5))
37949 .add(MI.getOperand(6));
37950 }
37951
37952 // Reload the original control word now.
37953 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37954 OrigCWFrameIdx);
37955
37956 MI.eraseFromParent(); // The pseudo instruction is gone now.
37957 return BB;
37958 }
37959
37960 case X86::FP32_TO_INT16_IN_MEM:
37961 case X86::FP32_TO_INT32_IN_MEM:
37962 case X86::FP32_TO_INT64_IN_MEM:
37963 case X86::FP64_TO_INT16_IN_MEM:
37964 case X86::FP64_TO_INT32_IN_MEM:
37965 case X86::FP64_TO_INT64_IN_MEM:
37966 case X86::FP80_TO_INT16_IN_MEM:
37967 case X86::FP80_TO_INT32_IN_MEM:
37968 case X86::FP80_TO_INT64_IN_MEM: {
37969 // Change the floating point control register to use "round towards zero"
37970 // mode when truncating to an integer value.
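    // Annotation (not part of the original source): the x87 control word's
    // rounding-control field occupies bits 10-11; 0b11 there selects
    // "round toward zero" (truncation), which is why 0xC00 is OR'ed in below.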
37971 int OrigCWFrameIdx =
37972 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37973 addFrameReference(BuildMI(*BB, MI, DL,
37974 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
37975
37976 // Load the old value of the control word...
37977 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37978 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37979 OrigCWFrameIdx);
37980
37981     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37982 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37983 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37984 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37985
37986 // Extract to 16 bits.
37987 Register NewCW16 =
37988 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37989 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37990 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37991
37992 // Prepare memory for FLDCW.
37993 int NewCWFrameIdx =
37994 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37995 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37996 NewCWFrameIdx)
37997 .addReg(NewCW16, RegState::Kill);
37998
37999 // Reload the modified control word now...
38000 addFrameReference(BuildMI(*BB, MI, DL,
38001 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38002
38003 // Get the X86 opcode to use.
38004 unsigned Opc;
38005 switch (MI.getOpcode()) {
38006     default: llvm_unreachable("illegal opcode!");
38007 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38008 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38009 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38010 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38011 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38012 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38013 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38014 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38015 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38016 }
38017
38018 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38019 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38020 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38021
38022 // Reload the original control word now.
38023 addFrameReference(BuildMI(*BB, MI, DL,
38024 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38025
38026 MI.eraseFromParent(); // The pseudo instruction is gone now.
38027 return BB;
38028 }
38029
38030 // xbegin
38031 case X86::XBEGIN:
38032 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38033
38034 case X86::VAARG_64:
38035 case X86::VAARG_X32:
38036 return EmitVAARGWithCustomInserter(MI, BB);
38037
38038 case X86::EH_SjLj_SetJmp32:
38039 case X86::EH_SjLj_SetJmp64:
38040 return emitEHSjLjSetJmp(MI, BB);
38041
38042 case X86::EH_SjLj_LongJmp32:
38043 case X86::EH_SjLj_LongJmp64:
38044 return emitEHSjLjLongJmp(MI, BB);
38045
38046 case X86::Int_eh_sjlj_setup_dispatch:
38047 return EmitSjLjDispatchBlock(MI, BB);
38048
38049 case TargetOpcode::STATEPOINT:
38050 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38051 // this point in the process. We diverge later.
38052 return emitPatchPoint(MI, BB);
38053
38054 case TargetOpcode::STACKMAP:
38055 case TargetOpcode::PATCHPOINT:
38056 return emitPatchPoint(MI, BB);
38057
38058 case TargetOpcode::PATCHABLE_EVENT_CALL:
38059 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38060 return BB;
38061
38062 case X86::LCMPXCHG8B: {
38063 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38064     // In addition to the four E[ABCD] registers implied by its encoding,
38065     // CMPXCHG8B requires a memory operand. If the current architecture is
38066     // i686 and the current function needs a base pointer - which is ESI
38067     // on i686 - the register allocator would not be able to allocate
38068     // registers for an address of the form X(%reg, %reg, Y): there would
38069     // never be enough unreserved registers during regalloc (without the
38070     // base pointer the only option would be X(%edi, %esi, Y)).
38071     // We give the register allocator a hand by precomputing the address in
38072     // a new vreg using LEA.
38073
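    // Annotation (not part of the original source): a sketch of the rewrite
    // this block performs, in AT&T syntax:
    //
    //   before:  lock cmpxchg8b disp(%base, %index, scale)
    //   after:   leal disp(%base, %index, scale), %vreg
    //            lock cmpxchg8b (%vreg)
    //
    // so the register allocator only has to find a single address register.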
38074 // If it is not i686 or there is no base pointer - nothing to do here.
38075 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38076 return BB;
38077
38078     // Even though this code does not necessarily need the base pointer to
38079     // be ESI, we check for that. The reason: if this assert fails, something
38080     // has changed in the compiler's base pointer handling, and it most
38081     // probably has to be addressed here as well.
38082     assert(TRI->getBaseRegister() == X86::ESI &&
38083            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38084            "base pointer in mind");
38085
38086 MachineRegisterInfo &MRI = MF->getRegInfo();
38087 MVT SPTy = getPointerTy(MF->getDataLayout());
38088 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38089 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38090
38091 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38092 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38093 // does not use index register.
38094 if (AM.IndexReg == X86::NoRegister)
38095 return BB;
38096
38097 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38098 // four operand definitions that are E[ABCD] registers. We skip them and
38099 // then insert the LEA.
38100 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38101 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38102 RMBBI->definesRegister(X86::EBX) ||
38103 RMBBI->definesRegister(X86::ECX) ||
38104 RMBBI->definesRegister(X86::EDX))) {
38105 ++RMBBI;
38106 }
38107 MachineBasicBlock::iterator MBBI(RMBBI);
38108 addFullAddress(
38109 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38110
38111 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38112
38113 return BB;
38114 }
38115 case X86::LCMPXCHG16B_NO_RBX: {
38116 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38117 Register BasePtr = TRI->getBaseRegister();
38118 if (TRI->hasBasePointer(*MF) &&
38119 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38120 if (!BB->isLiveIn(BasePtr))
38121 BB->addLiveIn(BasePtr);
38122 // Save RBX into a virtual register.
38123 Register SaveRBX =
38124 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38125 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38126 .addReg(X86::RBX);
38127 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38128 MachineInstrBuilder MIB =
38129 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38130 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38131 MIB.add(MI.getOperand(Idx));
38132 MIB.add(MI.getOperand(X86::AddrNumOperands));
38133 MIB.addReg(SaveRBX);
38134 } else {
38135 // Simple case, just copy the virtual register to RBX.
38136 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38137 .add(MI.getOperand(X86::AddrNumOperands));
38138 MachineInstrBuilder MIB =
38139 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38140 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38141 MIB.add(MI.getOperand(Idx));
38142 }
38143 MI.eraseFromParent();
38144 return BB;
38145 }
38146 case X86::MWAITX: {
38147 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38148 Register BasePtr = TRI->getBaseRegister();
38149 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38150     // If there is no need to save the base pointer, we generate MWAITXrrr;
38151     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38152 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38153 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38154 .addReg(MI.getOperand(0).getReg());
38155 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38156 .addReg(MI.getOperand(1).getReg());
38157 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38158 .addReg(MI.getOperand(2).getReg());
38159 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38160 MI.eraseFromParent();
38161 } else {
38162 if (!BB->isLiveIn(BasePtr)) {
38163 BB->addLiveIn(BasePtr);
38164 }
38165 // Parameters can be copied into ECX and EAX but not EBX yet.
38166 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38167 .addReg(MI.getOperand(0).getReg());
38168 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38169 .addReg(MI.getOperand(1).getReg());
38170       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38171 // Save RBX into a virtual register.
38172 Register SaveRBX =
38173 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38174 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38175 .addReg(X86::RBX);
38176 // Generate mwaitx pseudo.
38177 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38178 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38179 .addDef(Dst) // Destination tied in with SaveRBX.
38180 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38181 .addUse(SaveRBX); // Save of base pointer.
38182 MI.eraseFromParent();
38183 }
38184 return BB;
38185 }
38186 case TargetOpcode::PREALLOCATED_SETUP: {
38187     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38188 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38189 MFI->setHasPreallocatedCall(true);
38190 int64_t PreallocatedId = MI.getOperand(0).getImm();
38191 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38192     assert(StackAdjustment != 0 && "0 stack adjustment");
38193     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38194                       << StackAdjustment << "\n");
38195 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38196 .addReg(X86::ESP)
38197 .addImm(StackAdjustment);
38198 MI.eraseFromParent();
38199 return BB;
38200 }
38201 case TargetOpcode::PREALLOCATED_ARG: {
38202     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38203 int64_t PreallocatedId = MI.getOperand(1).getImm();
38204 int64_t ArgIdx = MI.getOperand(2).getImm();
38205 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38206 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38207     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38208                       << ", arg offset " << ArgOffset << "\n");
38209 // stack pointer + offset
38210 addRegOffset(
38211 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38212 X86::ESP, false, ArgOffset);
38213 MI.eraseFromParent();
38214 return BB;
38215 }
38216 case X86::PTDPBSSD:
38217 case X86::PTDPBSUD:
38218 case X86::PTDPBUSD:
38219 case X86::PTDPBUUD:
38220 case X86::PTDPBF16PS:
38221 case X86::PTDPFP16PS: {
38222 unsigned Opc;
38223 switch (MI.getOpcode()) {
38224     default: llvm_unreachable("illegal opcode!");
38225 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38226 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38227 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38228 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38229 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38230 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38231 }
38232
38233 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38234 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38235 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38236 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38237 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38238
38239 MI.eraseFromParent(); // The pseudo is gone now.
38240 return BB;
38241 }
38242 case X86::PTILEZERO: {
38243 unsigned Imm = MI.getOperand(0).getImm();
38244 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38245 MI.eraseFromParent(); // The pseudo is gone now.
38246 return BB;
38247 }
38248 case X86::PTILELOADD:
38249 case X86::PTILELOADDT1:
38250 case X86::PTILESTORED: {
38251 unsigned Opc;
38252 switch (MI.getOpcode()) {
38253     default: llvm_unreachable("illegal opcode!");
38254 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38255 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38256 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38257 }
38258
38259 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38260 unsigned CurOp = 0;
38261 if (Opc != X86::TILESTORED)
38262 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38263 RegState::Define);
38264
38265 MIB.add(MI.getOperand(CurOp++)); // base
38266 MIB.add(MI.getOperand(CurOp++)); // scale
38267 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38268 MIB.add(MI.getOperand(CurOp++)); // displacement
38269 MIB.add(MI.getOperand(CurOp++)); // segment
38270
38271 if (Opc == X86::TILESTORED)
38272 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38273 RegState::Undef);
38274
38275 MI.eraseFromParent(); // The pseudo is gone now.
38276 return BB;
38277 }
38278 case X86::PTCMMIMFP16PS:
38279 case X86::PTCMMRLFP16PS: {
38280 const DebugLoc &DL = MI.getDebugLoc();
38281 unsigned Opc;
38282 switch (MI.getOpcode()) {
38283     default: llvm_unreachable("Unexpected instruction!");
38284 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38285 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38286 }
38287 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38288 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38289 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38290 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38291 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38292 MI.eraseFromParent(); // The pseudo is gone now.
38293 return BB;
38294 }
38295 }
38296}
38297
38298//===----------------------------------------------------------------------===//
38299// X86 Optimization Hooks
38300//===----------------------------------------------------------------------===//
38301
38302bool
38303X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38304 const APInt &DemandedBits,
38305 const APInt &DemandedElts,
38306 TargetLoweringOpt &TLO) const {
38307 EVT VT = Op.getValueType();
38308 unsigned Opcode = Op.getOpcode();
38309 unsigned EltSize = VT.getScalarSizeInBits();
38310
38311 if (VT.isVector()) {
38312     // If the constant is all sign bits within the active bits, then we should
38313     // sign-extend it to the entire constant to allow it to act as a boolean
38314     // constant vector.
38315 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38316 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38317 return false;
38318 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38319 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38320 continue;
38321 const APInt &Val = V.getConstantOperandAPInt(i);
38322 if (Val.getBitWidth() > Val.getNumSignBits() &&
38323 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38324 return true;
38325 }
38326 return false;
38327 };
38328 // For vectors - if we have a constant, then try to sign extend.
38329 // TODO: Handle AND/ANDN cases.
38330 unsigned ActiveBits = DemandedBits.getActiveBits();
38331 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38332 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38333 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38334 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38335 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38336 VT.getVectorNumElements());
38337 SDValue NewC =
38338 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38339 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38340 SDValue NewOp =
38341 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38342 return TLO.CombineTo(Op, NewOp);
38343 }
38344 return false;
38345 }
38346
38347 // Only optimize Ands to prevent shrinking a constant that could be
38348 // matched by movzx.
38349 if (Opcode != ISD::AND)
38350 return false;
38351
38352 // Make sure the RHS really is a constant.
38353 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38354 if (!C)
38355 return false;
38356
38357 const APInt &Mask = C->getAPIntValue();
38358
38359 // Clear all non-demanded bits initially.
38360 APInt ShrunkMask = Mask & DemandedBits;
38361
38362 // Find the width of the shrunk mask.
38363 unsigned Width = ShrunkMask.getActiveBits();
38364
38365 // If the mask is all 0s there's nothing to do here.
38366 if (Width == 0)
38367 return false;
38368
38369 // Find the next power of 2 width, rounding up to a byte.
38370 Width = llvm::bit_ceil(std::max(Width, 8U));
38371 // Truncate the width to size to handle illegal types.
38372 Width = std::min(Width, EltSize);
38373
38374 // Calculate a possible zero extend mask for this constant.
38375 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
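  // Annotation (not part of the original source), an illustrative example:
  // with EltSize = 32, Mask = 0xFE and DemandedBits = 0xFFFFFFFE, ShrunkMask
  // is 0xFE, Width rounds up to 8, and ZeroExtendMask becomes 0xFF. Since 0xFF
  // is a subset of Mask | ~DemandedBits (= 0xFF), the AND constant is widened
  // to 0xFF, which can later be matched as a movzbl.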
38376
38377 // If we aren't changing the mask, just return true to keep it and prevent
38378 // the caller from optimizing.
38379 if (ZeroExtendMask == Mask)
38380 return true;
38381
38382 // Make sure the new mask can be represented by a combination of mask bits
38383 // and non-demanded bits.
38384 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38385 return false;
38386
38387 // Replace the constant with the zero extend mask.
38388 SDLoc DL(Op);
38389 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38390 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38391 return TLO.CombineTo(Op, NewOp);
38392}
38393
38394void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38395 KnownBits &Known,
38396 const APInt &DemandedElts,
38397 const SelectionDAG &DAG,
38398 unsigned Depth) const {
38399 unsigned BitWidth = Known.getBitWidth();
38400 unsigned NumElts = DemandedElts.getBitWidth();
38401 unsigned Opc = Op.getOpcode();
38402 EVT VT = Op.getValueType();
38403   assert((Opc >= ISD::BUILTIN_OP_END ||
38404           Opc == ISD::INTRINSIC_WO_CHAIN ||
38405           Opc == ISD::INTRINSIC_W_CHAIN ||
38406           Opc == ISD::INTRINSIC_VOID) &&
38407          "Should use MaskedValueIsZero if you don't know whether Op"
38408          " is a target node!");
38409
38410 Known.resetAll();
38411 switch (Opc) {
38412 default: break;
38413 case X86ISD::MUL_IMM: {
38414 KnownBits Known2;
38415 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38416 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38417 Known = KnownBits::mul(Known, Known2);
38418 break;
38419 }
38420 case X86ISD::SETCC:
38421 Known.Zero.setBitsFrom(1);
38422 break;
38423 case X86ISD::MOVMSK: {
38424 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38425 Known.Zero.setBitsFrom(NumLoBits);
38426 break;
38427 }
38428 case X86ISD::PEXTRB:
38429 case X86ISD::PEXTRW: {
38430 SDValue Src = Op.getOperand(0);
38431 EVT SrcVT = Src.getValueType();
38432 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38433 Op.getConstantOperandVal(1));
38434 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38435 Known = Known.anyextOrTrunc(BitWidth);
38436 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38437 break;
38438 }
38439 case X86ISD::VSRAI:
38440 case X86ISD::VSHLI:
38441 case X86ISD::VSRLI: {
38442 unsigned ShAmt = Op.getConstantOperandVal(1);
38443 if (ShAmt >= VT.getScalarSizeInBits()) {
38444 // Out of range logical bit shifts are guaranteed to be zero.
38445 // Out of range arithmetic bit shifts splat the sign bit.
38446 if (Opc != X86ISD::VSRAI) {
38447 Known.setAllZero();
38448 break;
38449 }
38450
38451 ShAmt = VT.getScalarSizeInBits() - 1;
38452 }
38453
38454 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38455 if (Opc == X86ISD::VSHLI) {
38456 Known.Zero <<= ShAmt;
38457 Known.One <<= ShAmt;
38458 // Low bits are known zero.
38459 Known.Zero.setLowBits(ShAmt);
38460 } else if (Opc == X86ISD::VSRLI) {
38461 Known.Zero.lshrInPlace(ShAmt);
38462 Known.One.lshrInPlace(ShAmt);
38463 // High bits are known zero.
38464 Known.Zero.setHighBits(ShAmt);
38465 } else {
38466 Known.Zero.ashrInPlace(ShAmt);
38467 Known.One.ashrInPlace(ShAmt);
38468 }
38469 break;
38470 }
38471 case X86ISD::PACKUS: {
38472 // PACKUS is just a truncation if the upper half is zero.
38473 APInt DemandedLHS, DemandedRHS;
38474 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38475
38476 Known.One = APInt::getAllOnes(BitWidth * 2);
38477 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38478
38479 KnownBits Known2;
38480 if (!!DemandedLHS) {
38481 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38482 Known = KnownBits::commonBits(Known, Known2);
38483 }
38484 if (!!DemandedRHS) {
38485 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38486 Known = KnownBits::commonBits(Known, Known2);
38487 }
38488
38489 if (Known.countMinLeadingZeros() < BitWidth)
38490 Known.resetAll();
38491 Known = Known.trunc(BitWidth);
38492 break;
38493 }
38494 case X86ISD::VBROADCAST: {
38495 SDValue Src = Op.getOperand(0);
38496 if (!Src.getSimpleValueType().isVector()) {
38497 Known = DAG.computeKnownBits(Src, Depth + 1);
38498 return;
38499 }
38500 break;
38501 }
38502 case X86ISD::AND: {
38503 if (Op.getResNo() == 0) {
38504 KnownBits Known2;
38505 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38506 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38507 Known &= Known2;
38508 }
38509 break;
38510 }
38511 case X86ISD::ANDNP: {
38512 KnownBits Known2;
38513 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38514 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38515
38516 // ANDNP = (~X & Y);
38517 Known.One &= Known2.Zero;
38518 Known.Zero |= Known2.One;
38519 break;
38520 }
38521 case X86ISD::FOR: {
38522 KnownBits Known2;
38523 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38524 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38525
38526 Known |= Known2;
38527 break;
38528 }
38529 case X86ISD::PSADBW: {
38530     assert(VT.getScalarType() == MVT::i64 &&
38531            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38532            "Unexpected PSADBW types");
38533
38534 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
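    // Annotation (not part of the original source): each i64 lane is the sum
    // of absolute differences of eight byte pairs, so its maximum value is
    // 8 * 255 = 2040, which needs only 11 bits, well within the low 16 bits
    // kept here.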
38535 Known.Zero.setBitsFrom(16);
38536 break;
38537 }
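// Worked example for the PSADBW bound above: each i64 lane holds the sum of
// absolute differences of eight unsigned bytes, so the largest possible value
// is 8 * 255 = 2040 < 2^16, which is why only the low 16 bits can be nonzero.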
38538 case X86ISD::PMULUDQ: {
38539 KnownBits Known2;
38540 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38541 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38542
38543 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38544 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38545 Known = KnownBits::mul(Known, Known2);
38546 break;
38547 }
38548 case X86ISD::CMOV: {
38549 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38550 // If we don't know any bits, early out.
38551 if (Known.isUnknown())
38552 break;
38553 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38554
38555 // Only known if known in both the LHS and RHS.
38556 Known = KnownBits::commonBits(Known, Known2);
38557 break;
38558 }
38559 case X86ISD::BEXTR:
38560 case X86ISD::BEXTRI: {
38561 SDValue Op0 = Op.getOperand(0);
38562 SDValue Op1 = Op.getOperand(1);
38563
38564 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38565 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38566 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38567
38568 // If the length is 0, the result is 0.
38569 if (Length == 0) {
38570 Known.setAllZero();
38571 break;
38572 }
38573
38574 if ((Shift + Length) <= BitWidth) {
38575 Known = DAG.computeKnownBits(Op0, Depth + 1);
38576 Known = Known.extractBits(Length, Shift);
38577 Known = Known.zextOrTrunc(BitWidth);
38578 }
38579 }
38580 break;
38581 }
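// Worked example of the BEXTR/BEXTRI control operand decoded above: bits [7:0]
// hold the start bit and bits [15:8] hold the length. A control of 0x0804
// (Shift = 4, Length = 8) extracts bits [11:4] of Op0 and zero-extends them,
// so the known bits of that 8-bit slice are shifted down and widened.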
38582 case X86ISD::PDEP: {
38583 KnownBits Known2;
38584 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38585 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38586 // Zeros are retained from the mask operand. But not ones.
38587 Known.One.clearAllBits();
38588 // The result will have at least as many trailing zeros as the non-mask
38589 // operand since bits can only map to the same or higher bit position.
38590 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38591 break;
38592 }
38593 case X86ISD::PEXT: {
38594 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38595 // The result has as many leading zeros as the number of zeroes in the mask.
38596 unsigned Count = Known.Zero.popcount();
38597 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38598 Known.One.clearAllBits();
38599 break;
38600 }
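// Worked examples for the PDEP/PEXT reasoning above (8-bit values for brevity):
// PDEP(src=0b00001011, mask=0b11110000) scatters the low source bits into the
// mask's set positions, giving 0b10110000 - the mask's zero bits stay zero and
// the result has at least as many trailing zeros as the source. Conversely,
// PEXT(src=0b10100000, mask=0b11110000) packs bits [7:4] of src into bits
// [3:0], giving 0b00001010, so the top popcount(~mask) bits are always zero.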
38601 case X86ISD::VTRUNC:
38602 case X86ISD::VTRUNCS:
38603 case X86ISD::VTRUNCUS:
38604 case X86ISD::CVTSI2P:
38605 case X86ISD::CVTUI2P:
38606 case X86ISD::CVTP2SI:
38607 case X86ISD::CVTP2UI:
38608 case X86ISD::MCVTP2SI:
38609 case X86ISD::MCVTP2UI:
38610 case X86ISD::CVTTP2SI:
38611 case X86ISD::CVTTP2UI:
38612 case X86ISD::MCVTTP2SI:
38613 case X86ISD::MCVTTP2UI:
38614 case X86ISD::MCVTSI2P:
38615 case X86ISD::MCVTUI2P:
38616 case X86ISD::VFPROUND:
38617 case X86ISD::VMFPROUND:
38618 case X86ISD::CVTPS2PH:
38619 case X86ISD::MCVTPS2PH: {
38620 // Truncations/Conversions - upper elements are known zero.
38621 EVT SrcVT = Op.getOperand(0).getValueType();
38622 if (SrcVT.isVector()) {
38623 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38624 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38625 Known.setAllZero();
38626 }
38627 break;
38628 }
38629 case X86ISD::STRICT_CVTTP2SI:
38630 case X86ISD::STRICT_CVTTP2UI:
38631 case X86ISD::STRICT_CVTSI2P:
38632 case X86ISD::STRICT_CVTUI2P:
38633 case X86ISD::STRICT_VFPROUND:
38634 case X86ISD::STRICT_CVTPS2PH: {
38635 // Strict Conversions - upper elements are known zero.
38636 EVT SrcVT = Op.getOperand(1).getValueType();
38637 if (SrcVT.isVector()) {
38638 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38639 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38640 Known.setAllZero();
38641 }
38642 break;
38643 }
38644 case X86ISD::MOVQ2DQ: {
38645 // Move from MMX to XMM. Upper half of XMM should be 0.
38646 if (DemandedElts.countr_zero() >= (NumElts / 2))
38647 Known.setAllZero();
38648 break;
38649 }
38650 case X86ISD::VBROADCAST_LOAD: {
38651 APInt UndefElts;
38652 SmallVector<APInt, 16> EltBits;
38653 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38654 /*AllowWholeUndefs*/ false,
38655 /*AllowPartialUndefs*/ false)) {
38656 Known.Zero.setAllBits();
38657 Known.One.setAllBits();
38658 for (unsigned I = 0; I != NumElts; ++I) {
38659 if (!DemandedElts[I])
38660 continue;
38661 if (UndefElts[I]) {
38662 Known.resetAll();
38663 break;
38664 }
38665 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38666 Known = KnownBits::commonBits(Known, Known2);
38667 }
38668 return;
38669 }
38670 break;
38671 }
38672 }
38673
38674 // Handle target shuffles.
38675 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38676 if (isTargetShuffle(Opc)) {
38677 SmallVector<int, 64> Mask;
38678 SmallVector<SDValue, 2> Ops;
38679 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38680 unsigned NumOps = Ops.size();
38681 unsigned NumElts = VT.getVectorNumElements();
38682 if (Mask.size() == NumElts) {
38683 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38684 Known.Zero.setAllBits(); Known.One.setAllBits();
38685 for (unsigned i = 0; i != NumElts; ++i) {
38686 if (!DemandedElts[i])
38687 continue;
38688 int M = Mask[i];
38689 if (M == SM_SentinelUndef) {
38690 // For UNDEF elements, we don't know anything about the common state
38691 // of the shuffle result.
38692 Known.resetAll();
38693 break;
38694 }
38695 if (M == SM_SentinelZero) {
38696 Known.One.clearAllBits();
38697 continue;
38698 }
38699 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38700 "Shuffle index out of range");
38701
38702 unsigned OpIdx = (unsigned)M / NumElts;
38703 unsigned EltIdx = (unsigned)M % NumElts;
38704 if (Ops[OpIdx].getValueType() != VT) {
38705 // TODO - handle target shuffle ops with different value types.
38706 Known.resetAll();
38707 break;
38708 }
38709 DemandedOps[OpIdx].setBit(EltIdx);
38710 }
38711 // Known bits are the values that are shared by every demanded element.
38712 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38713 if (!DemandedOps[i])
38714 continue;
38715 KnownBits Known2 =
38716 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38717 Known = KnownBits::commonBits(Known, Known2);
38718 }
38719 }
38720 }
38721 }
38722}
38723
38724unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38725 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38726 unsigned Depth) const {
38727 EVT VT = Op.getValueType();
38728 unsigned VTBits = VT.getScalarSizeInBits();
38729 unsigned Opcode = Op.getOpcode();
38730 switch (Opcode) {
38731 case X86ISD::SETCC_CARRY:
38732 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38733 return VTBits;
38734
38735 case X86ISD::VTRUNC: {
38736 SDValue Src = Op.getOperand(0);
38737 MVT SrcVT = Src.getSimpleValueType();
38738 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38739 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38740 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38741 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38742 if (Tmp > (NumSrcBits - VTBits))
38743 return Tmp - (NumSrcBits - VTBits);
38744 return 1;
38745 }
38746
38747 case X86ISD::PACKSS: {
38748 // PACKSS is just a truncation if the sign bits extend to the packed size.
38749 APInt DemandedLHS, DemandedRHS;
38750 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38751 DemandedRHS);
38752
38753 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38754 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38755 if (!!DemandedLHS)
38756 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38757 if (!!DemandedRHS)
38758 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38759 unsigned Tmp = std::min(Tmp0, Tmp1);
38760 if (Tmp > (SrcBits - VTBits))
38761 return Tmp - (SrcBits - VTBits);
38762 return 1;
38763 }
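// Worked example for the PACKSS case above: packing v8i32 sources that each
// have at least 20 known sign bits into v16i16 yields 20 - (32 - 16) = 4 known
// sign bits per i16 element; with 16 or fewer source sign bits the saturating
// truncation could change the value, so only the minimum of 1 is returned.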
38764
38765 case X86ISD::VBROADCAST: {
38766 SDValue Src = Op.getOperand(0);
38767 if (!Src.getSimpleValueType().isVector())
38768 return DAG.ComputeNumSignBits(Src, Depth + 1);
38769 break;
38770 }
38771
38772 case X86ISD::VSHLI: {
38773 SDValue Src = Op.getOperand(0);
38774 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38775 if (ShiftVal.uge(VTBits))
38776 return VTBits; // Shifted all bits out --> zero.
38777 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38778 if (ShiftVal.uge(Tmp))
38779 return 1; // Shifted all sign bits out --> unknown.
38780 return Tmp - ShiftVal.getZExtValue();
38781 }
38782
38783 case X86ISD::VSRAI: {
38784 SDValue Src = Op.getOperand(0);
38785 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38786 if (ShiftVal.uge(VTBits - 1))
38787 return VTBits; // Sign splat.
38788 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38789 ShiftVal += Tmp;
38790 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38791 }
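// Worked example for the VSHLI/VSRAI sign-bit math above: a v4i32 source with
// 3 known sign bits shifted right arithmetically by 5 yields min(3 + 5, 32) = 8
// sign bits, while shifting the same source left by 5 discards all 3 known sign
// bits, so only the minimum of 1 is reported.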
38792
38793 case X86ISD::FSETCC:
38794 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38795 if (VT == MVT::f32 || VT == MVT::f64 ||
38796 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38797 return VTBits;
38798 break;
38799
38800 case X86ISD::PCMPGT:
38801 case X86ISD::PCMPEQ:
38802 case X86ISD::CMPP:
38803 case X86ISD::VPCOM:
38804 case X86ISD::VPCOMU:
38805 // Vector compares return zero/all-bits result values.
38806 return VTBits;
38807
38808 case X86ISD::ANDNP: {
38809 unsigned Tmp0 =
38810 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38811 if (Tmp0 == 1) return 1; // Early out.
38812 unsigned Tmp1 =
38813 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38814 return std::min(Tmp0, Tmp1);
38815 }
38816
38817 case X86ISD::CMOV: {
38818 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38819 if (Tmp0 == 1) return 1; // Early out.
38820 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38821 return std::min(Tmp0, Tmp1);
38822 }
38823 }
38824
38825 // Handle target shuffles.
38826 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38827 if (isTargetShuffle(Opcode)) {
38828 SmallVector<int, 64> Mask;
38829 SmallVector<SDValue, 2> Ops;
38830 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38831 unsigned NumOps = Ops.size();
38832 unsigned NumElts = VT.getVectorNumElements();
38833 if (Mask.size() == NumElts) {
38834 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38835 for (unsigned i = 0; i != NumElts; ++i) {
38836 if (!DemandedElts[i])
38837 continue;
38838 int M = Mask[i];
38839 if (M == SM_SentinelUndef) {
38840 // For UNDEF elements, we don't know anything about the common state
38841 // of the shuffle result.
38842 return 1;
38843 } else if (M == SM_SentinelZero) {
38844 // Zero = all sign bits.
38845 continue;
38846 }
38847 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38848 "Shuffle index out of range");
38849
38850 unsigned OpIdx = (unsigned)M / NumElts;
38851 unsigned EltIdx = (unsigned)M % NumElts;
38852 if (Ops[OpIdx].getValueType() != VT) {
38853 // TODO - handle target shuffle ops with different value types.
38854 return 1;
38855 }
38856 DemandedOps[OpIdx].setBit(EltIdx);
38857 }
38858 unsigned Tmp0 = VTBits;
38859 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38860 if (!DemandedOps[i])
38861 continue;
38862 unsigned Tmp1 =
38863 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38864 Tmp0 = std::min(Tmp0, Tmp1);
38865 }
38866 return Tmp0;
38867 }
38868 }
38869 }
38870
38871 // Fallback case.
38872 return 1;
38873}
38874
38875SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38876 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38877 return N->getOperand(0);
38878 return N;
38879}
38880
38881// Helper to look for a normal load that can be narrowed into a vzload with the
38882// specified VT and memory VT. Returns SDValue() on failure.
38883static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38884 SelectionDAG &DAG) {
38885 // Can't if the load is volatile or atomic.
38886 if (!LN->isSimple())
38887 return SDValue();
38888
38889 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38890 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38891 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38892 LN->getPointerInfo(), LN->getOriginalAlign(),
38893 LN->getMemOperand()->getFlags());
38894}
38895
38896// Attempt to match a combined shuffle mask against supported unary shuffle
38897// instructions.
38898// TODO: Investigate sharing more of this with shuffle lowering.
38899static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38900 bool AllowFloatDomain, bool AllowIntDomain,
38901 SDValue V1, const SelectionDAG &DAG,
38902 const X86Subtarget &Subtarget, unsigned &Shuffle,
38903 MVT &SrcVT, MVT &DstVT) {
38904 unsigned NumMaskElts = Mask.size();
38905 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38906
38907 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38908 if (Mask[0] == 0 &&
38909 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38910 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38911 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38912 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38913 Shuffle = X86ISD::VZEXT_MOVL;
38914 if (MaskEltSize == 16)
38915 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38916 else
38917 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38918 return true;
38919 }
38920 }
38921
38922 // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38923 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38924 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38925 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38926 unsigned MaxScale = 64 / MaskEltSize;
38927 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38928 bool MatchAny = true;
38929 bool MatchZero = true;
38930 unsigned NumDstElts = NumMaskElts / Scale;
38931 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38932 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38933 MatchAny = MatchZero = false;
38934 break;
38935 }
38936 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38937 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38938 }
38939 if (MatchAny || MatchZero) {
38940 assert(MatchZero && "Failed to match zext but matched aext?");
38941 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38942 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38943 MVT::getIntegerVT(MaskEltSize);
38944 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38945
38946 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38947 if (SrcVT.getVectorNumElements() != NumDstElts)
38948 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38949
38950 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38951 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38952 return true;
38953 }
38954 }
38955 }
38956
38957 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38958 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38959 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38960 isUndefOrEqual(Mask[0], 0) &&
38961 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38962 Shuffle = X86ISD::VZEXT_MOVL;
38963 if (MaskEltSize == 16)
38964 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38965 else
38966 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38967 return true;
38968 }
38969
38970 // Check if we have SSE3, which will let us use MOVDDUP etc. These
38971 // instructions are no slower than UNPCKLPD but have the option to
38972 // fold the input operand into even an unaligned memory load.
38973 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38974 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38975 Shuffle = X86ISD::MOVDDUP;
38976 SrcVT = DstVT = MVT::v2f64;
38977 return true;
38978 }
38979 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38980 Shuffle = X86ISD::MOVSLDUP;
38981 SrcVT = DstVT = MVT::v4f32;
38982 return true;
38983 }
38984 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38985 Shuffle = X86ISD::MOVSHDUP;
38986 SrcVT = DstVT = MVT::v4f32;
38987 return true;
38988 }
38989 }
38990
38991 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38992 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38993 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38994 Shuffle = X86ISD::MOVDDUP;
38995 SrcVT = DstVT = MVT::v4f64;
38996 return true;
38997 }
38998 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38999 V1)) {
39000 Shuffle = X86ISD::MOVSLDUP;
39001 SrcVT = DstVT = MVT::v8f32;
39002 return true;
39003 }
39004 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39005 V1)) {
39006 Shuffle = X86ISD::MOVSHDUP;
39007 SrcVT = DstVT = MVT::v8f32;
39008 return true;
39009 }
39010 }
39011
39012 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39013 assert(Subtarget.hasAVX512() &&
39014 "AVX512 required for 512-bit vector shuffles");
39015 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39016 V1)) {
39017 Shuffle = X86ISD::MOVDDUP;
39018 SrcVT = DstVT = MVT::v8f64;
39019 return true;
39020 }
39021 if (isTargetShuffleEquivalent(
39022 MaskVT, Mask,
39023 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39024 Shuffle = X86ISD::MOVSLDUP;
39025 SrcVT = DstVT = MVT::v16f32;
39026 return true;
39027 }
39028 if (isTargetShuffleEquivalent(
39029 MaskVT, Mask,
39030 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39031 Shuffle = X86ISD::MOVSHDUP;
39032 SrcVT = DstVT = MVT::v16f32;
39033 return true;
39034 }
39035 }
39036
39037 return false;
39038}
39039
39040// Attempt to match a combined shuffle mask against supported unary immediate
39041// permute instructions.
39042// TODO: Investigate sharing more of this with shuffle lowering.
39043static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39044 const APInt &Zeroable,
39045 bool AllowFloatDomain, bool AllowIntDomain,
39046 const SelectionDAG &DAG,
39047 const X86Subtarget &Subtarget,
39048 unsigned &Shuffle, MVT &ShuffleVT,
39049 unsigned &PermuteImm) {
39050 unsigned NumMaskElts = Mask.size();
39051 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39052 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39053 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39054 bool ContainsZeros = isAnyZero(Mask);
39055
39056 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39057 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39058 // Check for lane crossing permutes.
39059 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39060 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39061 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39062 Shuffle = X86ISD::VPERMI;
39063 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39064 PermuteImm = getV4X86ShuffleImm(Mask);
39065 return true;
39066 }
39067 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39068 SmallVector<int, 4> RepeatedMask;
39069 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39070 Shuffle = X86ISD::VPERMI;
39071 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39072 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39073 return true;
39074 }
39075 }
39076 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39077 // VPERMILPD can permute with a non-repeating shuffle.
39078 Shuffle = X86ISD::VPERMILPI;
39079 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39080 PermuteImm = 0;
39081 for (int i = 0, e = Mask.size(); i != e; ++i) {
39082 int M = Mask[i];
39083 if (M == SM_SentinelUndef)
39084 continue;
39085 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39086 PermuteImm |= (M & 1) << i;
39087 }
39088 return true;
39089 }
39090 }
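// Worked example of the VPERMILPD immediate built above: for a v4f64 mask
// {1, 0, 3, 2} each element contributes its low bit (the within-128-bit-lane
// selector) at bit position i, giving
// PermuteImm = 1 | (0 << 1) | (1 << 2) | (0 << 3) = 0b0101.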
39091
39092 // We are checking for either a shuffle match or a shift match. Loop twice so
39093 // we can order which we try to match first depending on target preference.
39094 for (unsigned Order = 0; Order < 2; ++Order) {
39095 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39096 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39097 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
39098 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39099 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39100 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39101 SmallVector<int, 4> RepeatedMask;
39102 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39103 // Narrow the repeated mask to create 32-bit element permutes.
39104 SmallVector<int, 4> WordMask = RepeatedMask;
39105 if (MaskScalarSizeInBits == 64)
39106 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39107
39108 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39109 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39110 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39111 PermuteImm = getV4X86ShuffleImm(WordMask);
39112 return true;
39113 }
39114 }
39115
39116 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39117 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39118 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39119 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39120 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39121 SmallVector<int, 4> RepeatedMask;
39122 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39123 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39124 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39125
39126 // PSHUFLW: permute lower 4 elements only.
39127 if (isUndefOrInRange(LoMask, 0, 4) &&
39128 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39129 Shuffle = X86ISD::PSHUFLW;
39130 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39131 PermuteImm = getV4X86ShuffleImm(LoMask);
39132 return true;
39133 }
39134
39135 // PSHUFHW: permute upper 4 elements only.
39136 if (isUndefOrInRange(HiMask, 4, 8) &&
39137 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39138 // Offset the HiMask so that we can create the shuffle immediate.
39139 int OffsetHiMask[4];
39140 for (int i = 0; i != 4; ++i)
39141 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39142
39143 Shuffle = X86ISD::PSHUFHW;
39144 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39145 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39146 return true;
39147 }
39148 }
39149 }
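// Worked example for the PSHUFLW/PSHUFHW immediates above: getV4X86ShuffleImm
// packs four 2-bit element indices, element 0 in the low bits, so a HiMask of
// {5, 4, 7, 6} is first offset to {1, 0, 3, 2} and then encoded as the PSHUFHW
// immediate 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0b10110001.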
39150 } else {
39151 // Attempt to match against bit rotates.
39152 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39153 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39154 Subtarget.hasAVX512())) {
39155 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39156 Subtarget, Mask);
39157 if (0 < RotateAmt) {
39158 Shuffle = X86ISD::VROTLI;
39159 PermuteImm = (unsigned)RotateAmt;
39160 return true;
39161 }
39162 }
39163 }
39164 // Attempt to match against byte/bit shifts.
39165 if (AllowIntDomain &&
39166 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39167 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39168 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39169 int ShiftAmt =
39170 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39171 Zeroable, Subtarget);
39172 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39173 32 <= ShuffleVT.getScalarSizeInBits())) {
39174 // Byte shifts can be slower so only match them on second attempt.
39175 if (Order == 0 &&
39176 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39177 continue;
39178
39179 PermuteImm = (unsigned)ShiftAmt;
39180 return true;
39181 }
39182
39183 }
39184 }
39185
39186 return false;
39187}
39188
39189// Attempt to match a combined unary shuffle mask against supported binary
39190// shuffle instructions.
39191// TODO: Investigate sharing more of this with shuffle lowering.
39192static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39193 bool AllowFloatDomain, bool AllowIntDomain,
39194 SDValue &V1, SDValue &V2, const SDLoc &DL,
39195 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39196 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39197 bool IsUnary) {
39198 unsigned NumMaskElts = Mask.size();
39199 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39200 unsigned SizeInBits = MaskVT.getSizeInBits();
39201
39202 if (MaskVT.is128BitVector()) {
39203 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39204 AllowFloatDomain) {
39205 V2 = V1;
39206 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39207 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39208 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39209 return true;
39210 }
39211 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39212 AllowFloatDomain) {
39213 V2 = V1;
39214 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39215 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39216 return true;
39217 }
39218 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39219 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39220 std::swap(V1, V2);
39221 Shuffle = X86ISD::MOVSD;
39222 SrcVT = DstVT = MVT::v2f64;
39223 return true;
39224 }
39225 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39226 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39227 Shuffle = X86ISD::MOVSS;
39228 SrcVT = DstVT = MVT::v4f32;
39229 return true;
39230 }
39231 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39232 DAG) &&
39233 Subtarget.hasFP16()) {
39234 Shuffle = X86ISD::MOVSH;
39235 SrcVT = DstVT = MVT::v8f16;
39236 return true;
39237 }
39238 }
39239
39240 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39241 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39242 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39243 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39244 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39245 Subtarget)) {
39246 DstVT = MaskVT;
39247 return true;
39248 }
39249 }
39250
39251 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39252 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39253 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39254 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39255 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39256 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39257 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39258 Subtarget)) {
39259 SrcVT = DstVT = MaskVT;
39260 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39261 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39262 return true;
39263 }
39264 }
39265
39266 // Attempt to match against an OR if we're performing a blend shuffle and the
39267 // non-blended source element is zero in each case.
39268 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39269 if (SizeInBits == V1.getValueSizeInBits() &&
39270 SizeInBits == V2.getValueSizeInBits() &&
39271 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39272 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39273 bool IsBlend = true;
39274 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39275 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39276 unsigned Scale1 = NumV1Elts / NumMaskElts;
39277 unsigned Scale2 = NumV2Elts / NumMaskElts;
39278 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39279 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39280 for (unsigned i = 0; i != NumMaskElts; ++i) {
39281 int M = Mask[i];
39282 if (M == SM_SentinelUndef)
39283 continue;
39284 if (M == SM_SentinelZero) {
39285 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39286 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39287 continue;
39288 }
39289 if (M == (int)i) {
39290 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39291 continue;
39292 }
39293 if (M == (int)(i + NumMaskElts)) {
39294 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39295 continue;
39296 }
39297 IsBlend = false;
39298 break;
39299 }
39300 if (IsBlend) {
39301 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39302 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39303 Shuffle = ISD::OR;
39304 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39305 return true;
39306 }
39307 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39308 // FIXME: handle mismatched sizes?
39309 // TODO: investigate if `ISD::OR` handling in
39310 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39311 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39312 unsigned NumElts = V.getValueType().getVectorNumElements();
39313 KnownBits Known(NumElts);
39314 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39315 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39316 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39317 if (PeepholeKnown.isZero())
39318 Known.Zero.setBit(EltIdx);
39319 if (PeepholeKnown.isAllOnes())
39320 Known.One.setBit(EltIdx);
39321 }
39322 return Known;
39323 };
39324
39325 KnownBits V1Known = computeKnownBitsElementWise(V1);
39326 KnownBits V2Known = computeKnownBitsElementWise(V2);
39327
39328 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39329 int M = Mask[i];
39330 if (M == SM_SentinelUndef)
39331 continue;
39332 if (M == SM_SentinelZero) {
39333 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39334 continue;
39335 }
39336 if (M == (int)i) {
39337 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39338 continue;
39339 }
39340 if (M == (int)(i + NumMaskElts)) {
39341 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39342 continue;
39343 }
39344 llvm_unreachable("will not get here.")::llvm::llvm_unreachable_internal("will not get here.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39344)
;
39345 }
39346 if (IsBlend) {
39347 Shuffle = ISD::OR;
39348 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39349 return true;
39350 }
39351 }
39352 }
39353 }
39354
39355 return false;
39356}
39357
39358static bool matchBinaryPermuteShuffle(
39359 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39360 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39361 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39362 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39363 unsigned NumMaskElts = Mask.size();
39364 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39365
39366 // Attempt to match against VALIGND/VALIGNQ rotate.
39367 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39368 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39369 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39370 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39371 if (!isAnyZero(Mask)) {
39372 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39373 if (0 < Rotation) {
39374 Shuffle = X86ISD::VALIGN;
39375 if (EltSizeInBits == 64)
39376 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39377 else
39378 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39379 PermuteImm = Rotation;
39380 return true;
39381 }
39382 }
39383 }
39384
39385 // Attempt to match against PALIGNR byte rotate.
39386 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39387 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39388 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39389 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39390 if (0 < ByteRotation) {
39391 Shuffle = X86ISD::PALIGNR;
39392 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39393 PermuteImm = ByteRotation;
39394 return true;
39395 }
39396 }
39397
39398 // Attempt to combine to X86ISD::BLENDI.
39399 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39400 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39401 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39402 uint64_t BlendMask = 0;
39403 bool ForceV1Zero = false, ForceV2Zero = false;
39404 SmallVector<int, 8> TargetMask(Mask);
39405 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39406 ForceV2Zero, BlendMask)) {
39407 if (MaskVT == MVT::v16i16) {
39408 // We can only use v16i16 PBLENDW if the lanes are repeated.
39409 SmallVector<int, 8> RepeatedMask;
39410 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39411 RepeatedMask)) {
39412 assert(RepeatedMask.size() == 8 &&
39413 "Repeated mask size doesn't match!");
39414 PermuteImm = 0;
39415 for (int i = 0; i < 8; ++i)
39416 if (RepeatedMask[i] >= 8)
39417 PermuteImm |= 1 << i;
39418 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39419 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39420 Shuffle = X86ISD::BLENDI;
39421 ShuffleVT = MaskVT;
39422 return true;
39423 }
39424 } else {
39425 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39426 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39427 PermuteImm = (unsigned)BlendMask;
39428 Shuffle = X86ISD::BLENDI;
39429 ShuffleVT = MaskVT;
39430 return true;
39431 }
39432 }
39433 }
39434
39435 // Attempt to combine to INSERTPS, but only if it has elements that need to
39436 // be set to zero.
39437 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39438 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39439 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39440 Shuffle = X86ISD::INSERTPS;
39441 ShuffleVT = MVT::v4f32;
39442 return true;
39443 }
39444
39445 // Attempt to combine to SHUFPD.
39446 if (AllowFloatDomain && EltSizeInBits == 64 &&
39447 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39448 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39449 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39450 bool ForceV1Zero = false, ForceV2Zero = false;
39451 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39452 PermuteImm, Mask, Zeroable)) {
39453 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39454 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39455 Shuffle = X86ISD::SHUFP;
39456 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39457 return true;
39458 }
39459 }
39460
39461 // Attempt to combine to SHUFPS.
39462 if (AllowFloatDomain && EltSizeInBits == 32 &&
39463 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39464 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39465 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39466 SmallVector<int, 4> RepeatedMask;
39467 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39468 // Match each half of the repeated mask to determine if it's just
39469 // referencing one of the vectors, is zeroable, or is entirely undef.
39470 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39471 int M0 = RepeatedMask[Offset];
39472 int M1 = RepeatedMask[Offset + 1];
39473
39474 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39475 return DAG.getUNDEF(MaskVT);
39476 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39477 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39478 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39479 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39480 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39481 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39482 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39483 return V1;
39484 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39485 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39486 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39487 return V2;
39488 }
39489
39490 return SDValue();
39491 };
39492
39493 int ShufMask[4] = {-1, -1, -1, -1};
39494 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39495 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39496
39497 if (Lo && Hi) {
39498 V1 = Lo;
39499 V2 = Hi;
39500 Shuffle = X86ISD::SHUFP;
39501 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39502 PermuteImm = getV4X86ShuffleImm(ShufMask);
39503 return true;
39504 }
39505 }
39506 }
39507
39508 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39509 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39510 MaskVT.is128BitVector() &&
39511 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39512 Shuffle = X86ISD::INSERTPS;
39513 ShuffleVT = MVT::v4f32;
39514 return true;
39515 }
39516
39517 return false;
39518}
39519
39520static SDValue combineX86ShuffleChainWithExtract(
39521 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39522 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39523 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39524 const X86Subtarget &Subtarget);
39525
39526/// Combine an arbitrary chain of shuffles into a single instruction if
39527/// possible.
39528///
39529/// This is the leaf of the recursive combine below. When we have found some
39530/// chain of single-use x86 shuffle instructions and accumulated the combined
39531/// shuffle mask represented by them, this will try to pattern match that mask
39532/// into either a single instruction if there is a special purpose instruction
39533/// for this operation, or into a PSHUFB instruction which is a fully general
39534/// instruction but should only be used to replace chains over a certain depth.
39535static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39536 ArrayRef<int> BaseMask, int Depth,
39537 bool HasVariableMask,
39538 bool AllowVariableCrossLaneMask,
39539 bool AllowVariablePerLaneMask,
39540 SelectionDAG &DAG,
39541 const X86Subtarget &Subtarget) {
39542 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39543 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39544 "Unexpected number of shuffle inputs!");
39545
39546 SDLoc DL(Root);
39547 MVT RootVT = Root.getSimpleValueType();
39548 unsigned RootSizeInBits = RootVT.getSizeInBits();
39549 unsigned NumRootElts = RootVT.getVectorNumElements();
39550
39551 // Canonicalize shuffle input op to the requested type.
39552 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39553 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39554 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39555 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39556 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39557 return DAG.getBitcast(VT, Op);
39558 };
39559
39560 // Find the inputs that enter the chain. Note that multiple uses are OK
39561 // here; we're not going to remove the operands we find.
39562 bool UnaryShuffle = (Inputs.size() == 1);
39563 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39564 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39565 : peekThroughBitcasts(Inputs[1]));
39566
39567 MVT VT1 = V1.getSimpleValueType();
39568 MVT VT2 = V2.getSimpleValueType();
39569 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39570 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39571
39572 SDValue Res;
39573
39574 unsigned NumBaseMaskElts = BaseMask.size();
39575 if (NumBaseMaskElts == 1) {
39576 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39577 return CanonicalizeShuffleInput(RootVT, V1);
39578 }
39579
39580 bool OptForSize = DAG.shouldOptForSize();
39581 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39582 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39583 (RootVT.isFloatingPoint() && Depth >= 1) ||
39584 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39585
39586 // Don't combine if we are an AVX512/EVEX target and the mask element size
39587 // is different from the root element size - this would prevent writemasks
39588 // from being reused.
39589 bool IsMaskedShuffle = false;
39590 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39591 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39592 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39593 IsMaskedShuffle = true;
39594 }
39595 }
39596
39597 // If we are shuffling a splat (and not introducing zeros) then we can just
39598 // use it directly. This works for smaller elements as well as they already
39599 // repeat across each mask element.
39600 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39601 V1.getValueSizeInBits() >= RootSizeInBits &&
39602 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39603 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39604 return CanonicalizeShuffleInput(RootVT, V1);
39605 }
39606
39607 SmallVector<int, 64> Mask(BaseMask);
39608
39609 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39610 // etc. can be simplified.
39611 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39612 SmallVector<int> ScaledMask, IdentityMask;
39613 unsigned NumElts = VT1.getVectorNumElements();
39614 if (Mask.size() <= NumElts &&
39615 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39616 for (unsigned i = 0; i != NumElts; ++i)
39617 IdentityMask.push_back(i);
39618 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39619 V2))
39620 return CanonicalizeShuffleInput(RootVT, V1);
39621 }
39622 }
39623
39624 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39625 if (RootVT.is512BitVector() &&
39626 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39627 // If the upper subvectors are zeroable, then an extract+insert is more
39628 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39629 // to zero the upper subvectors.
39630 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39631 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39632 return SDValue(); // Nothing to do!
39633 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39634 "Unexpected lane shuffle");
39635 Res = CanonicalizeShuffleInput(RootVT, V1);
39636 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39637 bool UseZero = isAnyZero(Mask);
39638 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39639 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39640 }
39641
39642 // Narrow shuffle mask to v4x128.
39643 SmallVector<int, 4> ScaledMask;
39644 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39645 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39646
39647 // Try to lower to vshuf64x2/vshuf32x4.
39648 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39649 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39650 SelectionDAG &DAG) {
39651 unsigned PermMask = 0;
39652 // Ensure elements came from the same Op.
39653 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39654 for (int i = 0; i < 4; ++i) {
39655 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39656 if (ScaledMask[i] < 0)
39657 continue;
39658
39659 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39660 unsigned OpIndex = i / 2;
39661 if (Ops[OpIndex].isUndef())
39662 Ops[OpIndex] = Op;
39663 else if (Ops[OpIndex] != Op)
39664 return SDValue();
39665
39666 // Convert the 128-bit shuffle mask selection values into 128-bit
39667 // selection bits defined by a vshuf64x2 instruction's immediate control
39668 // byte.
39669 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39670 }
39671
39672 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39673 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39674 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39675 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39676 };
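// Worked example of the SHUF128 immediate built above: each result 128-bit lane
// gets a 2-bit selector at bit position i * 2, so a ScaledMask of {0, 1, 6, 7}
// (lanes 0/1 from Ops[0], lanes 2/3 from Ops[1]) encodes to
// PermMask = 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0b11100100.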
39677
39678 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39679 // doesn't work because our mask is for 128 bits and we don't have an MVT
39680 // to match that.
39681 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39682 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39683 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39684 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39685 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39686 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39687 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39688 ScaledMask[1] == (ScaledMask[3] % 2));
39689
39690 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39691 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39692 return SDValue(); // Nothing to do!
39693 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39694 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39695 return DAG.getBitcast(RootVT, V);
39696 }
39697 }
39698
39699 // Handle 128-bit lane shuffles of 256-bit vectors.
39700 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39701 // If the upper half is zeroable, then an extract+insert is more optimal
39702 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39703 // zero the upper half.
39704 if (isUndefOrZero(Mask[1])) {
39705 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39706 return SDValue(); // Nothing to do!
39707 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39708 Res = CanonicalizeShuffleInput(RootVT, V1);
39709 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39710 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39711 256);
39712 }
39713
39714 // If we're inserting the low subvector, an insert-subvector 'concat'
39715 // pattern is quicker than VPERM2X128.
39716 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39717 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39718 !Subtarget.hasAVX2()) {
39719 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39720 return SDValue(); // Nothing to do!
39721 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39722 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39723 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39724 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39725 }
39726
39727 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39728 return SDValue(); // Nothing to do!
39729
39730 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39731 // we need to use the zeroing feature.
39732 // Prefer blends for sequential shuffles unless we are optimizing for size.
39733 if (UnaryShuffle &&
39734 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39735 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39736 unsigned PermMask = 0;
39737 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39738 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39739 return DAG.getNode(
39740 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39741 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39742 }
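// Worked example of the VPERM2X128 immediate built above: bits [1:0] select the
// 128-bit source lane for the low half of the result and bits [5:4] select it
// for the high half, with 0x8 in a nibble zeroing that half. A widened unary
// mask of {1, 0} (swap the two lanes of V1) therefore encodes to
// PermMask = (1 << 0) | (0 << 4) = 0x01, while an undef/zero mask entry maps to
// 0x8 and zeroes the corresponding half.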
39743
39744 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39745 return SDValue(); // Nothing to do!
39746
39747 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39748 if (!UnaryShuffle && !IsMaskedShuffle) {
39749 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39750 "Unexpected shuffle sentinel value");
39751 // Prefer blends to X86ISD::VPERM2X128.
39752 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39753 unsigned PermMask = 0;
39754 PermMask |= ((Mask[0] & 3) << 0);
39755 PermMask |= ((Mask[1] & 3) << 4);
39756 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39757 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39758 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39759 CanonicalizeShuffleInput(RootVT, LHS),
39760 CanonicalizeShuffleInput(RootVT, RHS),
39761 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39762 }
39763 }
39764 }
39765
39766 // For masks that have been widened to 128-bit elements or more,
39767 // narrow back down to 64-bit elements.
39768 if (BaseMaskEltSizeInBits > 64) {
39769 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39770 int MaskScale = BaseMaskEltSizeInBits / 64;
39771 SmallVector<int, 64> ScaledMask;
39772 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39773 Mask = std::move(ScaledMask);
39774 }
39775
39776 // For masked shuffles, we're trying to match the root width for better
39777 // writemask folding, attempt to scale the mask.
39778 // TODO - variable shuffles might need this to be widened again.
39779 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39780 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39781 int MaskScale = NumRootElts / Mask.size();
39782 SmallVector<int, 64> ScaledMask;
39783 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39784 Mask = std::move(ScaledMask);
39785 }
39786
39787 unsigned NumMaskElts = Mask.size();
39788 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39789
39790 // Determine the effective mask value type.
39791 FloatDomain &= (32 <= MaskEltSizeInBits);
39792 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39793 : MVT::getIntegerVT(MaskEltSizeInBits);
39794 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39795
39796 // Only allow legal mask types.
39797 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39798 return SDValue();
39799
39800 // Attempt to match the mask against known shuffle patterns.
39801 MVT ShuffleSrcVT, ShuffleVT;
39802 unsigned Shuffle, PermuteImm;
39803
39804 // Which shuffle domains are permitted?
39805 // Permit domain crossing at higher combine depths.
39806 // TODO: Should we indicate which domain is preferred if both are allowed?
39807 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39808 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39809 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39810
39811 // Determine zeroable mask elements.
39812 APInt KnownUndef, KnownZero;
39813 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39814 APInt Zeroable = KnownUndef | KnownZero;
39815
39816 if (UnaryShuffle) {
39817 // Attempt to match against broadcast-from-vector.
39818 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39819 if ((Subtarget.hasAVX2() ||
39820 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39821 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39822 if (isUndefOrEqual(Mask, 0)) {
39823 if (V1.getValueType() == MaskVT &&
39824 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39825 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39826 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39827 return SDValue(); // Nothing to do!
39828 Res = V1.getOperand(0);
39829 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39830 return DAG.getBitcast(RootVT, Res);
39831 }
39832 if (Subtarget.hasAVX2()) {
39833 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39834 return SDValue(); // Nothing to do!
39835 Res = CanonicalizeShuffleInput(MaskVT, V1);
39836 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39837 return DAG.getBitcast(RootVT, Res);
39838 }
39839 }
39840 }
39841
39842 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39843 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39844 (!IsMaskedShuffle ||
39845 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39846 if (Depth == 0 && Root.getOpcode() == Shuffle)
39847 return SDValue(); // Nothing to do!
39848 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39849 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39850 return DAG.getBitcast(RootVT, Res);
39851 }
39852
39853 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39854 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39855 PermuteImm) &&
39856 (!IsMaskedShuffle ||
39857 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39858 if (Depth == 0 && Root.getOpcode() == Shuffle)
39859 return SDValue(); // Nothing to do!
39860 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39861 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39862 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39863 return DAG.getBitcast(RootVT, Res);
39864 }
39865 }
39866
39867 // Attempt to combine to INSERTPS, but only if the inserted element has come
39868 // from a scalar.
39869 // TODO: Handle other insertions here as well?
39870 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39871 Subtarget.hasSSE41() &&
39872 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39873 if (MaskEltSizeInBits == 32) {
39874 SDValue SrcV1 = V1, SrcV2 = V2;
39875 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39876 DAG) &&
39877 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39878 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39879 return SDValue(); // Nothing to do!
39880 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39881 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39882 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39883 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39884 return DAG.getBitcast(RootVT, Res);
39885 }
39886 }
39887 if (MaskEltSizeInBits == 64 &&
39888 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39889 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39890 V2.getScalarValueSizeInBits() <= 32) {
39891 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39892 return SDValue(); // Nothing to do!
39893 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39894 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39895 CanonicalizeShuffleInput(MVT::v4f32, V1),
39896 CanonicalizeShuffleInput(MVT::v4f32, V2),
39897 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39898 return DAG.getBitcast(RootVT, Res);
39899 }
39900 }
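For reference, the INSERTPS immediate built here packs three fields: bits [7:6] select the source element, bits [5:4] select the destination slot, and bits [3:0] zero out destination elements. A small standalone sketch (helper name invented):

// Illustrative: pack an INSERTPS immediate. SrcIdx/DstIdx are 0-3 element
// indices, ZeroMask has one bit per destination element to force to zero.
static unsigned buildInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                 unsigned ZeroMask) {
  return ((SrcIdx & 0x3) << 6) | ((DstIdx & 0x3) << 4) | (ZeroMask & 0xF);
}
// e.g. buildInsertPSImm(0, 2, 0) == 0x20 - insert src element 0 into dst slot 2.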
39901
39902 SDValue NewV1 = V1; // Save operands in case early exit happens.
39903 SDValue NewV2 = V2;
39904 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39905 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39906 ShuffleVT, UnaryShuffle) &&
39907 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39908 if (Depth == 0 && Root.getOpcode() == Shuffle)
39909 return SDValue(); // Nothing to do!
39910 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39911 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39912 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39913 return DAG.getBitcast(RootVT, Res);
39914 }
39915
39916 NewV1 = V1; // Save operands in case early exit happens.
39917 NewV2 = V2;
39918 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39919 AllowIntDomain, NewV1, NewV2, DL, DAG,
39920 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39921 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39922 if (Depth == 0 && Root.getOpcode() == Shuffle)
39923 return SDValue(); // Nothing to do!
39924 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39925 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39926 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39927 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39928 return DAG.getBitcast(RootVT, Res);
39929 }
39930
39931 // Typically from here on, we need an integer version of MaskVT.
39932 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39933 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39934
39935 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39936 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39937 uint64_t BitLen, BitIdx;
39938 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39939 Zeroable)) {
39940 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39941 return SDValue(); // Nothing to do!
39942 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39943 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39944 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39945 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39946 return DAG.getBitcast(RootVT, Res);
39947 }
39948
39949 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39950 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39951 return SDValue(); // Nothing to do!
39952 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39953 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39954 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39955 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39956 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39957 return DAG.getBitcast(RootVT, Res);
39958 }
39959 }
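For context, SSE4A's EXTRQ/INSERTQ describe a bit field in the low 64-bit lane by a bit length and a starting bit index. A rough scalar model of the extract half (illustrative only; instruction-encoding edge cases are not modelled):

#include <cstdint>

// Rough scalar model of the EXTRQ bit-field extract on the low 64-bit lane:
// take `Len` bits starting at bit `Idx` and zero-extend them into the result.
static uint64_t extrqModel(uint64_t Src, unsigned Idx, unsigned Len) {
  uint64_t Mask = Len >= 64 ? ~0ULL : ((1ULL << Len) - 1);
  return (Src >> Idx) & Mask;
}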
39960
39961 // Match shuffle against TRUNCATE patterns.
39962 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39963 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39964 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39965 Subtarget)) {
39966 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39967 ShuffleSrcVT.getVectorNumElements();
39968 unsigned Opc =
39969 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39970 if (Depth == 0 && Root.getOpcode() == Opc)
39971 return SDValue(); // Nothing to do!
39972 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39973 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39974 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39975 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39976 return DAG.getBitcast(RootVT, Res);
39977 }
39978
39979 // Do we need a more general binary truncation pattern?
39980 if (RootSizeInBits < 512 &&
39981 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39982 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39983 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39984 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39985 // Bail if this was already a truncation or PACK node.
39986 // We sometimes fail to match PACK if we demand known undef elements.
39987 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39988 Root.getOpcode() == X86ISD::PACKSS ||
39989 Root.getOpcode() == X86ISD::PACKUS))
39990 return SDValue(); // Nothing to do!
39991 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39992 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39993 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39994 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39995 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39996 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39997 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39998 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39999 return DAG.getBitcast(RootVT, Res);
40000 }
40001 }
40002
40003 // Don't try to re-form single instruction chains under any circumstances now
40004 // that we've done encoding canonicalization for them.
40005 if (Depth < 1)
40006 return SDValue();
40007
40008 // Depth threshold above which we can efficiently use variable mask shuffles.
40009 int VariableCrossLaneShuffleDepth =
40010 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40011 int VariablePerLaneShuffleDepth =
40012 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40013 AllowVariableCrossLaneMask &=
40014 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40015 AllowVariablePerLaneMask &=
40016 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40017 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40018 // higher depth before combining them.
40019 bool AllowBWIVPERMV3 =
40020 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40021
40022 bool MaskContainsZeros = isAnyZero(Mask);
40023
40024 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40025 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40026 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40027 if (Subtarget.hasAVX2() &&
40028 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40029 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40030 Res = CanonicalizeShuffleInput(MaskVT, V1);
40031 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40032 return DAG.getBitcast(RootVT, Res);
40033 }
40034 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40035 if ((Subtarget.hasAVX512() &&
40036 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40037 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40038 (Subtarget.hasBWI() &&
40039 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40040 (Subtarget.hasVBMI() &&
40041 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40042 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40043 V2 = DAG.getUNDEF(MaskVT);
40044 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40045 return DAG.getBitcast(RootVT, Res);
40046 }
40047 }
40048
40049 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40050 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40051 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40052 ((Subtarget.hasAVX512() &&
40053 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40054 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40055 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40056 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40057 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40058 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40059 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40060 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40061 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40062 for (unsigned i = 0; i != NumMaskElts; ++i)
40063 if (Mask[i] == SM_SentinelZero)
40064 Mask[i] = NumMaskElts + i;
40065 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40066 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40067 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40068 return DAG.getBitcast(RootVT, Res);
40069 }
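The trick above is to give VPERMV3 an explicit all-zero second source and rewrite each zeroed mask element to index into it. A minimal sketch of that mask rewrite (standalone; the zero sentinel is written as -2 here):

#include <vector>

// Illustrative: replace "zero" sentinels (-2 here) with indices into a second,
// all-zero source operand, so a unary+zero shuffle becomes a plain two-input
// permute. Element i of the zero vector lives at index NumElts + i.
static void mapZerosToSecondSource(std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int I = 0; I != NumElts; ++I)
    if (Mask[I] == -2 /* zero sentinel */)
      Mask[I] = NumElts + I;
}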
40070
40071 // If that failed and either input is extracted then try to combine as a
40072 // shuffle with the larger type.
40073 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40074 Inputs, Root, BaseMask, Depth, HasVariableMask,
40075 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40076 Subtarget))
40077 return WideShuffle;
40078
40079 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40080 // (non-VLX will pad to 512-bit shuffles).
40081 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40082 ((Subtarget.hasAVX512() &&
40083 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40084 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40085 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40086 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40087 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40088 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40089 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40090 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40091 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40092 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40093 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40094 return DAG.getBitcast(RootVT, Res);
40095 }
40096 return SDValue();
40097 }
40098
40099 // See if we can combine a single input shuffle with zeros to a bit-mask,
40100 // which is much simpler than any shuffle.
40101 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40102 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40103 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40104 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40105 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40106 APInt UndefElts(NumMaskElts, 0);
40107 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40108 for (unsigned i = 0; i != NumMaskElts; ++i) {
40109 int M = Mask[i];
40110 if (M == SM_SentinelUndef) {
40111 UndefElts.setBit(i);
40112 continue;
40113 }
40114 if (M == SM_SentinelZero)
40115 continue;
40116 EltBits[i] = AllOnes;
40117 }
40118 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40119 Res = CanonicalizeShuffleInput(MaskVT, V1);
40120 unsigned AndOpcode =
40121 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40122 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40123 return DAG.getBitcast(RootVT, Res);
40124 }
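Because every element here either keeps its position, is undef, or is forced to zero, the whole shuffle collapses to an AND with a constant vector. A simplified sketch of building that constant for 32-bit elements (sentinels written as -1/-2; helper invented for illustration):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative: for a mask where element i is either i (keep), -1 (undef) or
// -2 (zero), the shuffle is equivalent to AND with a constant vector that has
// all-ones lanes for kept elements and zero lanes for zeroed ones.
static std::vector<uint32_t> shuffleToAndMask32(const std::vector<int> &Mask) {
  std::vector<uint32_t> Bits(Mask.size(), 0);
  for (std::size_t I = 0; I != Mask.size(); ++I)
    if (Mask[I] != -2) // keep (or undef, which may be anything) -> all ones
      Bits[I] = 0xFFFFFFFFu;
  return Bits;
}
// e.g. Mask {0, -2, 2, -2} -> AND with {~0u, 0, ~0u, 0}.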
40125
40126 // If we have a single input shuffle with different shuffle patterns in
40127 // the 128-bit lanes, use the variable mask to VPERMILPS.
40128 // TODO: Combine other mask types at higher depths.
40129 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40130 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40131 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40132 SmallVector<SDValue, 16> VPermIdx;
40133 for (int M : Mask) {
40134 SDValue Idx =
40135 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40136 VPermIdx.push_back(Idx);
40137 }
40138 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40139 Res = CanonicalizeShuffleInput(MaskVT, V1);
40140 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40141 return DAG.getBitcast(RootVT, Res);
40142 }
40143
40144 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40145 // to VPERMIL2PD/VPERMIL2PS.
40146 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40147 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40148 MaskVT == MVT::v8f32)) {
40149 // VPERMIL2 Operation.
40150 // Bits[3] - Match Bit.
40151 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40152 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40153 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40154 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40155 SmallVector<int, 8> VPerm2Idx;
40156 unsigned M2ZImm = 0;
40157 for (int M : Mask) {
40158 if (M == SM_SentinelUndef) {
40159 VPerm2Idx.push_back(-1);
40160 continue;
40161 }
40162 if (M == SM_SentinelZero) {
40163 M2ZImm = 2;
40164 VPerm2Idx.push_back(8);
40165 continue;
40166 }
40167 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40168 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40169 VPerm2Idx.push_back(Index);
40170 }
40171 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40172 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40173 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40174 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40175 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40176 return DAG.getBitcast(RootVT, Res);
40177 }
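Following the bit layout documented in the comment above, each VPERMIL2PS selector uses its low two bits for the element within the 128-bit lane and bit 2 to switch between the two sources (bit 3 interacts with the M2Z immediate for zeroing). A standalone sketch of the index computation in the loop (helper invented):

// Illustrative: compute one VPERMIL2PS selector (per 128-bit lane, 4 floats
// per lane). Mask indices >= NumMaskElts refer to the second source; bit 2 of
// the selector flips between the two sources, bits [1:0] pick the element.
static int vpermil2psSelector(int M, int NumMaskElts) {
  const int NumEltsPerLane = 4;
  return (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
}
// e.g. with an 8-element v8f32 mask, M == 9 (second source, element 1 of its
// lane) -> selector 5 (bit 2 set, element 1).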
40178
40179 // If we have 3 or more shuffle instructions or a chain involving a variable
40180 // mask, we can replace them with a single PSHUFB instruction profitably.
40181 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40182 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40183 // more aggressive.
40184 if (UnaryShuffle && AllowVariablePerLaneMask &&
40185 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40186 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40187 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40188 SmallVector<SDValue, 16> PSHUFBMask;
40189 int NumBytes = RootVT.getSizeInBits() / 8;
40190 int Ratio = NumBytes / NumMaskElts;
40191 for (int i = 0; i < NumBytes; ++i) {
40192 int M = Mask[i / Ratio];
40193 if (M == SM_SentinelUndef) {
40194 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40195 continue;
40196 }
40197 if (M == SM_SentinelZero) {
40198 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40199 continue;
40200 }
40201 M = Ratio * M + i % Ratio;
40202 assert((M / 16) == (i / 16) && "Lane crossing detected");
40203 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40204 }
40205 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40206 Res = CanonicalizeShuffleInput(ByteVT, V1);
40207 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40208 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40209 return DAG.getBitcast(RootVT, Res);
40210 }
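PSHUFB control bytes index within the same 128-bit lane, and any control byte with its top bit set zeroes the destination byte; the Ratio arithmetic above simply repeats each wider mask element across its bytes. A simplified standalone sketch (undef and zero sentinels both mapped to 0x80 here; helper invented):

#include <cstdint>
#include <vector>

// Illustrative: expand an element-level shuffle mask into PSHUFB control
// bytes. Each element of size EltBytes expands into EltBytes byte selectors;
// 0x80 in a control byte zeroes the destination byte.
static std::vector<uint8_t> buildPSHUFBMask(const std::vector<int> &Mask,
                                            int EltBytes) {
  std::vector<uint8_t> Bytes;
  for (int M : Mask)
    for (int B = 0; B != EltBytes; ++B)
      // Negative sentinels (undef/zero) become 0x80, which zeroes the byte;
      // otherwise select byte B of source element M.
      Bytes.push_back(M < 0 ? uint8_t(0x80) : uint8_t(M * EltBytes + B));
  return Bytes;
}
// e.g. a v4i32 mask {3, -2, 1, 0} with EltBytes == 4 becomes
// {12,13,14,15, 0x80,0x80,0x80,0x80, 4,5,6,7, 0,1,2,3}.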
40211
40212 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40213 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40214 // slower than PSHUFB on targets that support both.
40215 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40216 Subtarget.hasXOP()) {
40217 // VPPERM Mask Operation
40218 // Bits[4:0] - Byte Index (0 - 31)
40219 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40220 SmallVector<SDValue, 16> VPPERMMask;
40221 int NumBytes = 16;
40222 int Ratio = NumBytes / NumMaskElts;
40223 for (int i = 0; i < NumBytes; ++i) {
40224 int M = Mask[i / Ratio];
40225 if (M == SM_SentinelUndef) {
40226 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40227 continue;
40228 }
40229 if (M == SM_SentinelZero) {
40230 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40231 continue;
40232 }
40233 M = Ratio * M + i % Ratio;
40234 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40235 }
40236 MVT ByteVT = MVT::v16i8;
40237 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40238 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40239 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40240 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40241 return DAG.getBitcast(RootVT, Res);
40242 }
40243
40244 // If that failed and either input is extracted then try to combine as a
40245 // shuffle with the larger type.
40246 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40247 Inputs, Root, BaseMask, Depth, HasVariableMask,
40248 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40249 return WideShuffle;
40250
40251 // If we have a dual input shuffle then lower to VPERMV3,
40252 // (non-VLX will pad to 512-bit shuffles)
40253 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40254 ((Subtarget.hasAVX512() &&
40255 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40256 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40257 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40258 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40259 MaskVT == MVT::v16i32)) ||
40260 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40261 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40262 MaskVT == MVT::v32i16)) ||
40263 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40264 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40265 MaskVT == MVT::v64i8)))) {
40266 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40267 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40268 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40269 return DAG.getBitcast(RootVT, Res);
40270 }
40271
40272 // Failed to find any combines.
40273 return SDValue();
40274}
40275
40276// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40277// instruction if possible.
40278//
40279// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40280// type size to attempt to combine:
40281// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40282// -->
40283// extract_subvector(shuffle(x,y,m2),0)
40284static SDValue combineX86ShuffleChainWithExtract(
40285 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40286 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40287 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40288 const X86Subtarget &Subtarget) {
40289 unsigned NumMaskElts = BaseMask.size();
40290 unsigned NumInputs = Inputs.size();
40291 if (NumInputs == 0)
40292 return SDValue();
40293
40294 EVT RootVT = Root.getValueType();
40295 unsigned RootSizeInBits = RootVT.getSizeInBits();
40296 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40297 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40298
40299 // Peek through extract_subvector to find widest legal vector.
40300 // TODO: Handle ISD::TRUNCATE
40301 unsigned WideSizeInBits = RootSizeInBits;
40302 for (unsigned I = 0; I != NumInputs; ++I) {
40303 SDValue Input = peekThroughBitcasts(Inputs[I]);
40304 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40305 Input = peekThroughBitcasts(Input.getOperand(0));
40306 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40307 WideSizeInBits < Input.getValueSizeInBits())
40308 WideSizeInBits = Input.getValueSizeInBits();
40309 }
40310
40311 // Bail if we fail to find a source larger than the existing root.
40312 unsigned Scale = WideSizeInBits / RootSizeInBits;
40313 if (WideSizeInBits <= RootSizeInBits ||
40314 (WideSizeInBits % RootSizeInBits) != 0)
40315 return SDValue();
40316
40317 // Create new mask for larger type.
40318 SmallVector<int, 64> WideMask(BaseMask);
40319 for (int &M : WideMask) {
40320 if (M < 0)
40321 continue;
40322 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40323 }
40324 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
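The remapping above keeps each element's offset within its operand while growing the per-operand stride from NumMaskElts to Scale * NumMaskElts, then pads with undefs. A quick standalone illustration (helper invented):

#include <vector>

// Illustrative: rebase mask indices when each shuffle operand is widened by
// `Scale`. Element offsets within an operand are preserved; only the operand
// stride changes from NumElts to Scale * NumElts.
static void widenMaskForScaledOps(std::vector<int> &Mask, int NumElts,
                                  int Scale) {
  for (int &M : Mask)
    if (M >= 0)
      M = (M % NumElts) + (M / NumElts) * Scale * NumElts;
  Mask.insert(Mask.end(), (Scale - 1) * NumElts, -1); // pad with undef
}
// e.g. NumElts = 4, Scale = 2: {0, 5, 2, 7} -> {0, 9, 2, 11, -1, -1, -1, -1}.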
40325
40326 // Attempt to peek through inputs and adjust mask when we extract from an
40327 // upper subvector.
40328 int AdjustedMasks = 0;
40329 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40330 for (unsigned I = 0; I != NumInputs; ++I) {
40331 SDValue &Input = WideInputs[I];
40332 Input = peekThroughBitcasts(Input);
40333 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40334 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40335 uint64_t Idx = Input.getConstantOperandVal(1);
40336 if (Idx != 0) {
40337 ++AdjustedMasks;
40338 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40339 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40340
40341 int lo = I * WideMask.size();
40342 int hi = (I + 1) * WideMask.size();
40343 for (int &M : WideMask)
40344 if (lo <= M && M < hi)
40345 M += Idx;
40346 }
40347 Input = peekThroughBitcasts(Input.getOperand(0));
40348 }
40349 }
40350
40351 // Remove unused/repeated shuffle source ops.
40352 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40353 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40354
40355 // Bail if we're always extracting from the lowest subvectors
40356 // (combineX86ShuffleChain should match this for the current width), or if
40357 // the shuffle still references too many inputs.
40358 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40359 return SDValue();
40360
40361 // Minor canonicalization of the accumulated shuffle mask to make it easier
40362 // to match below. All this does is detect masks with sequential pairs of
40363 // elements, and shrink them to the half-width mask. It does this in a loop
40364 // so it will reduce the size of the mask to the minimal width mask which
40365 // performs an equivalent shuffle.
40366 while (WideMask.size() > 1) {
40367 SmallVector<int, 64> WidenedMask;
40368 if (!canWidenShuffleElements(WideMask, WidenedMask))
40369 break;
40370 WideMask = std::move(WidenedMask);
40371 }
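Each iteration of this loop asks whether adjacent mask elements form consecutive even/odd pairs, in which case the mask can be expressed with elements twice as wide. A simplified sketch of one widening step (the real canWidenShuffleElements is more permissive about undef/zero sentinels; helper invented):

#include <cstddef>
#include <vector>

// Illustrative: try to halve a shuffle mask's width. Each adjacent pair
// (2k, 2k+1) must either be both undef (-1) or form a consecutive even/odd
// pair {2j, 2j+1}; the widened element is then j.
static bool widenMaskOnce(const std::vector<int> &Mask,
                          std::vector<int> &Widened) {
  Widened.clear();
  for (std::size_t I = 0; I + 1 < Mask.size(); I += 2) {
    int Lo = Mask[I], Hi = Mask[I + 1];
    if (Lo < 0 && Hi < 0)
      Widened.push_back(-1);
    else if (Lo >= 0 && (Lo % 2) == 0 && Hi == Lo + 1)
      Widened.push_back(Lo / 2);
    else
      return false; // cannot express this pair with a wider element
  }
  return true;
}
// e.g. {2, 3, 0, 1} widens to {1, 0}; {1, 0, 2, 3} does not widen.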
40372
40373 // Canonicalization of binary shuffle masks to improve pattern matching by
40374 // commuting the inputs.
40375 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40376 ShuffleVectorSDNode::commuteMask(WideMask);
40377 std::swap(WideInputs[0], WideInputs[1]);
40378 }
40379
40380 // Increase depth for every upper subvector we've peeked through.
40381 Depth += AdjustedMasks;
40382
40383 // Attempt to combine wider chain.
40384 // TODO: Can we use a better Root?
40385 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40386 WideInputs.back().getValueSizeInBits()
40387 ? WideInputs.front()
40388 : WideInputs.back();
40389 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40390 "WideRootSize mismatch");
40391
40392 if (SDValue WideShuffle =
40393 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40394 HasVariableMask, AllowVariableCrossLaneMask,
40395 AllowVariablePerLaneMask, DAG, Subtarget)) {
40396 WideShuffle =
40397 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40398 return DAG.getBitcast(RootVT, WideShuffle);
40399 }
40400
40401 return SDValue();
40402}
40403
40404// Canonicalize the combined shuffle mask chain with horizontal ops.
40405// NOTE: This may update the Ops and Mask.
40406static SDValue canonicalizeShuffleMaskWithHorizOp(
40407 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40408 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40409 const X86Subtarget &Subtarget) {
40410 if (Mask.empty() || Ops.empty())
40411 return SDValue();
40412
40413 SmallVector<SDValue> BC;
40414 for (SDValue Op : Ops)
40415 BC.push_back(peekThroughBitcasts(Op));
40416
40417 // All ops must be the same horizop + type.
40418 SDValue BC0 = BC[0];
40419 EVT VT0 = BC0.getValueType();
40420 unsigned Opcode0 = BC0.getOpcode();
40421 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40422 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40423 }))
40424 return SDValue();
40425
40426 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40427 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40428 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40429 if (!isHoriz && !isPack)
40430 return SDValue();
40431
40432 // Do all ops have a single use?
40433 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40434 return Op.hasOneUse() &&
40435 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40436 });
40437
40438 int NumElts = VT0.getVectorNumElements();
40439 int NumLanes = VT0.getSizeInBits() / 128;
40440 int NumEltsPerLane = NumElts / NumLanes;
40441 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40442 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40443 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40444
40445 if (NumEltsPerLane >= 4 &&
40446 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40447 SmallVector<int> LaneMask, ScaledMask;
40448 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40449 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40450 // See if we can remove the shuffle by resorting the HOP chain so that
40451 // the HOP args are pre-shuffled.
40452 // TODO: Generalize to any sized/depth chain.
40453 // TODO: Add support for PACKSS/PACKUS.
40454 if (isHoriz) {
40455 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40456 auto GetHOpSrc = [&](int M) {
40457 if (M == SM_SentinelUndef)
40458 return DAG.getUNDEF(VT0);
40459 if (M == SM_SentinelZero)
40460 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40461 SDValue Src0 = BC[M / 4];
40462 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40463 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40464 return Src1.getOperand(M % 2);
40465 return SDValue();
40466 };
40467 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40468 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40469 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40470 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40471 if (M0 && M1 && M2 && M3) {
40472 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40473 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40474 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40475 }
40476 }
40477 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40478 if (Ops.size() >= 2) {
40479 SDValue LHS, RHS;
40480 auto GetHOpSrc = [&](int M, int &OutM) {
40481 // TODO: Support SM_SentinelZero
40482 if (M < 0)
40483 return M == SM_SentinelUndef;
40484 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40485 if (!LHS || LHS == Src) {
40486 LHS = Src;
40487 OutM = (M % 2);
40488 return true;
40489 }
40490 if (!RHS || RHS == Src) {
40491 RHS = Src;
40492 OutM = (M % 2) + 2;
40493 return true;
40494 }
40495 return false;
40496 };
40497 int PostMask[4] = {-1, -1, -1, -1};
40498 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40499 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40500 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40501 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40502 LHS = DAG.getBitcast(SrcVT, LHS);
40503 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40504 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40505 // Use SHUFPS for the permute so this will work on SSE3 targets,
40506 // shuffle combining and domain handling will simplify this later on.
40507 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40508 Res = DAG.getBitcast(ShuffleVT, Res);
40509 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40510 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40511 }
40512 }
40513 }
40514 }
40515
40516 if (2 < Ops.size())
40517 return SDValue();
40518
40519 SDValue BC1 = BC[BC.size() - 1];
40520 if (Mask.size() == VT0.getVectorNumElements()) {
40521 // Canonicalize binary shuffles of horizontal ops that use the
40522 // same sources to a unary shuffle.
40523 // TODO: Try to perform this fold even if the shuffle remains.
40524 if (Ops.size() == 2) {
40525 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40526 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40527 };
40528 // Commute if all BC0's ops are contained in BC1.
40529 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40530 ContainsOps(BC1, BC0.getOperand(1))) {
40531 ShuffleVectorSDNode::commuteMask(Mask);
40532 std::swap(Ops[0], Ops[1]);
40533 std::swap(BC0, BC1);
40534 }
40535
40536 // If BC1 can be represented by BC0, then convert to unary shuffle.
40537 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40538 ContainsOps(BC0, BC1.getOperand(1))) {
40539 for (int &M : Mask) {
40540 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40541 continue;
40542 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40543 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40544 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40545 M += NumHalfEltsPerLane;
40546 }
40547 }
40548 }
40549
40550 // Canonicalize unary horizontal ops to only refer to lower halves.
40551 for (int i = 0; i != NumElts; ++i) {
40552 int &M = Mask[i];
40553 if (isUndefOrZero(M))
40554 continue;
40555 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40556 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40557 M -= NumHalfEltsPerLane;
40558 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40559 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40560 M -= NumHalfEltsPerLane;
40561 }
40562 }
40563
40564 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40565 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40566 // represents the LHS/RHS inputs for the lower/upper halves.
40567 SmallVector<int, 16> TargetMask128, WideMask128;
40568 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40569 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40570 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40571 bool SingleOp = (Ops.size() == 1);
40572 if (isPack || OneUseOps ||
40573 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40574 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40575 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40576 Lo = Lo.getOperand(WideMask128[0] & 1);
40577 Hi = Hi.getOperand(WideMask128[1] & 1);
40578 if (SingleOp) {
40579 SDValue Undef = DAG.getUNDEF(SrcVT);
40580 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40581 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40582 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40583 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40584 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40585 }
40586 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40587 }
40588 }
40589
40590 return SDValue();
40591}
40592
40593// Attempt to constant fold all of the constant source ops.
40594// Returns true if the entire shuffle is folded to a constant.
40595// TODO: Extend this to merge multiple constant Ops and update the mask.
40596static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40597 ArrayRef<int> Mask, SDValue Root,
40598 bool HasVariableMask,
40599 SelectionDAG &DAG,
40600 const X86Subtarget &Subtarget) {
40601 MVT VT = Root.getSimpleValueType();
40602
40603 unsigned SizeInBits = VT.getSizeInBits();
40604 unsigned NumMaskElts = Mask.size();
40605 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40606 unsigned NumOps = Ops.size();
40607
40608 // Extract constant bits from each source op.
40609 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40610 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40611 for (unsigned I = 0; I != NumOps; ++I)
40612 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40613 RawBitsOps[I]))
40614 return SDValue();
40615
40616 // If we're optimizing for size, only fold if at least one of the constants is
40617 // only used once or the combined shuffle has included a variable mask
40618 // shuffle; this is to avoid constant pool bloat.
40619 bool IsOptimizingSize = DAG.shouldOptForSize();
40620 if (IsOptimizingSize && !HasVariableMask &&
40621 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40622 return SDValue();
40623
40624 // Shuffle the constant bits according to the mask.
40625 SDLoc DL(Root);
40626 APInt UndefElts(NumMaskElts, 0);
40627 APInt ZeroElts(NumMaskElts, 0);
40628 APInt ConstantElts(NumMaskElts, 0);
40629 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40630 APInt::getZero(MaskSizeInBits));
40631 for (unsigned i = 0; i != NumMaskElts; ++i) {
40632 int M = Mask[i];
40633 if (M == SM_SentinelUndef) {
40634 UndefElts.setBit(i);
40635 continue;
40636 } else if (M == SM_SentinelZero) {
40637 ZeroElts.setBit(i);
40638 continue;
40639 }
40640 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40641
40642 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40643 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40644
40645 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40646 if (SrcUndefElts[SrcMaskIdx]) {
40647 UndefElts.setBit(i);
40648 continue;
40649 }
40650
40651 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40652 APInt &Bits = SrcEltBits[SrcMaskIdx];
40653 if (!Bits) {
40654 ZeroElts.setBit(i);
40655 continue;
40656 }
40657
40658 ConstantElts.setBit(i);
40659 ConstantBitData[i] = Bits;
40660 }
40661 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40662
40663 // Attempt to create a zero vector.
40664 if ((UndefElts | ZeroElts).isAllOnes())
40665 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40666
40667 // Create the constant data.
40668 MVT MaskSVT;
40669 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40670 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40671 else
40672 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40673
40674 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40675 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40676 return SDValue();
40677
40678 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40679 return DAG.getBitcast(VT, CstOp);
40680}
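Conceptually, combineX86ShufflesConstants above is indexing into the concatenated constant operands element by element (with undef/zero bookkeeping on top). A simplified standalone sketch for 32-bit constants, with sentinels folded to zero for brevity (names invented):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative: fold shuffle(C0, C1, Mask) where C0/C1 are constant vectors.
// Mask index M selects element M % N of operand M / N; undef (-1) and zero
// (-2) sentinels are both treated as zero here for simplicity.
static std::vector<uint32_t>
foldConstantShuffle(const std::vector<std::vector<uint32_t>> &Ops,
                    const std::vector<int> &Mask) {
  std::vector<uint32_t> Result;
  const std::size_t N = Mask.size();
  for (int M : Mask)
    Result.push_back(M < 0 ? 0u : Ops[M / N][M % N]);
  return Result;
}
// e.g. Ops = {{1,2,3,4},{5,6,7,8}}, Mask = {0, 4, -2, 7} -> {1, 5, 0, 8}.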
40681
40682namespace llvm {
40683 namespace X86 {
40684 enum {
40685 MaxShuffleCombineDepth = 8
40686 };
40687 }
40688} // namespace llvm
40689
40690/// Fully generic combining of x86 shuffle instructions.
40691///
40692/// This should be the last combine run over the x86 shuffle instructions. Once
40693/// they have been fully optimized, this will recursively consider all chains
40694/// of single-use shuffle instructions, build a generic model of the cumulative
40695/// shuffle operation, and check for simpler instructions which implement this
40696/// operation. We use this primarily for two purposes:
40697///
40698/// 1) Collapse generic shuffles to specialized single instructions when
40699/// equivalent. In most cases, this is just an encoding size win, but
40700/// sometimes we will collapse multiple generic shuffles into a single
40701/// special-purpose shuffle.
40702/// 2) Look for sequences of shuffle instructions with 3 or more total
40703/// instructions, and replace them with the slightly more expensive SSSE3
40704/// PSHUFB instruction if available. We do this as the last combining step
40705/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40706/// a suitable short sequence of other instructions. The PSHUFB will either
40707/// use a register or have to read from memory and so is slightly (but only
40708/// slightly) more expensive than the other shuffle instructions.
40709///
40710/// Because this is inherently a quadratic operation (for each shuffle in
40711/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40712/// This should never be an issue in practice as the shuffle lowering doesn't
40713/// produce sequences of more than 8 instructions.
40714///
40715/// FIXME: We will currently miss some cases where the redundant shuffling
40716/// would simplify under the threshold for PSHUFB formation because of
40717/// combine-ordering. To fix this, we should do the redundant instruction
40718/// combining in this recursive walk.
40719static SDValue combineX86ShufflesRecursively(
40720 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40721 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40722 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40723 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40724 const X86Subtarget &Subtarget) {
40725 assert(!RootMask.empty() &&
40726 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40727 "Illegal shuffle root mask");
40728 MVT RootVT = Root.getSimpleValueType();
40729 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40730 unsigned RootSizeInBits = RootVT.getSizeInBits();
40731
40732 // Bound the depth of our recursive combine because this is ultimately
40733 // quadratic in nature.
40734 if (Depth >= MaxDepth)
40735 return SDValue();
40736
40737 // Directly rip through bitcasts to find the underlying operand.
40738 SDValue Op = SrcOps[SrcOpIndex];
40739 Op = peekThroughOneUseBitcasts(Op);
40740
40741 EVT VT = Op.getValueType();
40742 if (!VT.isVector() || !VT.isSimple())
40743 return SDValue(); // Bail if we hit a non-simple non-vector.
40744
40745 // FIXME: Just bail on f16 for now.
40746 if (VT.getVectorElementType() == MVT::f16)
40747 return SDValue();
40748
40749 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40750 "Can only combine shuffles upto size of the root op.");
40751
40752 // Create a demanded elts mask from the referenced elements of Op.
40753 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40754 for (int M : RootMask) {
40755 int BaseIdx = RootMask.size() * SrcOpIndex;
40756 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40757 OpDemandedElts.setBit(M - BaseIdx);
40758 }
40759 if (RootSizeInBits != VT.getSizeInBits()) {
40760 // Op is smaller than Root - extract the demanded elts for the subvector.
40761 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40762 unsigned NumOpMaskElts = RootMask.size() / Scale;
40763 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40764 assert(OpDemandedElts
40765 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40766 .isZero() &&
40767 "Out of range elements referenced in root mask");
40768 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40769 }
40770 OpDemandedElts =
40771 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40772
40773 // Extract target shuffle mask and resolve sentinels and inputs.
40774 SmallVector<int, 64> OpMask;
40775 SmallVector<SDValue, 2> OpInputs;
40776 APInt OpUndef, OpZero;
40777 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40778 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40779 OpZero, DAG, Depth, false)) {
40780 // Shuffle inputs must not be larger than the shuffle result.
40781 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40782 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40783 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40784 }))
40785 return SDValue();
40786 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40787 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40788 !isNullConstant(Op.getOperand(1))) {
40789 SDValue SrcVec = Op.getOperand(0);
40790 int ExtractIdx = Op.getConstantOperandVal(1);
40791 unsigned NumElts = VT.getVectorNumElements();
40792 OpInputs.assign({SrcVec});
40793 OpMask.assign(NumElts, SM_SentinelUndef);
40794 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40795 OpZero = OpUndef = APInt::getZero(NumElts);
40796 } else {
40797 return SDValue();
40798 }
40799
40800 // If the shuffle result was smaller than the root, we need to adjust the
40801 // mask indices and pad the mask with undefs.
40802 if (RootSizeInBits > VT.getSizeInBits()) {
40803 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40804 unsigned OpMaskSize = OpMask.size();
40805 if (OpInputs.size() > 1) {
40806 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40807 for (int &M : OpMask) {
40808 if (M < 0)
40809 continue;
40810 int EltIdx = M % OpMaskSize;
40811 int OpIdx = M / OpMaskSize;
40812 M = (PaddedMaskSize * OpIdx) + EltIdx;
40813 }
40814 }
40815 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40816 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40817 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40818 }
40819
40820 SmallVector<int, 64> Mask;
40821 SmallVector<SDValue, 16> Ops;
40822
40823 // We don't need to merge masks if the root is empty.
40824 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40825 if (EmptyRoot) {
40826 // Only resolve zeros if it will remove an input, otherwise we might end
40827 // up in an infinite loop.
40828 bool ResolveKnownZeros = true;
40829 if (!OpZero.isZero()) {
40830 APInt UsedInputs = APInt::getZero(OpInputs.size());
40831 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40832 int M = OpMask[i];
40833 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40834 continue;
40835 UsedInputs.setBit(M / OpMask.size());
40836 if (UsedInputs.isAllOnes()) {
40837 ResolveKnownZeros = false;
40838 break;
40839 }
40840 }
40841 }
40842 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40843 ResolveKnownZeros);
40844
40845 Mask = OpMask;
40846 Ops.append(OpInputs.begin(), OpInputs.end());
40847 } else {
40848 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40849
40850 // Add the inputs to the Ops list, avoiding duplicates.
40851 Ops.append(SrcOps.begin(), SrcOps.end());
40852
40853 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40854 // Attempt to find an existing match.
40855 SDValue InputBC = peekThroughBitcasts(Input);
40856 for (int i = 0, e = Ops.size(); i < e; ++i)
40857 if (InputBC == peekThroughBitcasts(Ops[i]))
40858 return i;
40859 // Match failed - should we replace an existing Op?
40860 if (InsertionPoint >= 0) {
40861 Ops[InsertionPoint] = Input;
40862 return InsertionPoint;
40863 }
40864 // Add to the end of the Ops list.
40865 Ops.push_back(Input);
40866 return Ops.size() - 1;
40867 };
40868
40869 SmallVector<int, 2> OpInputIdx;
40870 for (SDValue OpInput : OpInputs)
40871 OpInputIdx.push_back(
40872 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40873
40874 assert(((RootMask.size() > OpMask.size() &&
40875 RootMask.size() % OpMask.size() == 0) ||
40876 (OpMask.size() > RootMask.size() &&
40877 OpMask.size() % RootMask.size() == 0) ||
40878 OpMask.size() == RootMask.size()) &&
40879 "The smaller number of elements must divide the larger.");
40880
40881 // This function can be performance-critical, so we rely on the power-of-2
40882 // knowledge that we have about the mask sizes to replace div/rem ops with
40883 // bit-masks and shifts.
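// (For a power-of-2 size N, i % N == (i & (N - 1)) and i / N == (i >> log2(N));
// that identity is all the masking and shifting below relies on.)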
40884 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40885 "Non-power-of-2 shuffle mask sizes");
40886 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40887 "Non-power-of-2 shuffle mask sizes");
40888 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40889 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40890
40891 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40892 unsigned RootRatio =
40893 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40894 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40895 assert((RootRatio == 1 || OpRatio == 1) &&
40896 "Must not have a ratio for both incoming and op masks!");
40897
40898 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40899 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40900 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40901 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40902 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40903
40904 Mask.resize(MaskWidth, SM_SentinelUndef);
40905
40906 // Merge this shuffle operation's mask into our accumulated mask. Note that
40907 // this shuffle's mask will be the first applied to the input, followed by
40908 // the root mask to get us all the way to the root value arrangement. The
40909 // reason for this order is that we are recursing up the operation chain.
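// (E.g. with equal-sized masks and a single input, this composes to
// Mask[i] = OpMask[RootMask[i]]: RootMask <2,0,1,3> over OpMask <1,0,3,2>
// gives <3,1,0,2>.)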
40910 for (unsigned i = 0; i < MaskWidth; ++i) {
40911 unsigned RootIdx = i >> RootRatioLog2;
40912 if (RootMask[RootIdx] < 0) {
40913 // This is a zero or undef lane, we're done.
40914 Mask[i] = RootMask[RootIdx];
40915 continue;
40916 }
40917
40918 unsigned RootMaskedIdx =
40919 RootRatio == 1
40920 ? RootMask[RootIdx]
40921 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40922
40923 // Just insert the scaled root mask value if it references an input other
40924 // than the SrcOp we're currently inserting.
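// (Only indices in [SrcOpIndex * MaskWidth, (SrcOpIndex + 1) * MaskWidth)
// select from the op currently being merged.)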
40925 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40926 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40927 Mask[i] = RootMaskedIdx;
40928 continue;
40929 }
40930
40931 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40932 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40933 if (OpMask[OpIdx] < 0) {
40934 // The incoming lanes are zero or undef; it doesn't matter which ones we
40935 // are using.
40936 Mask[i] = OpMask[OpIdx];
40937 continue;
40938 }
40939
40940 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40941 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40942 : (OpMask[OpIdx] << OpRatioLog2) +
40943 (RootMaskedIdx & (OpRatio - 1));
40944
40945 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40946 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40947 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40948 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40949
40950 Mask[i] = OpMaskedIdx;
40951 }
40952 }
40953
40954 // Peek through vector widenings and set out of bounds mask indices to undef.
40955 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40956 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40957 SDValue &Op = Ops[I];
40958 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40959 isNullConstant(Op.getOperand(2))) {
40960 Op = Op.getOperand(1);
40961 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40962 int Lo = I * Mask.size();
40963 int Hi = (I + 1) * Mask.size();
40964 int NewHi = Lo + (Mask.size() / Scale);
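// Mask values in [NewHi, Hi) index the undef padding of the widened Op, so
// they can safely be marked undef below.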
40965 for (int &M : Mask) {
40966 if (Lo <= M && NewHi <= M && M < Hi)
40967 M = SM_SentinelUndef;
40968 }
40969 }
40970 }
40971
40972 // Peek through any free extract_subvector nodes back to root size.
40973 for (SDValue &Op : Ops)
40974 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40975 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40976 isNullConstant(Op.getOperand(1)))
40977 Op = Op.getOperand(0);
40978
40979 // Remove unused/repeated shuffle source ops.
40980 resolveTargetShuffleInputsAndMask(Ops, Mask);
40981
40982 // Handle the all undef/zero/ones cases early.
40983 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
40984 return DAG.getUNDEF(RootVT);
40985 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
40986 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
40987 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
40988 !llvm::is_contained(Mask, SM_SentinelZero))
40989 return getOnesVector(RootVT, DAG, SDLoc(Root));
40990
40991 assert(!Ops.empty() && "Shuffle with no inputs detected");
40992 HasVariableMask |= IsOpVariableMask;
40993
40994 // Update the list of shuffle nodes that have been combined so far.
40995 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
40996 SrcNodes.end());
40997 CombinedNodes.push_back(Op.getNode());
40998
40999 // See if we can recurse into each shuffle source op (if it's a target
41000 // shuffle). The source op should only be generally combined if it either has
41001 // a single use (i.e. current Op) or all its users have already been combined;
41002 // if not, we can still combine but should prevent generation of variable
41003 // shuffles to avoid constant pool bloat.
41004 // Don't recurse if we already have more source ops than we can combine in
41005 // the remaining recursion depth.
41006 if (Ops.size() < (MaxDepth - Depth)) {
41007 for (int i = 0, e = Ops.size(); i < e; ++i) {
41008 // For empty roots, we need to resolve zeroable elements before combining
41009 // them with other shuffles.
41010 SmallVector<int, 64> ResolvedMask = Mask;
41011 if (EmptyRoot)
41012 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41013 bool AllowCrossLaneVar = false;
41014 bool AllowPerLaneVar = false;
41015 if (Ops[i].getNode()->hasOneUse() ||
41016 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41017 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41018 AllowPerLaneVar = AllowVariablePerLaneMask;
41019 }
41020 if (SDValue Res = combineX86ShufflesRecursively(
41021 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41022 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41023 Subtarget))
41024 return Res;
41025 }
41026 }
41027
41028 // Attempt to constant fold all of the constant source ops.
41029 if (SDValue Cst = combineX86ShufflesConstants(
41030 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41031 return Cst;
41032
41033 // If constant folding failed and we only have constants, then we have
41034 // multiple uses by a single non-variable shuffle - just bail.
41035 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41036 APInt UndefElts;
41037 SmallVector<APInt> RawBits;
41038 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41039 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41040 RawBits);
41041 })) {
41042 return SDValue();
41043 }
41044
41045 // Canonicalize the combined shuffle mask chain with horizontal ops.
41046 // NOTE: This will update the Ops and Mask.
41047 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41048 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41049 return DAG.getBitcast(RootVT, HOp);
41050
41051 // Try to refine our inputs given our knowledge of target shuffle mask.
41052 for (auto I : enumerate(Ops)) {
41053 int OpIdx = I.index();
41054 SDValue &Op = I.value();
41055
41056 // What range of shuffle mask element values results in picking from Op?
41057 int Lo = OpIdx * Mask.size();
41058 int Hi = Lo + Mask.size();
41059
41060 // Which elements of Op do we demand, given the mask's granularity?
41061 APInt OpDemandedElts(Mask.size(), 0);
41062 for (int MaskElt : Mask) {
41063 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41064 int OpEltIdx = MaskElt - Lo;
41065 OpDemandedElts.setBit(OpEltIdx);
41066 }
41067 }
41068
41069 // Is the shuffle result smaller than the root?
41070 if (Op.getValueSizeInBits() < RootSizeInBits) {
41071 // We padded the mask with undefs. But we now need to undo that.
41072 unsigned NumExpectedVectorElts = Mask.size();
41073 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41074 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41075 assert(!OpDemandedElts.extractBits(
41076 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41077 "Demanding the virtual undef widening padding?");
41078 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41079 }
41080
41081 // The Op itself may be of different VT, so we need to scale the mask.
41082 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41083 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41084
41085 // Can this operand be simplified any further, given its demanded elements?
41086 if (SDValue NewOp =
41087 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41088 Op, OpScaledDemandedElts, DAG))
41089 Op = NewOp;
41090 }
41091 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41092
41093 // Widen any subvector shuffle inputs we've collected.
41094 // TODO: Remove this to avoid generating temporary nodes, we should only
41095 // widen once combineX86ShuffleChain has found a match.
41096 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41097 return Op.getValueSizeInBits() < RootSizeInBits;
41098 })) {
41099 for (SDValue &Op : Ops)
41100 if (Op.getValueSizeInBits() < RootSizeInBits)
41101 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41102 RootSizeInBits);
41103 // Reresolve - we might have repeated subvector sources.
41104 resolveTargetShuffleInputsAndMask(Ops, Mask);
41105 }
41106
41107 // We can only combine unary and binary shuffle mask cases.
41108 if (Ops.size() <= 2) {
41109 // Minor canonicalization of the accumulated shuffle mask to make it easier
41110 // to match below. All this does is detect masks with sequential pairs of
41111 // elements, and shrink them to the half-width mask. It does this in a loop
41112 // so it will reduce the size of the mask to the minimal width mask which
41113 // performs an equivalent shuffle.
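// (E.g. a v4i32-style mask <0,1,4,5> widens to the v2i64-style mask <0,2>.)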
41114 while (Mask.size() > 1) {
41115 SmallVector<int, 64> WidenedMask;
41116 if (!canWidenShuffleElements(Mask, WidenedMask))
41117 break;
41118 Mask = std::move(WidenedMask);
41119 }
41120
41121 // Canonicalization of binary shuffle masks to improve pattern matching by
41122 // commuting the inputs.
41123 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41124 ShuffleVectorSDNode::commuteMask(Mask);
41125 std::swap(Ops[0], Ops[1]);
41126 }
41127
41128 // Try to combine into a single shuffle instruction.
41129 if (SDValue Shuffle = combineX86ShuffleChain(
41130 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41131 AllowVariablePerLaneMask, DAG, Subtarget))
41132 return Shuffle;
41133
41134 // If all the operands come from the same larger vector, fallthrough and try
41135 // to use combineX86ShuffleChainWithExtract.
41136 SDValue LHS = peekThroughBitcasts(Ops.front());
41137 SDValue RHS = peekThroughBitcasts(Ops.back());
41138 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41139 (RootSizeInBits / Mask.size()) != 64 ||
41140 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41141 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41142 LHS.getOperand(0) != RHS.getOperand(0))
41143 return SDValue();
41144 }
41145
41146 // If that failed and any input is extracted then try to combine as a
41147 // shuffle with the larger type.
41148 return combineX86ShuffleChainWithExtract(
41149 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41150 AllowVariablePerLaneMask, DAG, Subtarget);
41151}
41152
41153/// Helper entry wrapper to combineX86ShufflesRecursively.
41154static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41155 const X86Subtarget &Subtarget) {
41156 return combineX86ShufflesRecursively(
41157 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41158 /*HasVarMask*/ false,
41159 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41160 Subtarget);
41161}
41162
41163/// Get the PSHUF-style mask from a PSHUF node.
41164///
41165/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41166/// PSHUF-style masks that can be reused with such instructions.
41167static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41168 MVT VT = N.getSimpleValueType();
41169 SmallVector<int, 4> Mask;
41170 SmallVector<SDValue, 2> Ops;
41171 bool HaveMask =
41172 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41173 (void)HaveMask;
41174 assert(HaveMask);
41175
41176 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41177 // matter. Check that the upper masks are repeats and remove them.
41178 if (VT.getSizeInBits() > 128) {
41179 int LaneElts = 128 / VT.getScalarSizeInBits();
41180#ifndef NDEBUG
41181 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41182 for (int j = 0; j < LaneElts; ++j)
41183 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41184 "Mask doesn't repeat in high 128-bit lanes!");
41185#endif
41186 Mask.resize(LaneElts);
41187 }
41188
41189 switch (N.getOpcode()) {
41190 case X86ISD::PSHUFD:
41191 return Mask;
41192 case X86ISD::PSHUFLW:
41193 Mask.resize(4);
41194 return Mask;
41195 case X86ISD::PSHUFHW:
41196 Mask.erase(Mask.begin(), Mask.begin() + 4);
41197 for (int &M : Mask)
41198 M -= 4;
41199 return Mask;
41200 default:
41201 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41201)
;
41202 }
41203}
41204
41205/// Search for a combinable shuffle across a chain ending in pshufd.
41206///
41207/// We walk up the chain and look for a combinable shuffle, skipping over
41208/// shuffles that we could hoist this shuffle's transformation past without
41209/// altering anything.
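/// E.g. back-to-back pshufd nodes are merged by composing their dword masks.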
41210static SDValue
41211combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41212 SelectionDAG &DAG) {
41213 assert(N.getOpcode() == X86ISD::PSHUFD &&
41214 "Called with something other than an x86 128-bit half shuffle!");
41215 SDLoc DL(N);
41216
41217 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41218 // of the shuffles in the chain so that we can form a fresh chain to replace
41219 // this one.
41220 SmallVector<SDValue, 8> Chain;
41221 SDValue V = N.getOperand(0);
41222 for (; V.hasOneUse(); V = V.getOperand(0)) {
41223 switch (V.getOpcode()) {
41224 default:
41225 return SDValue(); // Nothing combined!
41226
41227 case ISD::BITCAST:
41228 // Skip bitcasts as we always know the type for the target specific
41229 // instructions.
41230 continue;
41231
41232 case X86ISD::PSHUFD:
41233 // Found another dword shuffle.
41234 break;
41235
41236 case X86ISD::PSHUFLW:
41237 // Check that the low words (being shuffled) are the identity in the
41238 // dword shuffle, and the high words are self-contained.
41239 if (Mask[0] != 0 || Mask[1] != 1 ||
41240 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41241 return SDValue();
41242
41243 Chain.push_back(V);
41244 continue;
41245
41246 case X86ISD::PSHUFHW:
41247 // Check that the high words (being shuffled) are the identity in the
41248 // dword shuffle, and the low words are self-contained.
41249 if (Mask[2] != 2 || Mask[3] != 3 ||
41250 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41251 return SDValue();
41252
41253 Chain.push_back(V);
41254 continue;
41255
41256 case X86ISD::UNPCKL:
41257 case X86ISD::UNPCKH:
41258 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41259 // shuffle into a preceding word shuffle.
41260 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41261 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41262 return SDValue();
41263
41264 // Search for a half-shuffle which we can combine with.
41265 unsigned CombineOp =
41266 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41267 if (V.getOperand(0) != V.getOperand(1) ||
41268 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41269 return SDValue();
41270 Chain.push_back(V);
41271 V = V.getOperand(0);
41272 do {
41273 switch (V.getOpcode()) {
41274 default:
41275 return SDValue(); // Nothing to combine.
41276
41277 case X86ISD::PSHUFLW:
41278 case X86ISD::PSHUFHW:
41279 if (V.getOpcode() == CombineOp)
41280 break;
41281
41282 Chain.push_back(V);
41283
41284 [[fallthrough]];
41285 case ISD::BITCAST:
41286 V = V.getOperand(0);
41287 continue;
41288 }
41289 break;
41290 } while (V.hasOneUse());
41291 break;
41292 }
41293 // Break out of the loop if we break out of the switch.
41294 break;
41295 }
41296
41297 if (!V.hasOneUse())
41298 // We fell out of the loop without finding a viable combining instruction.
41299 return SDValue();
41300
41301 // Merge this node's mask and our incoming mask.
41302 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41303 for (int &M : Mask)
41304 M = VMask[M];
41305 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41306 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41307
41308 // Rebuild the chain around this new shuffle.
41309 while (!Chain.empty()) {
41310 SDValue W = Chain.pop_back_val();
41311
41312 if (V.getValueType() != W.getOperand(0).getValueType())
41313 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41314
41315 switch (W.getOpcode()) {
41316 default:
41317 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41317)
;
41318
41319 case X86ISD::UNPCKL:
41320 case X86ISD::UNPCKH:
41321 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41322 break;
41323
41324 case X86ISD::PSHUFD:
41325 case X86ISD::PSHUFLW:
41326 case X86ISD::PSHUFHW:
41327 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41328 break;
41329 }
41330 }
41331 if (V.getValueType() != N.getValueType())
41332 V = DAG.getBitcast(N.getValueType(), V);
41333
41334 // Return the new chain to replace N.
41335 return V;
41336}
41337
41338// Attempt to commute shufps LHS loads:
41339// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
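// Commuting puts the foldable load in shufps' second (memory-capable) operand;
// the shufps imm8 nibbles are swapped and the surrounding shuffle's immediate
// adjusted to compensate.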
41340static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41341 SelectionDAG &DAG) {
41342 // TODO: Add vXf64 support.
41343 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41344 return SDValue();
41345
41346 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41347 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41348 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41349 return SDValue();
41350 SDValue N0 = V.getOperand(0);
41351 SDValue N1 = V.getOperand(1);
41352 unsigned Imm = V.getConstantOperandVal(2);
41353 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41354 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41355 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41356 return SDValue();
41357 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41358 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41359 DAG.getTargetConstant(Imm, DL, MVT::i8));
41360 };
41361
41362 switch (N.getOpcode()) {
41363 case X86ISD::VPERMILPI:
41364 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41365 unsigned Imm = N.getConstantOperandVal(1);
41366 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41367 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41368 }
41369 break;
41370 case X86ISD::SHUFP: {
41371 SDValue N0 = N.getOperand(0);
41372 SDValue N1 = N.getOperand(1);
41373 unsigned Imm = N.getConstantOperandVal(2);
41374 if (N0 == N1) {
41375 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41376 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41377 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41378 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41379 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41380 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41381 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41382 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41383 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41384 }
41385 break;
41386 }
41387 }
41388
41389 return SDValue();
41390}
41391
41392// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
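// E.g. pshufd(and(x, c)) -> and(pshufd(x), pshufd(c)) when the shuffle can fold
// into the binop's operands (constants, splats, other shuffles) for free.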
41393static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41394 const SDLoc &DL) {
41395 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41396 EVT ShuffleVT = N.getValueType();
41397
41398 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41399 // AllZeros/AllOnes constants are freely shuffled and will peek through
41400 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41401 // merge with target shuffles if it has one use so shuffle combining is
41402 // likely to kick in. Shuffles of splats are expected to be removed.
41403 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41404 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41405 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41406 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41407 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41408 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41409 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41410 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41411 };
41412 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41413 // Ensure we only shuffle whole vector src elements, unless it's a logical
41414 // binop where we can more aggressively move shuffles from dst to src.
41415 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41416 BinOp == X86ISD::ANDNP ||
41417 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41418 };
41419
41420 unsigned Opc = N.getOpcode();
41421 switch (Opc) {
41422 // Unary and Unary+Permute Shuffles.
41423 case X86ISD::PSHUFB: {
41424 // Don't merge PSHUFB if it contains zero'd elements.
41425 SmallVector<int> Mask;
41426 SmallVector<SDValue> Ops;
41427 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41428 Mask))
41429 break;
41430 [[fallthrough]];
41431 }
41432 case X86ISD::VBROADCAST:
41433 case X86ISD::MOVDDUP:
41434 case X86ISD::PSHUFD:
41435 case X86ISD::PSHUFHW:
41436 case X86ISD::PSHUFLW:
41437 case X86ISD::VPERMI:
41438 case X86ISD::VPERMILPI: {
41439 if (N.getOperand(0).getValueType() == ShuffleVT &&
41440 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41441 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41442 unsigned SrcOpcode = N0.getOpcode();
41443 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41444 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41445 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41446 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41447 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41448 SDValue LHS, RHS;
41449 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41450 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41451 if (N.getNumOperands() == 2) {
41452 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41453 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41454 } else {
41455 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41456 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41457 }
41458 EVT OpVT = N0.getValueType();
41459 return DAG.getBitcast(ShuffleVT,
41460 DAG.getNode(SrcOpcode, DL, OpVT,
41461 DAG.getBitcast(OpVT, LHS),
41462 DAG.getBitcast(OpVT, RHS)));
41463 }
41464 }
41465 }
41466 break;
41467 }
41468 // Binary and Binary+Permute Shuffles.
41469 case X86ISD::INSERTPS: {
41470 // Don't merge INSERTPS if it contains zero'd elements.
41471 unsigned InsertPSMask = N.getConstantOperandVal(2);
41472 unsigned ZeroMask = InsertPSMask & 0xF;
41473 if (ZeroMask != 0)
41474 break;
41475 [[fallthrough]];
41476 }
41477 case X86ISD::MOVSD:
41478 case X86ISD::MOVSS:
41479 case X86ISD::BLENDI:
41480 case X86ISD::SHUFP:
41481 case X86ISD::UNPCKH:
41482 case X86ISD::UNPCKL: {
41483 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41484 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41485 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41486 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41487 unsigned SrcOpcode = N0.getOpcode();
41488 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41489 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41490 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41491 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41492 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41493 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41494 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41495 // Ensure the total number of shuffles doesn't increase by folding this
41496 // shuffle through to the source ops.
41497 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41498 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41499 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41500 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41501 SDValue LHS, RHS;
41502 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41503 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41504 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41505 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41506 if (N.getNumOperands() == 3) {
41507 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41508 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41509 } else {
41510 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41511 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41512 }
41513 EVT OpVT = N0.getValueType();
41514 return DAG.getBitcast(ShuffleVT,
41515 DAG.getNode(SrcOpcode, DL, OpVT,
41516 DAG.getBitcast(OpVT, LHS),
41517 DAG.getBitcast(OpVT, RHS)));
41518 }
41519 }
41520 }
41521 break;
41522 }
41523 }
41524 return SDValue();
41525}
41526
41527/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
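/// E.g. vperm2x128(movddup(x), movddup(y)) -> movddup(vperm2x128(x, y)).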
41528static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41529 SelectionDAG &DAG,
41530 const SDLoc &DL) {
41531 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41532
41533 MVT VT = V.getSimpleValueType();
41534 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41535 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41536 unsigned SrcOpc0 = Src0.getOpcode();
41537 unsigned SrcOpc1 = Src1.getOpcode();
41538 EVT SrcVT0 = Src0.getValueType();
41539 EVT SrcVT1 = Src1.getValueType();
41540
41541 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41542 return SDValue();
41543
41544 switch (SrcOpc0) {
41545 case X86ISD::MOVDDUP: {
41546 SDValue LHS = Src0.getOperand(0);
41547 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41548 SDValue Res =
41549 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41550 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41551 return DAG.getBitcast(VT, Res);
41552 }
41553 case X86ISD::VPERMILPI:
41554 // TODO: Handle v4f64 permutes with different low/high lane masks.
41555 if (SrcVT0 == MVT::v4f64) {
41556 uint64_t Mask = Src0.getConstantOperandVal(1);
41557 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41558 break;
41559 }
41560 [[fallthrough]];
41561 case X86ISD::VSHLI:
41562 case X86ISD::VSRLI:
41563 case X86ISD::VSRAI:
41564 case X86ISD::PSHUFD:
41565 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41566 SDValue LHS = Src0.getOperand(0);
41567 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41568 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41569 V.getOperand(2));
41570 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41571 return DAG.getBitcast(VT, Res);
41572 }
41573 break;
41574 }
41575
41576 return SDValue();
41577}
41578
41579/// Try to combine x86 target specific shuffles.
41580static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41581 TargetLowering::DAGCombinerInfo &DCI,
41582 const X86Subtarget &Subtarget) {
41583 SDLoc DL(N);
41584 MVT VT = N.getSimpleValueType();
41585 SmallVector<int, 4> Mask;
41586 unsigned Opcode = N.getOpcode();
41587
41588 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41589 return R;
41590
41591 // Handle specific target shuffles.
41592 switch (Opcode) {
41593 case X86ISD::MOVDDUP: {
41594 SDValue Src = N.getOperand(0);
41595 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41596 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41597 ISD::isNormalLoad(Src.getNode())) {
41598 LoadSDNode *LN = cast<LoadSDNode>(Src);
41599 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41600 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41601 DCI.CombineTo(N.getNode(), Movddup);
41602 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41603 DCI.recursivelyDeleteUnusedNodes(LN);
41604 return N; // Return N so it doesn't get rechecked!
41605 }
41606 }
41607
41608 return SDValue();
41609 }
41610 case X86ISD::VBROADCAST: {
41611 SDValue Src = N.getOperand(0);
41612 SDValue BC = peekThroughBitcasts(Src);
41613 EVT SrcVT = Src.getValueType();
41614 EVT BCVT = BC.getValueType();
41615
41616 // If broadcasting from another shuffle, attempt to simplify it.
41617 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41618 if (isTargetShuffle(BC.getOpcode()) &&
41619 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41620 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41621 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41622 SM_SentinelUndef);
41623 for (unsigned i = 0; i != Scale; ++i)
41624 DemandedMask[i] = i;
41625 if (SDValue Res = combineX86ShufflesRecursively(
41626 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41627 X86::MaxShuffleCombineDepth,
41628 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41629 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41630 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41631 DAG.getBitcast(SrcVT, Res));
41632 }
41633
41634 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41635 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41636 if (Src.getOpcode() == ISD::BITCAST &&
41637 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41638 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41639 FixedVectorType::isValidElementType(
41640 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41641 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41642 VT.getVectorNumElements());
41643 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41644 }
41645
41646 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41647 // If we're re-broadcasting a smaller type then broadcast with that type and
41648 // bitcast.
41649 // TODO: Do this for any splat?
41650 if (Src.getOpcode() == ISD::BITCAST &&
41651 (BC.getOpcode() == X86ISD::VBROADCAST ||
41652 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41653 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41654 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41655 MVT NewVT =
41656 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41657 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41658 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41659 }
41660
41661 // Reduce broadcast source vector to lowest 128-bits.
41662 if (SrcVT.getSizeInBits() > 128)
41663 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41664 extract128BitVector(Src, 0, DAG, DL));
41665
41666 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41667 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41668 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41669
41670 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41671 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41672 isNullConstant(Src.getOperand(1)) &&
41673 DAG.getTargetLoweringInfo().isTypeLegal(
41674 Src.getOperand(0).getValueType()))
41675 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41676
41677 // Share broadcast with the longest vector and extract low subvector (free).
41678 // Ensure the same SDValue from the SDNode use is being used.
41679 for (SDNode *User : Src->uses())
41680 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41681 Src == User->getOperand(0) &&
41682 User->getValueSizeInBits(0).getFixedValue() >
41683 VT.getFixedSizeInBits()) {
41684 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41685 VT.getSizeInBits());
41686 }
41687
41688 // vbroadcast(scalarload X) -> vbroadcast_load X
41689 // For float loads, extract other uses of the scalar from the broadcast.
41690 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41691 ISD::isNormalLoad(Src.getNode())) {
41692 LoadSDNode *LN = cast<LoadSDNode>(Src);
41693 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41694 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41695 SDValue BcastLd =
41696 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41697 LN->getMemoryVT(), LN->getMemOperand());
41698 // If the load value is used only by N, replace it via CombineTo N.
41699 bool NoReplaceExtract = Src.hasOneUse();
41700 DCI.CombineTo(N.getNode(), BcastLd);
41701 if (NoReplaceExtract) {
41702 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41703 DCI.recursivelyDeleteUnusedNodes(LN);
41704 } else {
41705 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41706 DAG.getIntPtrConstant(0, DL));
41707 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41708 }
41709 return N; // Return N so it doesn't get rechecked!
41710 }
41711
41712 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41713 // i16. So shrink it ourselves if we can make a broadcast_load.
41714 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41715 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41716 assert(Subtarget.hasAVX2() && "Expected AVX2");
41717 SDValue TruncIn = Src.getOperand(0);
41718
41719 // If this is a truncate of a non-extending load, we can just narrow it to
41720 // use a broadcast_load.
41721 if (ISD::isNormalLoad(TruncIn.getNode())) {
41722 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41724 // Unless it's volatile or atomic.
41724 if (LN->isSimple()) {
41725 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41726 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41727 SDValue BcastLd = DAG.getMemIntrinsicNode(
41728 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41729 LN->getPointerInfo(), LN->getOriginalAlign(),
41730 LN->getMemOperand()->getFlags());
41731 DCI.CombineTo(N.getNode(), BcastLd);
41732 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41733 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41734 return N; // Return N so it doesn't get rechecked!
41735 }
41736 }
41737
41738 // If this is a truncate of an i16 extload, we can directly replace it.
41739 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41740 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41741 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41742 if (LN->getMemoryVT().getSizeInBits() == 16) {
41743 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41744 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41745 SDValue BcastLd =
41746 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41747 LN->getMemoryVT(), LN->getMemOperand());
41748 DCI.CombineTo(N.getNode(), BcastLd);
41749 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41750 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41751 return N; // Return N so it doesn't get rechecked!
41752 }
41753 }
41754
41755 // If this is a truncate of a load that has been shifted right, we can
41756 // offset the pointer and use a narrower load.
41757 if (TruncIn.getOpcode() == ISD::SRL &&
41758 TruncIn.getOperand(0).hasOneUse() &&
41759 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41760 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41761 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41762 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41763 // Make sure the shift amount and the load size are divisible by 16.
41764 // Don't do this if the load is volatile or atomic.
41765 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41766 LN->isSimple()) {
41767 unsigned Offset = ShiftAmt / 8;
41768 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41769 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41770 TypeSize::Fixed(Offset), DL);
41771 SDValue Ops[] = { LN->getChain(), Ptr };
41772 SDValue BcastLd = DAG.getMemIntrinsicNode(
41773 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41774 LN->getPointerInfo().getWithOffset(Offset),
41775 LN->getOriginalAlign(),
41776 LN->getMemOperand()->getFlags());
41777 DCI.CombineTo(N.getNode(), BcastLd);
41778 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41779 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41780 return N; // Return N so it doesn't get rechecked!
41781 }
41782 }
41783 }
41784
41785 // vbroadcast(vzload X) -> vbroadcast_load X
41786 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41787 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41788 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41789 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41790 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41791 SDValue BcastLd =
41792 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41793 LN->getMemoryVT(), LN->getMemOperand());
41794 DCI.CombineTo(N.getNode(), BcastLd);
41795 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41796 DCI.recursivelyDeleteUnusedNodes(LN);
41797 return N; // Return N so it doesn't get rechecked!
41798 }
41799 }
41800
41801 // vbroadcast(vector load X) -> vbroadcast_load
41802 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41803 SrcVT == MVT::v4i32) &&
41804 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41805 LoadSDNode *LN = cast<LoadSDNode>(Src);
41806 // Unless the load is volatile or atomic.
41807 if (LN->isSimple()) {
41808 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41809 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41810 SDValue BcastLd = DAG.getMemIntrinsicNode(
41811 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41812 LN->getPointerInfo(), LN->getOriginalAlign(),
41813 LN->getMemOperand()->getFlags());
41814 DCI.CombineTo(N.getNode(), BcastLd);
41815 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41816 DCI.recursivelyDeleteUnusedNodes(LN);
41817 return N; // Return N so it doesn't get rechecked!
41818 }
41819 }
41820
41821 return SDValue();
41822 }
41823 case X86ISD::VZEXT_MOVL: {
41824 SDValue N0 = N.getOperand(0);
41825
41826 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41827 // the load is volatile.
41828 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41829 auto *LN = cast<LoadSDNode>(N0);
41830 if (SDValue VZLoad =
41831 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41832 DCI.CombineTo(N.getNode(), VZLoad);
41833 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41834 DCI.recursivelyDeleteUnusedNodes(LN);
41835 return N;
41836 }
41837 }
41838
41839 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41840 // and can just use a VZEXT_LOAD.
41841 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41842 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41843 auto *LN = cast<MemSDNode>(N0);
41844 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41845 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41846 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41847 SDValue VZLoad =
41848 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41849 LN->getMemoryVT(), LN->getMemOperand());
41850 DCI.CombineTo(N.getNode(), VZLoad);
41851 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41852 DCI.recursivelyDeleteUnusedNodes(LN);
41853 return N;
41854 }
41855 }
41856
41857 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41858 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41859 // if the upper bits of the i64 are zero.
41860 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41861 N0.getOperand(0).hasOneUse() &&
41862 N0.getOperand(0).getValueType() == MVT::i64) {
41863 SDValue In = N0.getOperand(0);
41864 APInt Mask = APInt::getHighBitsSet(64, 32);
41865 if (DAG.MaskedValueIsZero(In, Mask)) {
41866 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41867 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41868 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41869 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41870 return DAG.getBitcast(VT, Movl);
41871 }
41872 }
41873
41874 // Load a scalar integer constant directly to XMM instead of transferring an
41875 // immediate value from GPR.
41876 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41877 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41878 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41879 // Create a vector constant - scalar constant followed by zeros.
41880 EVT ScalarVT = N0.getOperand(0).getValueType();
41881 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41882 unsigned NumElts = VT.getVectorNumElements();
41883 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41884 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41885 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41886
41887 // Load the vector constant from constant pool.
41888 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41889 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41890 MachinePointerInfo MPI =
41891 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41892 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41893 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41894 MachineMemOperand::MOLoad);
41895 }
41896 }
41897
41898 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41899 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41900 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41901 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41902 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41903 SDValue V = peekThroughOneUseBitcasts(N0);
41904
41905 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41906 isNullConstant(V.getOperand(2))) {
41907 SDValue In = V.getOperand(1);
41908 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41909 In.getValueSizeInBits() /
41910 VT.getScalarSizeInBits());
41911 In = DAG.getBitcast(SubVT, In);
41912 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41913 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41914 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41915 V.getOperand(2));
41916 }
41917 }
41918
41919 return SDValue();
41920 }
41921 case X86ISD::BLENDI: {
41922 SDValue N0 = N.getOperand(0);
41923 SDValue N1 = N.getOperand(1);
41924
41925 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41926 // TODO: Handle MVT::v16i16 repeated blend mask.
41927 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41928 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41929 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41930 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41931 SrcVT.getScalarSizeInBits() >= 32) {
41932 unsigned BlendMask = N.getConstantOperandVal(2);
41933 unsigned Size = VT.getVectorNumElements();
41934 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41935 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41936 return DAG.getBitcast(
41937 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41938 N1.getOperand(0),
41939 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41940 }
41941 }
41942 return SDValue();
41943 }
41944 case X86ISD::SHUFP: {
41945 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41946 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41947 // TODO: Support types other than v4f32.
41948 if (VT == MVT::v4f32) {
41949 bool Updated = false;
41950 SmallVector<int> Mask;
41951 SmallVector<SDValue> Ops;
41952 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41953 Ops.size() == 2) {
41954 for (int i = 0; i != 2; ++i) {
41955 SmallVector<SDValue> SubOps;
41956 SmallVector<int> SubMask, SubScaledMask;
41957 SDValue Sub = peekThroughBitcasts(Ops[i]);
41958 // TODO: Scaling might be easier if we specify the demanded elts.
41959 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41960 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41961 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41962 int Ofs = i * 2;
41963 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41964 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41965 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41966 Updated = true;
41967 }
41968 }
41969 }
41970 if (Updated) {
41971 for (int &M : Mask)
41972 M %= 4;
41973 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41974 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41975 }
41976 }
41977 return SDValue();
41978 }
41979 case X86ISD::VPERMI: {
41980 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41981 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41982 SDValue N0 = N.getOperand(0);
41983 SDValue N1 = N.getOperand(1);
41984 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41985 if (N0.getOpcode() == ISD::BITCAST &&
41986 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41987 SDValue Src = N0.getOperand(0);
41988 EVT SrcVT = Src.getValueType();
41989 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41990 return DAG.getBitcast(VT, Res);
41991 }
41992 return SDValue();
41993 }
41994 case X86ISD::VPERM2X128: {
41995 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41996 SDValue LHS = N->getOperand(0);
41997 SDValue RHS = N->getOperand(1);
41998 if (LHS.getOpcode() == ISD::BITCAST &&
41999 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42000 EVT SrcVT = LHS.getOperand(0).getValueType();
42001 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42002 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42003 DAG.getBitcast(SrcVT, LHS),
42004 DAG.getBitcast(SrcVT, RHS),
42005 N->getOperand(2)));
42006 }
42007 }
42008
42009 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42010 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42011 return Res;
42012
42013 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42014 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42015 auto FindSubVector128 = [&](unsigned Idx) {
42016 if (Idx > 3)
42017 return SDValue();
42018 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42019 SmallVector<SDValue> SubOps;
42020 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42021 return SubOps[Idx & 1];
42022 unsigned NumElts = Src.getValueType().getVectorNumElements();
42023 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42024 Src.getOperand(1).getValueSizeInBits() == 128 &&
42025 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42026 return Src.getOperand(1);
42027 }
42028 return SDValue();
42029 };
42030 unsigned Imm = N.getConstantOperandVal(2);
42031 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42032 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42033 MVT SubVT = VT.getHalfNumVectorElementsVT();
42034 SubLo = DAG.getBitcast(SubVT, SubLo);
42035 SubHi = DAG.getBitcast(SubVT, SubHi);
42036 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42037 }
42038 }
42039 return SDValue();
42040 }
42041 case X86ISD::PSHUFD:
42042 case X86ISD::PSHUFLW:
42043 case X86ISD::PSHUFHW:
42044 Mask = getPSHUFShuffleMask(N);
42045 assert(Mask.size() == 4);
42046 break;
42047 case X86ISD::MOVSD:
42048 case X86ISD::MOVSH:
42049 case X86ISD::MOVSS: {
42050 SDValue N0 = N.getOperand(0);
42051 SDValue N1 = N.getOperand(1);
42052
42053 // Canonicalize scalar FPOps:
42054 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42055 // If commutable, allow OP(N1[0], N0[0]).
42056 unsigned Opcode1 = N1.getOpcode();
42057 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42058 Opcode1 == ISD::FDIV) {
42059 SDValue N10 = N1.getOperand(0);
42060 SDValue N11 = N1.getOperand(1);
42061 if (N10 == N0 ||
42062 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42063 if (N10 != N0)
42064 std::swap(N10, N11);
42065 MVT SVT = VT.getVectorElementType();
42066 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42067 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42068 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42069 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42070 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42071 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42072 }
42073 }
42074
42075 return SDValue();
42076 }
42077 case X86ISD::INSERTPS: {
42078 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42079 SDValue Op0 = N.getOperand(0);
42080 SDValue Op1 = N.getOperand(1);
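// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0].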
42081 unsigned InsertPSMask = N.getConstantOperandVal(2);
42082 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42083 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42084 unsigned ZeroMask = InsertPSMask & 0xF;
42085
42086 // If we zero out all elements from Op0 then we don't need to reference it.
42087 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42088 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42089 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42090
42091 // If we zero out the element from Op1 then we don't need to reference it.
42092 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42093 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42094 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42095
42096 // Attempt to merge insertps Op1 with an inner target shuffle node.
42097 SmallVector<int, 8> TargetMask1;
42098 SmallVector<SDValue, 2> Ops1;
42099 APInt KnownUndef1, KnownZero1;
42100 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42101 KnownZero1)) {
42102 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42103 // Zero/UNDEF insertion - zero out element and remove dependency.
42104 InsertPSMask |= (1u << DstIdx);
42105 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42106 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42107 }
42108 // Update insertps mask srcidx and reference the source input directly.
42109 int M = TargetMask1[SrcIdx];
42110 assert(0 <= M && M < 8 && "Shuffle index out of range");
42111 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42112 Op1 = Ops1[M < 4 ? 0 : 1];
42113 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42114 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42115 }
42116
42117 // Attempt to merge insertps Op0 with an inner target shuffle node.
42118 SmallVector<int, 8> TargetMask0;
42119 SmallVector<SDValue, 2> Ops0;
42120 APInt KnownUndef0, KnownZero0;
42121 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42122 KnownZero0)) {
42123 bool Updated = false;
42124 bool UseInput00 = false;
42125 bool UseInput01 = false;
42126 for (int i = 0; i != 4; ++i) {
42127 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42128 // No change if element is already zero or the inserted element.
42129 continue;
42130 }
42131
42132 if (KnownUndef0[i] || KnownZero0[i]) {
42133 // If the target mask is undef/zero then we must zero the element.
42134 InsertPSMask |= (1u << i);
42135 Updated = true;
42136 continue;
42137 }
42138
42139 // The input vector element must be inline.
42140 int M = TargetMask0[i];
42141 if (M != i && M != (i + 4))
42142 return SDValue();
42143
42144 // Determine which inputs of the target shuffle we're using.
42145 UseInput00 |= (0 <= M && M < 4);
42146 UseInput01 |= (4 <= M);
42147 }
42148
42149 // If we're not using both inputs of the target shuffle then use the
42150 // referenced input directly.
42151 if (UseInput00 && !UseInput01) {
42152 Updated = true;
42153 Op0 = Ops0[0];
42154 } else if (!UseInput00 && UseInput01) {
42155 Updated = true;
42156 Op0 = Ops0[1];
42157 }
42158
42159 if (Updated)
42160 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42161 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42162 }
42163
42164 // If we're inserting an element from a vbroadcast load, fold the
42165 // load into the X86insertps instruction. We need to convert the scalar
42166 // load to a vector and clear the source lane of the INSERTPS control.
42167 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42168 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42169 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42170 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42171 MemIntr->getBasePtr(),
42172 MemIntr->getMemOperand());
42173 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42174 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42175 Load),
42176 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42177 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42178 return Insert;
42179 }
42180 }
42181
42182 return SDValue();
42183 }
42184 default:
42185 return SDValue();
42186 }
42187
42188 // Nuke no-op shuffles that show up after combining.
42189 if (isNoopShuffleMask(Mask))
42190 return N.getOperand(0);
42191
42192 // Look for simplifications involving one or two shuffle instructions.
42193 SDValue V = N.getOperand(0);
42194 switch (N.getOpcode()) {
42195 default:
42196 break;
42197 case X86ISD::PSHUFLW:
42198 case X86ISD::PSHUFHW:
42199 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42200
42201 // See if this reduces to a PSHUFD which is no more expensive and can
42202 // combine with more operations. Note that it has to at least flip the
42203 // dwords as otherwise it would have been removed as a no-op.
42204 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42205 int DMask[] = {0, 1, 2, 3};
42206 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42207 DMask[DOffset + 0] = DOffset + 1;
42208 DMask[DOffset + 1] = DOffset + 0;
42209 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42210 V = DAG.getBitcast(DVT, V);
42211 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42212 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42213 return DAG.getBitcast(VT, V);
42214 }
42215
42216 // Look for shuffle patterns which can be implemented as a single unpack.
42217 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42218 // only works when we have a PSHUFD followed by two half-shuffles.
42219 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42220 (V.getOpcode() == X86ISD::PSHUFLW ||
42221 V.getOpcode() == X86ISD::PSHUFHW) &&
42222 V.getOpcode() != N.getOpcode() &&
42223 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42224 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42225 if (D.getOpcode() == X86ISD::PSHUFD) {
42226 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42227 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42228 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42229 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
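// The two half-word shuffles touch disjoint halves, so their combined effect can
// be expressed as a single 8-element word mask before mapping it through the
// dword shuffle below.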
42230 int WordMask[8];
42231 for (int i = 0; i < 4; ++i) {
42232 WordMask[i + NOffset] = Mask[i] + NOffset;
42233 WordMask[i + VOffset] = VMask[i] + VOffset;
42234 }
42235 // Map the word mask through the DWord mask.
42236 int MappedMask[8];
42237 for (int i = 0; i < 8; ++i)
42238 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42239 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42240 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42241 // We can replace all three shuffles with an unpack.
42242 V = DAG.getBitcast(VT, D.getOperand(0));
42243 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42244 : X86ISD::UNPCKH,
42245 DL, VT, V, V);
42246 }
42247 }
42248 }
42249
42250 break;
42251
42252 case X86ISD::PSHUFD:
42253 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42254 return NewN;
42255
42256 break;
42257 }
42258
42259 return SDValue();
42260}
42261
42262/// Checks if the shuffle mask takes subsequent elements
42263/// alternately from two vectors.
42264/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42265static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42266
42267 int ParitySrc[2] = {-1, -1};
42268 unsigned Size = Mask.size();
42269 for (unsigned i = 0; i != Size; ++i) {
42270 int M = Mask[i];
42271 if (M < 0)
42272 continue;
42273
42274 // Make sure we are using the matching element from the input.
42275 if ((M % Size) != i)
42276 return false;
42277
42278 // Make sure we use the same input for all elements of the same parity.
42279 int Src = M / Size;
42280 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42281 return false;
42282 ParitySrc[i % 2] = Src;
42283 }
42284
42285 // Make sure each input is used.
42286 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42287 return false;
42288
42289 Op0Even = ParitySrc[0] == 0;
42290 return true;
42291}
42292
42293 /// Returns true iff the shuffle node \p N can be replaced with an
42294 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
42295 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
42296 ///
42297 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
42298 /// shuffle nodes so it is easier to generically match. We also insert dummy
42299 /// vector shuffle nodes for the operands which explicitly discard the lanes
42300 /// which are unused by this operation, to try to flow the fact that they're
42301 /// unused through the rest of the combiner.
42302static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42303 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42304 bool &IsSubAdd) {
42305
42306 EVT VT = N->getValueType(0);
42307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42308 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42309 !VT.getSimpleVT().isFloatingPoint())
42310 return false;
42311
42312 // We only handle target-independent shuffles.
42313 // FIXME: It would be easy and harmless to use the target shuffle mask
42314 // extraction tool to support more.
42315 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42316 return false;
42317
42318 SDValue V1 = N->getOperand(0);
42319 SDValue V2 = N->getOperand(1);
42320
42321 // Make sure we have an FADD and an FSUB.
42322 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42323 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42324 V1.getOpcode() == V2.getOpcode())
42325 return false;
42326
42327 // If there are other uses of these operations we can't fold them.
42328 if (!V1->hasOneUse() || !V2->hasOneUse())
42329 return false;
42330
42331 // Ensure that both operations have the same operands. Note that we can
42332 // commute the FADD operands.
42333 SDValue LHS, RHS;
42334 if (V1.getOpcode() == ISD::FSUB) {
42335 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42336 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42337 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42338 return false;
42339 } else {
42340 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42341 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42342 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42343 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42344 return false;
42345 }
42346
42347 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42348 bool Op0Even;
42349 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42350 return false;
42351
42352 // It's a subadd if the vector in the even parity is an FADD.
42353 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42354 : V2->getOpcode() == ISD::FADD;
42355
42356 Opnd0 = LHS;
42357 Opnd1 = RHS;
42358 return true;
42359}
42360
42361/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42362static SDValue combineShuffleToFMAddSub(SDNode *N,
42363 const X86Subtarget &Subtarget,
42364 SelectionDAG &DAG) {
42365 // We only handle target-independent shuffles.
42366 // FIXME: It would be easy and harmless to use the target shuffle mask
42367 // extraction tool to support more.
42368 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42369 return SDValue();
42370
42371 MVT VT = N->getSimpleValueType(0);
42372 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42373 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42374 return SDValue();
42375
42376 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42377 SDValue Op0 = N->getOperand(0);
42378 SDValue Op1 = N->getOperand(1);
42379 SDValue FMAdd = Op0, FMSub = Op1;
42380 if (FMSub.getOpcode() != X86ISD::FMSUB)
42381 std::swap(FMAdd, FMSub);
42382
42383 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42384 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42385 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42386 FMAdd.getOperand(2) != FMSub.getOperand(2))
42387 return SDValue();
42388
42389 // Check for correct shuffle mask.
42390 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42391 bool Op0Even;
42392 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42393 return SDValue();
42394
42395 // FMAddSub takes zeroth operand from FMSub node.
42396 SDLoc DL(N);
42397 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42398 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42399 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42400 FMAdd.getOperand(2));
42401}
42402
42403/// Try to combine a shuffle into a target-specific add-sub or
42404/// mul-add-sub node.
42405static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42406 const X86Subtarget &Subtarget,
42407 SelectionDAG &DAG) {
42408 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42409 return V;
42410
42411 SDValue Opnd0, Opnd1;
42412 bool IsSubAdd;
42413 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42414 return SDValue();
42415
42416 MVT VT = N->getSimpleValueType(0);
42417 SDLoc DL(N);
42418
42419 // Try to generate X86ISD::FMADDSUB node here.
42420 SDValue Opnd2;
42421 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42422 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42423 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42424 }
42425
42426 if (IsSubAdd)
42427 return SDValue();
42428
42429 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42430 // the ADDSUB idiom has been successfully recognized. There are no known
42431 // X86 targets with 512-bit ADDSUB instructions!
42432 if (VT.is512BitVector())
42433 return SDValue();
42434
42435 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42436 // the ADDSUB idiom has been successfully recognized. There are no known
42437 // X86 targets with FP16 ADDSUB instructions!
42438 if (VT.getVectorElementType() == MVT::f16)
42439 return SDValue();
42440
42441 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42442}
42443
42444// We are looking for a shuffle where both sources are concatenated with undef
42445// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42446// if we can express this as a single-source shuffle, that's preferable.
42447static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42448 const X86Subtarget &Subtarget) {
42449 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42450 return SDValue();
42451
42452 EVT VT = N->getValueType(0);
42453
42454 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42455 if (!VT.is128BitVector() && !VT.is256BitVector())
42456 return SDValue();
42457
42458 if (VT.getVectorElementType() != MVT::i32 &&
42459 VT.getVectorElementType() != MVT::i64 &&
42460 VT.getVectorElementType() != MVT::f32 &&
42461 VT.getVectorElementType() != MVT::f64)
42462 return SDValue();
42463
42464 SDValue N0 = N->getOperand(0);
42465 SDValue N1 = N->getOperand(1);
42466
42467 // Check that both sources are concats with undef.
42468 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42469 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42470 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42471 !N1.getOperand(1).isUndef())
42472 return SDValue();
42473
42474 // Construct the new shuffle mask. Elements from the first source retain their
42475 // index, but elements from the second source no longer need to skip an undef.
42476 SmallVector<int, 8> Mask;
42477 int NumElts = VT.getVectorNumElements();
42478
42479 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42480 for (int Elt : SVOp->getMask())
42481 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42482
42483 SDLoc DL(N);
42484 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42485 N1.getOperand(0));
42486 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42487}
42488
42489/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42490/// low half of each source vector and does not set any high half elements in
42491/// the destination vector, narrow the shuffle to half its original size.
42492static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42493 if (!Shuf->getValueType(0).isSimple())
42494 return SDValue();
42495 MVT VT = Shuf->getSimpleValueType(0);
42496 if (!VT.is256BitVector() && !VT.is512BitVector())
42497 return SDValue();
42498
42499 // See if we can ignore all of the high elements of the shuffle.
42500 ArrayRef<int> Mask = Shuf->getMask();
42501 if (!isUndefUpperHalf(Mask))
42502 return SDValue();
42503
42504 // Check if the shuffle mask accesses only the low half of each input vector
42505 // (half-index output is 0 or 2).
42506 int HalfIdx1, HalfIdx2;
42507 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42508 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42509 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42510 return SDValue();
42511
42512 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42513 // The trick is knowing that all of the insert/extract are actually free
42514 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42515 // of narrow inputs into a narrow output, and that is always cheaper than
42516 // the wide shuffle that we started with.
42517 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42518 Shuf->getOperand(1), HalfMask, HalfIdx1,
42519 HalfIdx2, false, DAG, /*UseConcat*/true);
42520}
42521
42522static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42523 TargetLowering::DAGCombinerInfo &DCI,
42524 const X86Subtarget &Subtarget) {
42525 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42526 if (SDValue V = narrowShuffle(Shuf, DAG))
42527 return V;
42528
42529 // If we have legalized the vector types, look for blends of FADD and FSUB
42530 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42531 SDLoc dl(N);
42532 EVT VT = N->getValueType(0);
42533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42534 if (TLI.isTypeLegal(VT))
42535 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42536 return AddSub;
42537
42538 // Attempt to combine into a vector load/broadcast.
42539 if (SDValue LD = combineToConsecutiveLoads(
42540 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42541 return LD;
42542
42543 // For AVX2, we sometimes want to combine
42544 // (vector_shuffle <mask> (concat_vectors t1, undef)
42545 // (concat_vectors t2, undef))
42546 // Into:
42547 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42548 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42549 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42550 return ShufConcat;
42551
42552 if (isTargetShuffle(N->getOpcode())) {
42553 SDValue Op(N, 0);
42554 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42555 return Shuffle;
42556
42557 // Try recursively combining arbitrary sequences of x86 shuffle
42558 // instructions into higher-order shuffles. We do this after combining
42559 // specific PSHUF instruction sequences into their minimal form so that we
42560 // can evaluate how many specialized shuffle instructions are involved in
42561 // a particular chain.
42562 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42563 return Res;
42564
42565 // Simplify source operands based on shuffle mask.
42566 // TODO - merge this into combineX86ShufflesRecursively.
42567 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42568 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42569 return SDValue(N, 0);
42570
42571 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42572 // Perform this after other shuffle combines to allow inner shuffles to be
42573 // combined away first.
42574 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42575 return BinOp;
42576 }
42577
42578 return SDValue();
42579}
42580
42581// Simplify variable target shuffle masks based on the demanded elements.
42582// TODO: Handle DemandedBits in mask indices as well?
42583bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42584 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42585 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42586 // If we're demanding all elements, don't bother trying to simplify the mask.
42587 unsigned NumElts = DemandedElts.getBitWidth();
42588 if (DemandedElts.isAllOnes())
42589 return false;
42590
42591 SDValue Mask = Op.getOperand(MaskIndex);
42592 if (!Mask.hasOneUse())
42593 return false;
42594
42595 // Attempt to generically simplify the variable shuffle mask.
42596 APInt MaskUndef, MaskZero;
42597 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42598 Depth + 1))
42599 return true;
42600
42601 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42602 // TODO: Support other types from getTargetShuffleMaskIndices?
42603 SDValue BC = peekThroughOneUseBitcasts(Mask);
42604 EVT BCVT = BC.getValueType();
42605 auto *Load = dyn_cast<LoadSDNode>(BC);
42606 if (!Load)
42607 return false;
42608
42609 const Constant *C = getTargetConstantFromNode(Load);
42610 if (!C)
42611 return false;
42612
42613 Type *CTy = C->getType();
42614 if (!CTy->isVectorTy() ||
42615 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42616 return false;
42617
42618 // Handle scaling for i64 elements on 32-bit targets.
42619 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42620 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42621 return false;
42622 unsigned Scale = NumCstElts / NumElts;
42623
42624 // Simplify mask if we have an undemanded element that is not undef.
42625 bool Simplified = false;
42626 SmallVector<Constant *, 32> ConstVecOps;
42627 for (unsigned i = 0; i != NumCstElts; ++i) {
42628 Constant *Elt = C->getAggregateElement(i);
42629 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42630 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42631 Simplified = true;
42632 continue;
42633 }
42634 ConstVecOps.push_back(Elt);
42635 }
42636 if (!Simplified)
42637 return false;
42638
42639 // Generate new constant pool entry + legalize immediately for the load.
42640 SDLoc DL(Op);
42641 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42642 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42643 SDValue NewMask = TLO.DAG.getLoad(
42644 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42645 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42646 Load->getAlign());
42647 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42648}
42649
42650bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42651 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42652 TargetLoweringOpt &TLO, unsigned Depth) const {
42653 int NumElts = DemandedElts.getBitWidth();
42654 unsigned Opc = Op.getOpcode();
42655 EVT VT = Op.getValueType();
42656
42657 // Handle special case opcodes.
42658 switch (Opc) {
42659 case X86ISD::PMULDQ:
42660 case X86ISD::PMULUDQ: {
42661 APInt LHSUndef, LHSZero;
42662 APInt RHSUndef, RHSZero;
42663 SDValue LHS = Op.getOperand(0);
42664 SDValue RHS = Op.getOperand(1);
42665 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42666 Depth + 1))
42667 return true;
42668 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42669 Depth + 1))
42670 return true;
42671 // Multiply by zero.
42672 KnownZero = LHSZero | RHSZero;
42673 break;
42674 }
42675 case X86ISD::VPMADDWD: {
42676 APInt LHSUndef, LHSZero;
42677 APInt RHSUndef, RHSZero;
42678 SDValue LHS = Op.getOperand(0);
42679 SDValue RHS = Op.getOperand(1);
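// Each i32 result element of VPMADDWD sums the products of two adjacent i16
// source elements, so every demanded result element demands a pair of source
// elements.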
42680 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42681
42682 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42683 Depth + 1))
42684 return true;
42685 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42686 Depth + 1))
42687 return true;
42688
42689 // TODO: Multiply by zero.
42690
42691 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42692 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42693 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42694 Depth + 1))
42695 return true;
42696 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42697 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42698 Depth + 1))
42699 return true;
42700 break;
42701 }
42702 case X86ISD::PSADBW: {
42703 SDValue LHS = Op.getOperand(0);
42704 SDValue RHS = Op.getOperand(1);
42705 assert(VT.getScalarType() == MVT::i64 &&
42706 LHS.getValueType() == RHS.getValueType() &&
42707 LHS.getValueType().getScalarType() == MVT::i8 &&
42708 "Unexpected PSADBW types");
42709
42710 // Aggressively peek through ops to get at the demanded elts.
42711 if (!DemandedElts.isAllOnes()) {
42712 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42713 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42714 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42715 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42716 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42717 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42718 if (NewLHS || NewRHS) {
42719 NewLHS = NewLHS ? NewLHS : LHS;
42720 NewRHS = NewRHS ? NewRHS : RHS;
42721 return TLO.CombineTo(
42722 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42723 }
42724 }
42725 break;
42726 }
42727 case X86ISD::VSHL:
42728 case X86ISD::VSRL:
42729 case X86ISD::VSRA: {
42730 // We only need the bottom 64-bits of the (128-bit) shift amount.
42731 SDValue Amt = Op.getOperand(1);
42732 MVT AmtVT = Amt.getSimpleValueType();
42733 assert(AmtVT.is128BitVector() && "Unexpected value type");
42734
42735 // If the shift amount is reused only as an SSE shift amount then we know
42736 // that only the bottom 64-bits are ever used.
42737 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42738 unsigned UseOpc = Use->getOpcode();
42739 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42740 UseOpc == X86ISD::VSRA) &&
42741 Use->getOperand(0) != Amt;
42742 });
42743
42744 APInt AmtUndef, AmtZero;
42745 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42746 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42747 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42748 Depth + 1, AssumeSingleUse))
42749 return true;
42750 [[fallthrough]];
42751 }
42752 case X86ISD::VSHLI:
42753 case X86ISD::VSRLI:
42754 case X86ISD::VSRAI: {
42755 SDValue Src = Op.getOperand(0);
42756 APInt SrcUndef;
42757 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42758 Depth + 1))
42759 return true;
42760
42761 // Fold shift(0,x) -> 0
42762 if (DemandedElts.isSubsetOf(KnownZero))
42763 return TLO.CombineTo(
42764 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42765
42766 // Aggressively peek through ops to get at the demanded elts.
42767 if (!DemandedElts.isAllOnes())
42768 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42769 Src, DemandedElts, TLO.DAG, Depth + 1))
42770 return TLO.CombineTo(
42771 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42772 break;
42773 }
42774 case X86ISD::VPSHA:
42775 case X86ISD::VPSHL:
42776 case X86ISD::VSHLV:
42777 case X86ISD::VSRLV:
42778 case X86ISD::VSRAV: {
42779 APInt LHSUndef, LHSZero;
42780 APInt RHSUndef, RHSZero;
42781 SDValue LHS = Op.getOperand(0);
42782 SDValue RHS = Op.getOperand(1);
42783 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42784 Depth + 1))
42785 return true;
42786
42787 // Fold shift(0,x) -> 0
42788 if (DemandedElts.isSubsetOf(LHSZero))
42789 return TLO.CombineTo(
42790 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42791
42792 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42793 Depth + 1))
42794 return true;
42795
42796 KnownZero = LHSZero;
42797 break;
42798 }
42799 case X86ISD::KSHIFTL: {
42800 SDValue Src = Op.getOperand(0);
42801 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42802 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42803 unsigned ShiftAmt = Amt->getZExtValue();
42804
42805 if (ShiftAmt == 0)
42806 return TLO.CombineTo(Op, Src);
42807
42808 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42809 // single shift. We can do this if the bottom bits (which are shifted
42810 // out) are never demanded.
42811 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42812 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42813 unsigned C1 = Src.getConstantOperandVal(1);
42814 unsigned NewOpc = X86ISD::KSHIFTL;
42815 int Diff = ShiftAmt - C1;
42816 if (Diff < 0) {
42817 Diff = -Diff;
42818 NewOpc = X86ISD::KSHIFTR;
42819 }
42820
42821 SDLoc dl(Op);
42822 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42823 return TLO.CombineTo(
42824 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42825 }
42826 }
42827
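// KSHIFTL moves mask element i to element i + ShiftAmt, so the demanded source
// elements are the demanded result elements shifted back down.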
42828 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42829 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42830 Depth + 1))
42831 return true;
42832
42833 KnownUndef <<= ShiftAmt;
42834 KnownZero <<= ShiftAmt;
42835 KnownZero.setLowBits(ShiftAmt);
42836 break;
42837 }
42838 case X86ISD::KSHIFTR: {
42839 SDValue Src = Op.getOperand(0);
42840 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42841 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42842 unsigned ShiftAmt = Amt->getZExtValue();
42843
42844 if (ShiftAmt == 0)
42845 return TLO.CombineTo(Op, Src);
42846
42847 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42848 // single shift. We can do this if the top bits (which are shifted
42849 // out) are never demanded.
42850 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42851 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42852 unsigned C1 = Src.getConstantOperandVal(1);
42853 unsigned NewOpc = X86ISD::KSHIFTR;
42854 int Diff = ShiftAmt - C1;
42855 if (Diff < 0) {
42856 Diff = -Diff;
42857 NewOpc = X86ISD::KSHIFTL;
42858 }
42859
42860 SDLoc dl(Op);
42861 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42862 return TLO.CombineTo(
42863 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42864 }
42865 }
42866
42867 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42868 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42869 Depth + 1))
42870 return true;
42871
42872 KnownUndef.lshrInPlace(ShiftAmt);
42873 KnownZero.lshrInPlace(ShiftAmt);
42874 KnownZero.setHighBits(ShiftAmt);
42875 break;
42876 }
42877 case X86ISD::ANDNP: {
42878 // ANDNP = (~LHS & RHS);
42879 SDValue LHS = Op.getOperand(0);
42880 SDValue RHS = Op.getOperand(1);
42881
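// If one operand is a constant we can shrink what we demand from the other: the
// LHS only matters where the RHS constant has nonzero bits, and the RHS only
// matters where the inverted LHS constant has nonzero bits.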
42882 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42883 APInt UndefElts;
42884 SmallVector<APInt> EltBits;
42885 int NumElts = VT.getVectorNumElements();
42886 int EltSizeInBits = VT.getScalarSizeInBits();
42887 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42888 APInt OpElts = DemandedElts;
42889 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42890 EltBits)) {
42891 OpBits.clearAllBits();
42892 OpElts.clearAllBits();
42893 for (int I = 0; I != NumElts; ++I) {
42894 if (!DemandedElts[I])
42895 continue;
42896 if (UndefElts[I]) {
42897 // We can't assume an undef src element gives an undef dst - the
42898 // other src might be zero.
42899 OpBits.setAllBits();
42900 OpElts.setBit(I);
42901 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42902 (!Invert && !EltBits[I].isZero())) {
42903 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42904 OpElts.setBit(I);
42905 }
42906 }
42907 }
42908 return std::make_pair(OpBits, OpElts);
42909 };
42910 APInt BitsLHS, EltsLHS;
42911 APInt BitsRHS, EltsRHS;
42912 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42913 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42914
42915 APInt LHSUndef, LHSZero;
42916 APInt RHSUndef, RHSZero;
42917 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42918 Depth + 1))
42919 return true;
42920 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42921 Depth + 1))
42922 return true;
42923
42924 if (!DemandedElts.isAllOnes()) {
42925 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42926 TLO.DAG, Depth + 1);
42927 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42928 TLO.DAG, Depth + 1);
42929 if (NewLHS || NewRHS) {
42930 NewLHS = NewLHS ? NewLHS : LHS;
42931 NewRHS = NewRHS ? NewRHS : RHS;
42932 return TLO.CombineTo(
42933 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42934 }
42935 }
42936 break;
42937 }
42938 case X86ISD::CVTSI2P:
42939 case X86ISD::CVTUI2P: {
42940 SDValue Src = Op.getOperand(0);
42941 MVT SrcVT = Src.getSimpleValueType();
42942 APInt SrcUndef, SrcZero;
42943 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42944 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42945 Depth + 1))
42946 return true;
42947 break;
42948 }
42949 case X86ISD::PACKSS:
42950 case X86ISD::PACKUS: {
42951 SDValue N0 = Op.getOperand(0);
42952 SDValue N1 = Op.getOperand(1);
42953
42954 APInt DemandedLHS, DemandedRHS;
42955 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
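// PACKSS/PACKUS interleave their operands per 128-bit lane, so split the
// demanded result elements into the portions sourced from each operand.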
42956
42957 APInt LHSUndef, LHSZero;
42958 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42959 Depth + 1))
42960 return true;
42961 APInt RHSUndef, RHSZero;
42962 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42963 Depth + 1))
42964 return true;
42965
42966 // TODO - pass on known zero/undef.
42967
42968 // Aggressively peek through ops to get at the demanded elts.
42969 // TODO - we should do this for all target/faux shuffles ops.
42970 if (!DemandedElts.isAllOnes()) {
42971 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42972 TLO.DAG, Depth + 1);
42973 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42974 TLO.DAG, Depth + 1);
42975 if (NewN0 || NewN1) {
42976 NewN0 = NewN0 ? NewN0 : N0;
42977 NewN1 = NewN1 ? NewN1 : N1;
42978 return TLO.CombineTo(Op,
42979 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42980 }
42981 }
42982 break;
42983 }
42984 case X86ISD::HADD:
42985 case X86ISD::HSUB:
42986 case X86ISD::FHADD:
42987 case X86ISD::FHSUB: {
42988 SDValue N0 = Op.getOperand(0);
42989 SDValue N1 = Op.getOperand(1);
42990
42991 APInt DemandedLHS, DemandedRHS;
42992 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
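// Each horizontal-op result element reduces a pair of adjacent source elements;
// per 128-bit lane the low results come from the LHS and the high results from
// the RHS, so split the demand accordingly.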
42993
42994 APInt LHSUndef, LHSZero;
42995 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42996 Depth + 1))
42997 return true;
42998 APInt RHSUndef, RHSZero;
42999 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43000 Depth + 1))
43001 return true;
43002
43003 // TODO - pass on known zero/undef.
43004
43005 // Aggressively peek through ops to get at the demanded elts.
43006 // TODO: Handle repeated operands.
43007 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43008 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43009 TLO.DAG, Depth + 1);
43010 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43011 TLO.DAG, Depth + 1);
43012 if (NewN0 || NewN1) {
43013 NewN0 = NewN0 ? NewN0 : N0;
43014 NewN1 = NewN1 ? NewN1 : N1;
43015 return TLO.CombineTo(Op,
43016 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43017 }
43018 }
43019 break;
43020 }
43021 case X86ISD::VTRUNC:
43022 case X86ISD::VTRUNCS:
43023 case X86ISD::VTRUNCUS: {
43024 SDValue Src = Op.getOperand(0);
43025 MVT SrcVT = Src.getSimpleValueType();
43026 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43027 APInt SrcUndef, SrcZero;
43028 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43029 Depth + 1))
43030 return true;
43031 KnownZero = SrcZero.zextOrTrunc(NumElts);
43032 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43033 break;
43034 }
43035 case X86ISD::BLENDV: {
43036 APInt SelUndef, SelZero;
43037 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43038 SelZero, TLO, Depth + 1))
43039 return true;
43040
43041 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43042 APInt LHSUndef, LHSZero;
43043 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43044 LHSZero, TLO, Depth + 1))
43045 return true;
43046
43047 APInt RHSUndef, RHSZero;
43048 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43049 RHSZero, TLO, Depth + 1))
43050 return true;
43051
43052 KnownZero = LHSZero & RHSZero;
43053 KnownUndef = LHSUndef & RHSUndef;
43054 break;
43055 }
43056 case X86ISD::VZEXT_MOVL: {
43057 // If upper demanded elements are already zero then we have nothing to do.
43058 SDValue Src = Op.getOperand(0);
43059 APInt DemandedUpperElts = DemandedElts;
43060 DemandedUpperElts.clearLowBits(1);
43061 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43062 return TLO.CombineTo(Op, Src);
43063 break;
43064 }
43065 case X86ISD::VBROADCAST: {
43066 SDValue Src = Op.getOperand(0);
43067 MVT SrcVT = Src.getSimpleValueType();
43068 if (!SrcVT.isVector())
43069 break;
43070 // Don't bother broadcasting if we just need the 0'th element.
43071 if (DemandedElts == 1) {
43072 if (Src.getValueType() != VT)
43073 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43074 SDLoc(Op));
43075 return TLO.CombineTo(Op, Src);
43076 }
43077 APInt SrcUndef, SrcZero;
43078 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43079 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43080 Depth + 1))
43081 return true;
43082 // Aggressively peek through src to get at the demanded elt.
43083 // TODO - we should do this for all target/faux shuffles ops.
43084 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43085 Src, SrcElts, TLO.DAG, Depth + 1))
43086 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43087 break;
43088 }
43089 case X86ISD::VPERMV:
43090 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43091 Depth))
43092 return true;
43093 break;
43094 case X86ISD::PSHUFB:
43095 case X86ISD::VPERMV3:
43096 case X86ISD::VPERMILPV:
43097 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43098 Depth))
43099 return true;
43100 break;
43101 case X86ISD::VPPERM:
43102 case X86ISD::VPERMIL2:
43103 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43104 Depth))
43105 return true;
43106 break;
43107 }
43108
43109 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43110 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43111 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43112 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43113 DemandedElts.lshr(NumElts / 2) == 0) {
43114 unsigned SizeInBits = VT.getSizeInBits();
43115 unsigned ExtSizeInBits = SizeInBits / 2;
43116
43117 // See if 512-bit ops only use the bottom 128-bits.
43118 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43119 ExtSizeInBits = SizeInBits / 4;
43120
43121 switch (Opc) {
43122 // Scalar broadcast.
43123 case X86ISD::VBROADCAST: {
43124 SDLoc DL(Op);
43125 SDValue Src = Op.getOperand(0);
43126 if (Src.getValueSizeInBits() > ExtSizeInBits)
43127 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43128 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43129 ExtSizeInBits / VT.getScalarSizeInBits());
43130 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43131 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43132 TLO.DAG, DL, ExtSizeInBits));
43133 }
43134 case X86ISD::VBROADCAST_LOAD: {
43135 SDLoc DL(Op);
43136 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43137 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43138 ExtSizeInBits / VT.getScalarSizeInBits());
43139 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43140 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43141 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43142 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43143 MemIntr->getMemOperand());
43144 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43145 Bcst.getValue(1));
43146 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43147 TLO.DAG, DL, ExtSizeInBits));
43148 }
43149 // Subvector broadcast.
43150 case X86ISD::SUBV_BROADCAST_LOAD: {
43151 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43152 EVT MemVT = MemIntr->getMemoryVT();
43153 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43154 SDLoc DL(Op);
43155 SDValue Ld =
43156 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43157 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43158 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43159 Ld.getValue(1));
43160 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43161 TLO.DAG, DL, ExtSizeInBits));
43162 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43163 SDLoc DL(Op);
43164 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43165 ExtSizeInBits / VT.getScalarSizeInBits());
43166 if (SDValue BcstLd =
43167 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43168 return TLO.CombineTo(Op,
43169 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43170 TLO.DAG, DL, ExtSizeInBits));
43171 }
43172 break;
43173 }
43174 // Byte shifts by immediate.
43175 case X86ISD::VSHLDQ:
43176 case X86ISD::VSRLDQ:
43177 // Shift by uniform.
43178 case X86ISD::VSHL:
43179 case X86ISD::VSRL:
43180 case X86ISD::VSRA:
43181 // Shift by immediate.
43182 case X86ISD::VSHLI:
43183 case X86ISD::VSRLI:
43184 case X86ISD::VSRAI: {
43185 SDLoc DL(Op);
43186 SDValue Ext0 =
43187 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43188 SDValue ExtOp =
43189 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43190 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43191 SDValue Insert =
43192 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43193 return TLO.CombineTo(Op, Insert);
43194 }
43195 case X86ISD::VPERMI: {
43196 // Simplify PERMPD/PERMQ to extract_subvector.
43197 // TODO: This should be done in shuffle combining.
43198 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43199 SmallVector<int, 4> Mask;
43200 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43201 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43202 SDLoc DL(Op);
43203 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43204 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43205 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43206 return TLO.CombineTo(Op, Insert);
43207 }
43208 }
43209 break;
43210 }
43211 case X86ISD::VPERM2X128: {
43212 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43213 SDLoc DL(Op);
43214 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
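// Within the low nibble of the immediate, bit 3 zeroes the lane, bit 1 selects
// the source operand and bit 0 selects its low or high 128-bit half.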
43215 if (LoMask & 0x8)
43216 return TLO.CombineTo(
43217 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43218 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43219 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43220 SDValue ExtOp =
43221 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43222 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43223 SDValue Insert =
43224 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43225 return TLO.CombineTo(Op, Insert);
43226 }
43227 // Zero upper elements.
43228 case X86ISD::VZEXT_MOVL:
43229 // Target unary shuffles by immediate:
43230 case X86ISD::PSHUFD:
43231 case X86ISD::PSHUFLW:
43232 case X86ISD::PSHUFHW:
43233 case X86ISD::VPERMILPI:
43234 // (Non-Lane Crossing) Target Shuffles.
43235 case X86ISD::VPERMILPV:
43236 case X86ISD::VPERMIL2:
43237 case X86ISD::PSHUFB:
43238 case X86ISD::UNPCKL:
43239 case X86ISD::UNPCKH:
43240 case X86ISD::BLENDI:
43241 // Integer ops.
43242 case X86ISD::PACKSS:
43243 case X86ISD::PACKUS:
43244 // Horizontal Ops.
43245 case X86ISD::HADD:
43246 case X86ISD::HSUB:
43247 case X86ISD::FHADD:
43248 case X86ISD::FHSUB: {
43249 SDLoc DL(Op);
43250 SmallVector<SDValue, 4> Ops;
43251 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43252 SDValue SrcOp = Op.getOperand(i);
43253 EVT SrcVT = SrcOp.getValueType();
43254 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43255 "Unsupported vector size");
43256 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43257 ExtSizeInBits)
43258 : SrcOp);
43259 }
43260 MVT ExtVT = VT.getSimpleVT();
43261 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43262 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43263 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43264 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43265 SDValue Insert =
43266 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43267 return TLO.CombineTo(Op, Insert);
43268 }
43269 }
43270 }
43271
43272 // For splats, unless we *only* demand the 0'th element,
43273 // stop attempts at simplification here; we aren't going to improve things,
43274 // and this is better than any potential shuffle.
43275 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43276 return false;
43277
43278 // Get target/faux shuffle mask.
43279 APInt OpUndef, OpZero;
43280 SmallVector<int, 64> OpMask;
43281 SmallVector<SDValue, 2> OpInputs;
43282 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43283 OpZero, TLO.DAG, Depth, false))
43284 return false;
43285
43286 // Shuffle inputs must be the same size as the result.
43287 if (OpMask.size() != (unsigned)NumElts ||
43288 llvm::any_of(OpInputs, [VT](SDValue V) {
43289 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43290 !V.getValueType().isVector();
43291 }))
43292 return false;
43293
43294 KnownZero = OpZero;
43295 KnownUndef = OpUndef;
43296
43297 // Check if shuffle mask can be simplified to undef/zero/identity.
43298 int NumSrcs = OpInputs.size();
43299 for (int i = 0; i != NumElts; ++i)
43300 if (!DemandedElts[i])
43301 OpMask[i] = SM_SentinelUndef;
43302
43303 if (isUndefInRange(OpMask, 0, NumElts)) {
43304 KnownUndef.setAllBits();
43305 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43306 }
43307 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43308 KnownZero.setAllBits();
43309 return TLO.CombineTo(
43310 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43311 }
43312 for (int Src = 0; Src != NumSrcs; ++Src)
43313 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43314 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43315
43316 // Attempt to simplify inputs.
43317 for (int Src = 0; Src != NumSrcs; ++Src) {
43318 // TODO: Support inputs of different types.
43319 if (OpInputs[Src].getValueType() != VT)
43320 continue;
43321
43322 int Lo = Src * NumElts;
43323 APInt SrcElts = APInt::getZero(NumElts);
43324 for (int i = 0; i != NumElts; ++i)
43325 if (DemandedElts[i]) {
43326 int M = OpMask[i] - Lo;
43327 if (0 <= M && M < NumElts)
43328 SrcElts.setBit(M);
43329 }
43330
43331 // TODO - Propagate input undef/zero elts.
43332 APInt SrcUndef, SrcZero;
43333 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43334 TLO, Depth + 1))
43335 return true;
43336 }
43337
43338 // If we don't demand all elements, then attempt to combine to a simpler
43339 // shuffle.
43340 // We need to convert the depth to something combineX86ShufflesRecursively
43341 // can handle - so pretend the Depth is 0 again, and reduce the max depth
43342 // to match. This prevents combineX86ShuffleChain from returning a
43343 // combined shuffle that's the same as the original root, causing an
43344 // infinite loop.
43345 if (!DemandedElts.isAllOnes()) {
43346 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43347
43348 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43349 for (int i = 0; i != NumElts; ++i)
43350 if (DemandedElts[i])
43351 DemandedMask[i] = i;
43352
43353 SDValue NewShuffle = combineX86ShufflesRecursively(
43354 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43355 /*HasVarMask*/ false,
43356 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43357 Subtarget);
43358 if (NewShuffle)
43359 return TLO.CombineTo(Op, NewShuffle);
43360 }
43361
43362 return false;
43363}
43364
43365bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43366 SDValue Op, const APInt &OriginalDemandedBits,
43367 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43368 unsigned Depth) const {
43369 EVT VT = Op.getValueType();
43370 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43371 unsigned Opc = Op.getOpcode();
43372 switch(Opc) {
43373 case X86ISD::VTRUNC: {
43374 KnownBits KnownOp;
43375 SDValue Src = Op.getOperand(0);
43376 MVT SrcVT = Src.getSimpleValueType();
43377
43378 // Simplify the input, using demanded bit information.
43379 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43380 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43381 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43382 return true;
43383 break;
43384 }
43385 case X86ISD::PMULDQ:
43386 case X86ISD::PMULUDQ: {
43387 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43388 KnownBits KnownLHS, KnownRHS;
43389 SDValue LHS = Op.getOperand(0);
43390 SDValue RHS = Op.getOperand(1);
43391
43392 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43393 // FIXME: Can we bound this better?
43394 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43395 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43396 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43397
43398 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43399 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43400 DemandedMaskLHS = DemandedMask;
43401 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43402 DemandedMaskRHS = DemandedMask;
43403
43404 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43405 KnownLHS, TLO, Depth + 1))
43406 return true;
43407 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43408 KnownRHS, TLO, Depth + 1))
43409 return true;
43410
43411 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43412 KnownRHS = KnownRHS.trunc(32);
43413 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43414 KnownRHS.getConstant().isOne()) {
43415 SDLoc DL(Op);
43416 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43417 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43418 }
43419
43420 // Aggressively peek through ops to get at the demanded low bits.
43421 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43422 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43423 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43424 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43425 if (DemandedLHS || DemandedRHS) {
43426 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43427 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43428 return TLO.CombineTo(
43429 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43430 }
43431 break;
43432 }
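  // Illustrative sketch of why only the low halves matter above (hypothetical
  // scalar model, not from the source): per 64-bit lane,
  //   PMULUDQ: res = (uint64_t)(uint32_t)lhs * (uint64_t)(uint32_t)rhs;
  //   PMULDQ:  res = (int64_t)(int32_t)lhs  * (int64_t)(int32_t)rhs;
  // so bits [63:32] of each input lane never influence the result, which is
  // why DemandedMask keeps only the low 32 bits.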
43433 case X86ISD::VSHLI: {
43434 SDValue Op0 = Op.getOperand(0);
43435
43436 unsigned ShAmt = Op.getConstantOperandVal(1);
43437 if (ShAmt >= BitWidth)
43438 break;
43439
43440 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43441
43442 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43443 // single shift. We can do this if the bottom bits (which are shifted
43444 // out) are never demanded.
43445 if (Op0.getOpcode() == X86ISD::VSRLI &&
43446 OriginalDemandedBits.countr_zero() >= ShAmt) {
43447 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43448 if (Shift2Amt < BitWidth) {
43449 int Diff = ShAmt - Shift2Amt;
43450 if (Diff == 0)
43451 return TLO.CombineTo(Op, Op0.getOperand(0));
43452
43453 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43454 SDValue NewShift = TLO.DAG.getNode(
43455 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43456 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43457 return TLO.CombineTo(Op, NewShift);
43458 }
43459 }
43460
43461 // If we are only demanding sign bits then we can use the shift source directly.
43462 unsigned NumSignBits =
43463 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43464 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43465 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43466 return TLO.CombineTo(Op, Op0);
43467
43468 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43469 TLO, Depth + 1))
43470 return true;
43471
43472    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43473 Known.Zero <<= ShAmt;
43474 Known.One <<= ShAmt;
43475
43476 // Low bits known zero.
43477 Known.Zero.setLowBits(ShAmt);
43478 return false;
43479 }
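  // Illustrative example of the shift-merge fold above (values chosen for
  // illustration only): with OriginalDemandedBits = 0xFFFFFF00,
  //   ((X >>u 2) << 6)  -->  (X << 4)   // Diff = 6 - 2 = 4 > 0, use VSHLI
  //   ((X >>u 6) << 2)  -->  (X >>u 4)  // Diff = 2 - 6 = -4 < 0, use VSRLI
  // which is valid because the low ShAmt result bits are not demanded.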
43480 case X86ISD::VSRLI: {
43481 unsigned ShAmt = Op.getConstantOperandVal(1);
43482 if (ShAmt >= BitWidth)
43483 break;
43484
43485 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43486
43487 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43488 OriginalDemandedElts, Known, TLO, Depth + 1))
43489 return true;
43490
43491    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43492 Known.Zero.lshrInPlace(ShAmt);
43493 Known.One.lshrInPlace(ShAmt);
43494
43495 // High bits known zero.
43496 Known.Zero.setHighBits(ShAmt);
43497 return false;
43498 }
43499 case X86ISD::VSRAI: {
43500 SDValue Op0 = Op.getOperand(0);
43501 SDValue Op1 = Op.getOperand(1);
43502
43503 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43504 if (ShAmt >= BitWidth)
43505 break;
43506
43507 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43508
43509 // If we just want the sign bit then we don't need to shift it.
43510 if (OriginalDemandedBits.isSignMask())
43511 return TLO.CombineTo(Op, Op0);
43512
43513 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43514 if (Op0.getOpcode() == X86ISD::VSHLI &&
43515 Op.getOperand(1) == Op0.getOperand(1)) {
43516 SDValue Op00 = Op0.getOperand(0);
43517 unsigned NumSignBits =
43518 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43519 if (ShAmt < NumSignBits)
43520 return TLO.CombineTo(Op, Op00);
43521 }
43522
43523 // If any of the demanded bits are produced by the sign extension, we also
43524 // demand the input sign bit.
43525 if (OriginalDemandedBits.countl_zero() < ShAmt)
43526 DemandedMask.setSignBit();
43527
43528 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43529 TLO, Depth + 1))
43530 return true;
43531
43532    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43533 Known.Zero.lshrInPlace(ShAmt);
43534 Known.One.lshrInPlace(ShAmt);
43535
43536 // If the input sign bit is known to be zero, or if none of the top bits
43537 // are demanded, turn this into an unsigned shift right.
43538 if (Known.Zero[BitWidth - ShAmt - 1] ||
43539 OriginalDemandedBits.countl_zero() >= ShAmt)
43540 return TLO.CombineTo(
43541 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43542
43543 // High bits are known one.
43544 if (Known.One[BitWidth - ShAmt - 1])
43545 Known.One.setHighBits(ShAmt);
43546 return false;
43547 }
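  // Illustrative note for the VSRAI(VSHLI(X, C), C) fold above (not from the
  // source): the shl/sra pair is the usual "sign-extend the low BitWidth - C
  // bits" idiom, e.g. for a 32-bit element and C = 24 it sign-extends an i8
  // held in the low byte. If X already has more than C known sign bits, that
  // extension is a no-op and X can be used directly.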
43548 case X86ISD::BLENDV: {
43549 SDValue Sel = Op.getOperand(0);
43550 SDValue LHS = Op.getOperand(1);
43551 SDValue RHS = Op.getOperand(2);
43552
43553 APInt SignMask = APInt::getSignMask(BitWidth);
43554 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43555 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43556 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43557 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43558 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43559 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43560
43561 if (NewSel || NewLHS || NewRHS) {
43562 NewSel = NewSel ? NewSel : Sel;
43563 NewLHS = NewLHS ? NewLHS : LHS;
43564 NewRHS = NewRHS ? NewRHS : RHS;
43565 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43566 NewSel, NewLHS, NewRHS));
43567 }
43568 break;
43569 }
43570 case X86ISD::PEXTRB:
43571 case X86ISD::PEXTRW: {
43572 SDValue Vec = Op.getOperand(0);
43573 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43574 MVT VecVT = Vec.getSimpleValueType();
43575 unsigned NumVecElts = VecVT.getVectorNumElements();
43576
43577 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43578 unsigned Idx = CIdx->getZExtValue();
43579 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43580
43581 // If we demand no bits from the vector then we must have demanded
43582 // bits from the implicit zext - simplify to zero.
43583 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43584 if (DemandedVecBits == 0)
43585 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43586
43587 APInt KnownUndef, KnownZero;
43588 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43589 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43590 KnownZero, TLO, Depth + 1))
43591 return true;
43592
43593 KnownBits KnownVec;
43594 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43595 KnownVec, TLO, Depth + 1))
43596 return true;
43597
43598 if (SDValue V = SimplifyMultipleUseDemandedBits(
43599 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43600 return TLO.CombineTo(
43601 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43602
43603 Known = KnownVec.zext(BitWidth);
43604 return false;
43605 }
43606 break;
43607 }
43608 case X86ISD::PINSRB:
43609 case X86ISD::PINSRW: {
43610 SDValue Vec = Op.getOperand(0);
43611 SDValue Scl = Op.getOperand(1);
43612 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43613 MVT VecVT = Vec.getSimpleValueType();
43614
43615 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43616 unsigned Idx = CIdx->getZExtValue();
43617 if (!OriginalDemandedElts[Idx])
43618 return TLO.CombineTo(Op, Vec);
43619
43620 KnownBits KnownVec;
43621 APInt DemandedVecElts(OriginalDemandedElts);
43622 DemandedVecElts.clearBit(Idx);
43623 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43624 KnownVec, TLO, Depth + 1))
43625 return true;
43626
43627 KnownBits KnownScl;
43628 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43629 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43630 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43631 return true;
43632
43633 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43634 Known = KnownBits::commonBits(KnownVec, KnownScl);
43635 return false;
43636 }
43637 break;
43638 }
43639 case X86ISD::PACKSS:
43640 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43641 // sign bit then we can just ask for the source operands' sign bits.
43642 // TODO - add known bits handling.
43643 if (OriginalDemandedBits.isSignMask()) {
43644 APInt DemandedLHS, DemandedRHS;
43645 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43646
43647 KnownBits KnownLHS, KnownRHS;
43648 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43649 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43650 KnownLHS, TLO, Depth + 1))
43651 return true;
43652 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43653 KnownRHS, TLO, Depth + 1))
43654 return true;
43655
43656 // Attempt to avoid multi-use ops if we don't need anything from them.
43657 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43658 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43659 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43660 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43661 if (DemandedOp0 || DemandedOp1) {
43662 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43663 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43664 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43665 }
43666 }
43667 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43668 break;
43669 case X86ISD::VBROADCAST: {
43670 SDValue Src = Op.getOperand(0);
43671 MVT SrcVT = Src.getSimpleValueType();
43672 APInt DemandedElts = APInt::getOneBitSet(
43673 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43674 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43675 TLO, Depth + 1))
43676 return true;
43677 // If we don't need the upper bits, attempt to narrow the broadcast source.
43678 // Don't attempt this on AVX512 as it might affect broadcast folding.
43679 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43680 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43681 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43682 Src->hasOneUse()) {
43683 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43684 SDValue NewSrc =
43685 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43686 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43687 SDValue NewBcst =
43688 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43689 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43690 }
43691 break;
43692 }
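  // Illustrative example of the broadcast narrowing above (types chosen for
  // illustration): if only the low 32 bits of each lane of a v4i64
  // X86ISD::VBROADCAST of an i64 scalar are demanded, it can be rebuilt as
  //   bitcast v4i64 (VBROADCAST v8i32 (trunc i32 src))
  // which keeps the splat but halves the broadcast element width.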
43693 case X86ISD::PCMPGT:
43694 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43695 // iff we only need the sign bit then we can use R directly.
43696 if (OriginalDemandedBits.isSignMask() &&
43697 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43698 return TLO.CombineTo(Op, Op.getOperand(1));
43699 break;
43700 case X86ISD::MOVMSK: {
43701 SDValue Src = Op.getOperand(0);
43702 MVT SrcVT = Src.getSimpleValueType();
43703 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43704 unsigned NumElts = SrcVT.getVectorNumElements();
43705
43706 // If we don't need the sign bits at all just return zero.
43707 if (OriginalDemandedBits.countr_zero() >= NumElts)
43708 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43709
43710 // See if we only demand bits from the lower 128-bit vector.
43711 if (SrcVT.is256BitVector() &&
43712 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43713 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43714 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43715 }
43716
43717 // Only demand the vector elements of the sign bits we need.
43718 APInt KnownUndef, KnownZero;
43719 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43720 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43721 TLO, Depth + 1))
43722 return true;
43723
43724 Known.Zero = KnownZero.zext(BitWidth);
43725 Known.Zero.setHighBits(BitWidth - NumElts);
43726
43727 // MOVMSK only uses the MSB from each vector element.
43728 KnownBits KnownSrc;
43729 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43730 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43731 Depth + 1))
43732 return true;
43733
43734 if (KnownSrc.One[SrcBits - 1])
43735 Known.One.setLowBits(NumElts);
43736 else if (KnownSrc.Zero[SrcBits - 1])
43737 Known.Zero.setLowBits(NumElts);
43738
43739 // Attempt to avoid multi-use ops if we don't need anything from it.
43740 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43741 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43742 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43743 return false;
43744 }
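  // Reminder of the MOVMSK semantics relied on above (illustrative): bit i of
  // the scalar result is the sign bit of vector element i, and all higher
  // result bits are zero, e.g. for v4i32:
  //   res = (msb(e0) << 0) | (msb(e1) << 1) | (msb(e2) << 2) | (msb(e3) << 3);
  // hence demanded result bits map 1:1 onto demanded vector elements.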
43745 case X86ISD::TESTP: {
43746 SDValue Op0 = Op.getOperand(0);
43747 SDValue Op1 = Op.getOperand(1);
43748 MVT OpVT = Op0.getSimpleValueType();
43749    assert((OpVT.getVectorElementType() == MVT::f32 ||
43750            OpVT.getVectorElementType() == MVT::f64) &&
43751           "Illegal vector type for X86ISD::TESTP");
43752
43753 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
43754 KnownBits KnownSrc;
43755 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
43756 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1) ||
43757 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1);
43758 }
43759 case X86ISD::BEXTR:
43760 case X86ISD::BEXTRI: {
43761 SDValue Op0 = Op.getOperand(0);
43762 SDValue Op1 = Op.getOperand(1);
43763
43764 // Only bottom 16-bits of the control bits are required.
43765 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43766 // NOTE: SimplifyDemandedBits won't do this for constants.
43767 uint64_t Val1 = Cst1->getZExtValue();
43768 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43769 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43770 SDLoc DL(Op);
43771 return TLO.CombineTo(
43772 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43773 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43774 }
43775
43776 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43777 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43778
43779 // If the length is 0, the result is 0.
43780 if (Length == 0) {
43781 Known.setAllZero();
43782 return false;
43783 }
43784
43785 if ((Shift + Length) <= BitWidth) {
43786 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43787 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43788 return true;
43789
43790 Known = Known.extractBits(Length, Shift);
43791 Known = Known.zextOrTrunc(BitWidth);
43792 return false;
43793 }
43794 } else {
43795      assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43796 KnownBits Known1;
43797 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43798 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43799 return true;
43800
43801 // If the length is 0, replace with 0.
43802 KnownBits LengthBits = Known1.extractBits(8, 8);
43803 if (LengthBits.isZero())
43804 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43805 }
43806
43807 break;
43808 }
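  // Sketch of the BEXTR control encoding used above (hypothetical scalar
  // model, not from the source): control bits [7:0] are the start position and
  // bits [15:8] the length, roughly
  //   res = (src >> start) & ((1u << len) - 1);   // len == 0 gives 0
  // which is why only the low 16 control bits matter and a zero length folds
  // to a zero result.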
43809 case X86ISD::PDEP: {
43810 SDValue Op0 = Op.getOperand(0);
43811 SDValue Op1 = Op.getOperand(1);
43812
43813 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43814 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43815
43816 // If the demanded bits have leading zeroes, we don't demand those from the
43817 // mask.
43818 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43819 return true;
43820
43821 // The number of possible 1s in the mask determines the number of LSBs of
43822 // operand 0 used. Undemanded bits from the mask don't matter so filter
43823 // them before counting.
43824 KnownBits Known2;
43825 uint64_t Count = (~Known.Zero & LoMask).popcount();
43826 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43827 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43828 return true;
43829
43830 // Zeroes are retained from the mask, but not ones.
43831 Known.One.clearAllBits();
43832 // The result will have at least as many trailing zeros as the non-mask
43833 // operand since bits can only map to the same or higher bit position.
43834 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43835 return false;
43836 }
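  // Sketch of the PDEP semantics assumed above (illustrative scalar model):
  // the low popcount(mask) bits of the source are scattered, in order, to the
  // set bit positions of the mask; all other result bits are zero. Roughly:
  //   for (k = 0, i = 0; i < 64; ++i)
  //     if (mask & (1ull << i)) res |= ((src >> k++) & 1ull) << i;
  // so only Count = popcount(mask) low bits of operand 0 are ever read.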
43837 }
43838
43839 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43840 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43841}
43842
43843SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43844 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43845 SelectionDAG &DAG, unsigned Depth) const {
43846 int NumElts = DemandedElts.getBitWidth();
43847 unsigned Opc = Op.getOpcode();
43848 EVT VT = Op.getValueType();
43849
43850 switch (Opc) {
43851 case X86ISD::PINSRB:
43852 case X86ISD::PINSRW: {
43853 // If we don't demand the inserted element, return the base vector.
43854 SDValue Vec = Op.getOperand(0);
43855 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43856 MVT VecVT = Vec.getSimpleValueType();
43857 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43858 !DemandedElts[CIdx->getZExtValue()])
43859 return Vec;
43860 break;
43861 }
43862 case X86ISD::VSHLI: {
43863 // If we are only demanding sign bits then we can use the shift source
43864 // directly.
43865 SDValue Op0 = Op.getOperand(0);
43866 unsigned ShAmt = Op.getConstantOperandVal(1);
43867 unsigned BitWidth = DemandedBits.getBitWidth();
43868 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43869 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43870 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43871 return Op0;
43872 break;
43873 }
43874 case X86ISD::VSRAI:
43875 // iff we only need the sign bit then we can use the source directly.
43876 // TODO: generalize where we only demand extended signbits.
43877 if (DemandedBits.isSignMask())
43878 return Op.getOperand(0);
43879 break;
43880 case X86ISD::PCMPGT:
43881 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43882 // iff we only need the sign bit then we can use R directly.
43883 if (DemandedBits.isSignMask() &&
43884 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43885 return Op.getOperand(1);
43886 break;
43887 case X86ISD::ANDNP: {
43888 // ANDNP = (~LHS & RHS);
43889 SDValue LHS = Op.getOperand(0);
43890 SDValue RHS = Op.getOperand(1);
43891
43892 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43893 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43894
43895 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43896 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43897 // this context, so return RHS.
43898 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43899 return RHS;
43900 break;
43901 }
43902 }
43903
43904 APInt ShuffleUndef, ShuffleZero;
43905 SmallVector<int, 16> ShuffleMask;
43906 SmallVector<SDValue, 2> ShuffleOps;
43907 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43908 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43909 // If all the demanded elts are from one operand and are inline,
43910 // then we can use the operand directly.
43911 int NumOps = ShuffleOps.size();
43912 if (ShuffleMask.size() == (unsigned)NumElts &&
43913 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43914 return VT.getSizeInBits() == V.getValueSizeInBits();
43915 })) {
43916
43917 if (DemandedElts.isSubsetOf(ShuffleUndef))
43918 return DAG.getUNDEF(VT);
43919 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43920 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43921
43922 // Bitmask that indicates which ops have only been accessed 'inline'.
43923 APInt IdentityOp = APInt::getAllOnes(NumOps);
43924 for (int i = 0; i != NumElts; ++i) {
43925 int M = ShuffleMask[i];
43926 if (!DemandedElts[i] || ShuffleUndef[i])
43927 continue;
43928 int OpIdx = M / NumElts;
43929 int EltIdx = M % NumElts;
43930 if (M < 0 || EltIdx != i) {
43931 IdentityOp.clearAllBits();
43932 break;
43933 }
43934 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43935 if (IdentityOp == 0)
43936 break;
43937 }
43938      assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43939             "Multiple identity shuffles detected");
43940
43941 if (IdentityOp != 0)
43942 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43943 }
43944 }
43945
43946 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43947 Op, DemandedBits, DemandedElts, DAG, Depth);
43948}
43949
43950bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43951 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43952 bool PoisonOnly, unsigned Depth) const {
43953 unsigned EltsBits = Op.getScalarValueSizeInBits();
43954 unsigned NumElts = DemandedElts.getBitWidth();
43955
43956 // TODO: Add more target shuffles.
43957 switch (Op.getOpcode()) {
43958 case X86ISD::PSHUFD:
43959 case X86ISD::VPERMILPI: {
43960 SmallVector<int, 8> Mask;
43961 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43962
43963 APInt DemandedSrcElts = APInt::getZero(NumElts);
43964 for (unsigned I = 0; I != NumElts; ++I)
43965 if (DemandedElts[I])
43966 DemandedSrcElts.setBit(Mask[I]);
43967
43968 return DAG.isGuaranteedNotToBeUndefOrPoison(
43969 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
43970 }
43971 }
43972 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43973 Op, DemandedElts, DAG, PoisonOnly, Depth);
43974}
43975
43976bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43977 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43978 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43979
43980 // TODO: Add more target shuffles.
43981 switch (Op.getOpcode()) {
43982 case X86ISD::PSHUFD:
43983 case X86ISD::VPERMILPI:
43984 return false;
43985 }
43986 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43987 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43988}
43989
43990bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43991 const APInt &DemandedElts,
43992 APInt &UndefElts,
43993 const SelectionDAG &DAG,
43994 unsigned Depth) const {
43995 unsigned NumElts = DemandedElts.getBitWidth();
43996 unsigned Opc = Op.getOpcode();
43997
43998 switch (Opc) {
43999 case X86ISD::VBROADCAST:
44000 case X86ISD::VBROADCAST_LOAD:
44001 UndefElts = APInt::getZero(NumElts);
44002 return true;
44003 }
44004
44005 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44006 DAG, Depth);
44007}
44008
44009// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44010// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44011static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44012 bool AllowTruncate) {
44013 switch (Src.getOpcode()) {
44014 case ISD::TRUNCATE:
44015 if (!AllowTruncate)
44016 return false;
44017 [[fallthrough]];
44018 case ISD::SETCC:
44019 return Src.getOperand(0).getValueSizeInBits() == Size;
44020 case ISD::AND:
44021 case ISD::XOR:
44022 case ISD::OR:
44023 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44024 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44025 case ISD::SELECT:
44026 case ISD::VSELECT:
44027 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44028 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44029 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44030 case ISD::BUILD_VECTOR:
44031 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44032 ISD::isBuildVectorAllOnes(Src.getNode());
44033 }
44034 return false;
44035}
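// Illustrative example for checkBitcastSrcVectorSize (not from the source):
// for Src = (v4i1 setcc (v4i64 a, b)) the compared vector is 256 bits wide, so
// checkBitcastSrcVectorSize(Src, 256, AllowTruncate) returns true (no truncate
// is involved), letting the caller sign-extend to v4i64 to match the compare.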
44036
44037// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44038static unsigned getAltBitOpcode(unsigned Opcode) {
44039 switch(Opcode) {
44040 case ISD::AND: return X86ISD::FAND;
44041 case ISD::OR: return X86ISD::FOR;
44042 case ISD::XOR: return X86ISD::FXOR;
44043 case X86ISD::ANDNP: return X86ISD::FANDN;
44044 }
44045 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 44045)
;
44046}
44047
44048// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44049static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44050 const SDLoc &DL) {
44051 EVT SrcVT = Src.getValueType();
44052 if (SrcVT != MVT::v4i1)
44053 return SDValue();
44054
44055 switch (Src.getOpcode()) {
44056 case ISD::SETCC:
44057 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44058 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44059 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44060 SDValue Op0 = Src.getOperand(0);
44061 if (ISD::isNormalLoad(Op0.getNode()))
44062 return DAG.getBitcast(MVT::v4f32, Op0);
44063 if (Op0.getOpcode() == ISD::BITCAST &&
44064 Op0.getOperand(0).getValueType() == MVT::v4f32)
44065 return Op0.getOperand(0);
44066 }
44067 break;
44068 case ISD::AND:
44069 case ISD::XOR:
44070 case ISD::OR: {
44071 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44072 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44073 if (Op0 && Op1)
44074 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44075 Op1);
44076 break;
44077 }
44078 }
44079 return SDValue();
44080}
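// Note on the SSE1-only path above (illustrative): SSE1 has MOVMSKPS for
// v4f32 but no MOVMSK for v4i32, so a (v4i1 setcc (v4i32 X, 0, setlt)) is
// re-expressed on the v4f32 bit pattern (the per-element sign bits are the
// same bits), letting the caller emit MOVMSK on MVT::v4f32 without ever
// materializing a v4i32 operation.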
44081
44082// Helper to push sign extension of vXi1 SETCC result through bitops.
44083static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44084 SDValue Src, const SDLoc &DL) {
44085 switch (Src.getOpcode()) {
44086 case ISD::SETCC:
44087 case ISD::TRUNCATE:
44088 case ISD::BUILD_VECTOR:
44089 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44090 case ISD::AND:
44091 case ISD::XOR:
44092 case ISD::OR:
44093 return DAG.getNode(
44094 Src.getOpcode(), DL, SExtVT,
44095 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44096 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44097 case ISD::SELECT:
44098 case ISD::VSELECT:
44099 return DAG.getSelect(
44100 DL, SExtVT, Src.getOperand(0),
44101 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44102 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44103 }
44104 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 44104)
;
44105}
44106
44107// Try to match patterns such as
44108// (i16 bitcast (v16i1 x))
44109// ->
44110// (i16 movmsk (16i8 sext (v16i1 x)))
44111// before the illegal vector is scalarized on subtargets that don't have legal
44112// vxi1 types.
44113static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44114 const SDLoc &DL,
44115 const X86Subtarget &Subtarget) {
44116 EVT SrcVT = Src.getValueType();
44117 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44118 return SDValue();
44119
44120 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44121 // legalization destroys the v4i32 type.
44122 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44123 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44124 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44125 DAG.getBitcast(MVT::v4f32, V));
44126 return DAG.getZExtOrTrunc(V, DL, VT);
44127 }
44128 }
44129
44130 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44131 // movmskb even with avx512. This will be better than truncating to vXi1 and
44132 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44133 // vpcmpeqb/vpcmpgtb.
44134 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44135 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44136 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44137 Src.getOperand(0).getValueType() == MVT::v64i8);
44138
44139 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44140 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44141 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44142 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44143 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44144 EVT CmpVT = Src.getOperand(0).getValueType();
44145 EVT EltVT = CmpVT.getVectorElementType();
44146 if (CmpVT.getSizeInBits() <= 256 &&
44147 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44148 PreferMovMsk = true;
44149 }
44150
44151 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44152 // MOVMSK is supported in SSE2 or later.
44153 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44154 return SDValue();
44155
44156 // If the upper ops of a concatenation are undef, then try to bitcast the
44157 // lower op and extend.
44158 SmallVector<SDValue, 4> SubSrcOps;
44159 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44160 SubSrcOps.size() >= 2) {
44161 SDValue LowerOp = SubSrcOps[0];
44162 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44163 if (LowerOp.getOpcode() == ISD::SETCC &&
44164 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44165 EVT SubVT = VT.getIntegerVT(
44166 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44167 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44168 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44169 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44170 }
44171 }
44172 }
44173
44174 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44175 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44176 // v8i16 and v16i16.
44177 // For these two cases, we can shuffle the upper element bytes to a
44178 // consecutive sequence at the start of the vector and treat the results as
44179 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44180 // for v16i16 this is not the case, because the shuffle is expensive, so we
44181 // avoid sign-extending to this type entirely.
44182 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44183 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44184 MVT SExtVT;
44185 bool PropagateSExt = false;
44186 switch (SrcVT.getSimpleVT().SimpleTy) {
44187 default:
44188 return SDValue();
44189 case MVT::v2i1:
44190 SExtVT = MVT::v2i64;
44191 break;
44192 case MVT::v4i1:
44193 SExtVT = MVT::v4i32;
44194 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44195 // sign-extend to a 256-bit operation to avoid truncation.
44196 if (Subtarget.hasAVX() &&
44197 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44198 SExtVT = MVT::v4i64;
44199 PropagateSExt = true;
44200 }
44201 break;
44202 case MVT::v8i1:
44203 SExtVT = MVT::v8i16;
44204 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44205 // sign-extend to a 256-bit operation to match the compare.
44206 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44207 // 256-bit because the shuffle is cheaper than sign extending the result of
44208 // the compare.
44209 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44210 checkBitcastSrcVectorSize(Src, 512, true))) {
44211 SExtVT = MVT::v8i32;
44212 PropagateSExt = true;
44213 }
44214 break;
44215 case MVT::v16i1:
44216 SExtVT = MVT::v16i8;
44217 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44218 // it is not profitable to sign-extend to 256-bit because this will
44219 // require an extra cross-lane shuffle which is more expensive than
44220 // truncating the result of the compare to 128-bits.
44221 break;
44222 case MVT::v32i1:
44223 SExtVT = MVT::v32i8;
44224 break;
44225 case MVT::v64i1:
44226 // If we have AVX512F but not AVX512BW, the input was already checked to be
44227 // a truncate from v64i8 earlier, so split the input and make two pmovmskbs.
44228 if (Subtarget.hasAVX512()) {
44229 if (Subtarget.hasBWI())
44230 return SDValue();
44231 SExtVT = MVT::v64i8;
44232 break;
44233 }
44234 // Split if this is a <64 x i8> comparison result.
44235 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44236 SExtVT = MVT::v64i8;
44237 break;
44238 }
44239 return SDValue();
44240 };
44241
44242 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44243 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44244
44245 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44246 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44247 } else {
44248 if (SExtVT == MVT::v8i16)
44249 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44250 DAG.getUNDEF(MVT::v8i16));
44251 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44252 }
44253
44254 EVT IntVT =
44255 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44256 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44257 return DAG.getBitcast(VT, V);
44258}
44259
44260// Convert a vXi1 constant build vector to the same width scalar integer.
44261static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44262 EVT SrcVT = Op.getValueType();
44263  assert(SrcVT.getVectorElementType() == MVT::i1 &&
44264         "Expected a vXi1 vector");
44265  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44266         "Expected a constant build vector");
44267
44268 APInt Imm(SrcVT.getVectorNumElements(), 0);
44269 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44270 SDValue In = Op.getOperand(Idx);
44271 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44272 Imm.setBit(Idx);
44273 }
44274 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44275 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44276}
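// Illustrative example of the conversion above (values chosen arbitrarily): a
// constant (v4i1 <1, 0, 1, 1>) becomes the i4 value 0b1101, i.e. element 0
// maps to bit 0, element 1 to bit 1, and so on; undef elements become 0.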
44277
44278static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44279 TargetLowering::DAGCombinerInfo &DCI,
44280 const X86Subtarget &Subtarget) {
44281 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast")(static_cast <bool> (N->getOpcode() == ISD::BITCAST &&
"Expected a bitcast") ? void (0) : __assert_fail ("N->getOpcode() == ISD::BITCAST && \"Expected a bitcast\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 44281, __extension__
__PRETTY_FUNCTION__))
;
44282
44283 if (!DCI.isBeforeLegalizeOps())
44284 return SDValue();
44285
44286 // Only do this if we have k-registers.
44287 if (!Subtarget.hasAVX512())
44288 return SDValue();
44289
44290 EVT DstVT = N->getValueType(0);
44291 SDValue Op = N->getOperand(0);
44292 EVT SrcVT = Op.getValueType();
44293
44294 if (!Op.hasOneUse())
44295 return SDValue();
44296
44297 // Look for logic ops.
44298 if (Op.getOpcode() != ISD::AND &&
44299 Op.getOpcode() != ISD::OR &&
44300 Op.getOpcode() != ISD::XOR)
44301 return SDValue();
44302
44303 // Make sure we have a bitcast between mask registers and a scalar type.
44304 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44305 DstVT.isScalarInteger()) &&
44306 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44307 SrcVT.isScalarInteger()))
44308 return SDValue();
44309
44310 SDValue LHS = Op.getOperand(0);
44311 SDValue RHS = Op.getOperand(1);
44312
44313 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44314 LHS.getOperand(0).getValueType() == DstVT)
44315 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44316 DAG.getBitcast(DstVT, RHS));
44317
44318 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44319 RHS.getOperand(0).getValueType() == DstVT)
44320 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44321 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44322
44323 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44324 // Most of these have to move a constant from the scalar domain anyway.
44325 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44326 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44327 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44328 DAG.getBitcast(DstVT, LHS), RHS);
44329 }
44330
44331 return SDValue();
44332}
44333
44334static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44335 const X86Subtarget &Subtarget) {
44336 SDLoc DL(BV);
44337 unsigned NumElts = BV->getNumOperands();
44338 SDValue Splat = BV->getSplatValue();
44339
44340 // Build MMX element from integer GPR or SSE float values.
44341 auto CreateMMXElement = [&](SDValue V) {
44342 if (V.isUndef())
44343 return DAG.getUNDEF(MVT::x86mmx);
44344 if (V.getValueType().isFloatingPoint()) {
44345 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44346 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44347 V = DAG.getBitcast(MVT::v2i64, V);
44348 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44349 }
44350 V = DAG.getBitcast(MVT::i32, V);
44351 } else {
44352 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44353 }
44354 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44355 };
44356
44357 // Convert build vector ops to MMX data in the bottom elements.
44358 SmallVector<SDValue, 8> Ops;
44359
44360 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44361
44362 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44363 if (Splat) {
44364 if (Splat.isUndef())
44365 return DAG.getUNDEF(MVT::x86mmx);
44366
44367 Splat = CreateMMXElement(Splat);
44368
44369 if (Subtarget.hasSSE1()) {
44370 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44371 if (NumElts == 8)
44372 Splat = DAG.getNode(
44373 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44374 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44375 TLI.getPointerTy(DAG.getDataLayout())),
44376 Splat, Splat);
44377
44378 // Use PSHUFW to repeat 16-bit elements.
44379 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44380 return DAG.getNode(
44381 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44382 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44383 TLI.getPointerTy(DAG.getDataLayout())),
44384 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44385 }
44386 Ops.append(NumElts, Splat);
44387 } else {
44388 for (unsigned i = 0; i != NumElts; ++i)
44389 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44390 }
44391
44392 // Use tree of PUNPCKLs to build up general MMX vector.
44393 while (Ops.size() > 1) {
44394 unsigned NumOps = Ops.size();
44395 unsigned IntrinOp =
44396 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44397 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44398 : Intrinsic::x86_mmx_punpcklbw));
44399 SDValue Intrin = DAG.getTargetConstant(
44400 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44401 for (unsigned i = 0; i != NumOps; i += 2)
44402 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44403 Ops[i], Ops[i + 1]);
44404 Ops.resize(NumOps / 2);
44405 }
44406
44407 return Ops[0];
44408}
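// Note on the PUNPCKL tree above (illustrative): for a non-splat v8i8 build
// vector, the eight single-element MMX values are merged pairwise, 8 -> 4 with
// punpcklbw, 4 -> 2 with punpcklwd, and 2 -> 1 with punpckldq, so the final
// MMX register holds all elements in order.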
44409
44410// Recursive function that attempts to find if a bool vector node was originally
44411// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44412// integer. If so, replace the scalar ops with bool vector equivalents back down
44413// the chain.
44414static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44415 SelectionDAG &DAG,
44416 const X86Subtarget &Subtarget) {
44417 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44418 unsigned Opc = V.getOpcode();
44419 switch (Opc) {
44420 case ISD::BITCAST: {
44421 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44422 SDValue Src = V.getOperand(0);
44423 EVT SrcVT = Src.getValueType();
44424 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44425 return DAG.getBitcast(VT, Src);
44426 break;
44427 }
44428 case ISD::TRUNCATE: {
44429 // If we find a suitable source, a truncated scalar becomes a subvector.
44430 SDValue Src = V.getOperand(0);
44431 EVT NewSrcVT =
44432 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44433 if (TLI.isTypeLegal(NewSrcVT))
44434 if (SDValue N0 =
44435 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44436 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44437 DAG.getIntPtrConstant(0, DL));
44438 break;
44439 }
44440 case ISD::ANY_EXTEND:
44441 case ISD::ZERO_EXTEND: {
44442 // If we find a suitable source, an extended scalar becomes a subvector.
44443 SDValue Src = V.getOperand(0);
44444 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44445 Src.getScalarValueSizeInBits());
44446 if (TLI.isTypeLegal(NewSrcVT))
44447 if (SDValue N0 =
44448 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44449 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44450 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44451 : DAG.getConstant(0, DL, VT),
44452 N0, DAG.getIntPtrConstant(0, DL));
44453 break;
44454 }
44455 case ISD::OR: {
44456 // If we find suitable sources, we can just move an OR to the vector domain.
44457 SDValue Src0 = V.getOperand(0);
44458 SDValue Src1 = V.getOperand(1);
44459 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44460 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44461 return DAG.getNode(Opc, DL, VT, N0, N1);
44462 break;
44463 }
44464 case ISD::SHL: {
44465 // If we find a suitable source, a SHL becomes a KSHIFTL.
44466 SDValue Src0 = V.getOperand(0);
44467 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44468 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44469 break;
44470
44471 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44472 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44473 return DAG.getNode(
44474 X86ISD::KSHIFTL, DL, VT, N0,
44475 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44476 break;
44477 }
44478 }
44479 return SDValue();
44480}
44481
44482static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44483 TargetLowering::DAGCombinerInfo &DCI,
44484 const X86Subtarget &Subtarget) {
44485 SDValue N0 = N->getOperand(0);
44486 EVT VT = N->getValueType(0);
44487 EVT SrcVT = N0.getValueType();
44488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44489
44490 // Try to match patterns such as
44491 // (i16 bitcast (v16i1 x))
44492 // ->
44493 // (i16 movmsk (16i8 sext (v16i1 x)))
44494 // before the setcc result is scalarized on subtargets that don't have legal
44495 // vxi1 types.
44496 if (DCI.isBeforeLegalize()) {
44497 SDLoc dl(N);
44498 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44499 return V;
44500
44501 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44502 // type, widen both sides to avoid a trip through memory.
44503 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44504 Subtarget.hasAVX512()) {
44505 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44506 N0 = DAG.getBitcast(MVT::v8i1, N0);
44507 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44508 DAG.getIntPtrConstant(0, dl));
44509 }
44510
44511 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44512 // type, widen both sides to avoid a trip through memory.
44513 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44514 Subtarget.hasAVX512()) {
44515 // Use zeros for the widening if we already have some zeroes. This can
44516 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
44517 // stream of this.
44518 // FIXME: It might make sense to detect a concat_vectors with a mix of
44519 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44520 // a separate combine. What we can't do is canonicalize the operands of
44521 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44522 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44523 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44524 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44525 SrcVT = LastOp.getValueType();
44526 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44527 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44528 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44529 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44530 N0 = DAG.getBitcast(MVT::i8, N0);
44531 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44532 }
44533 }
44534
44535 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44536 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44537 Ops[0] = N0;
44538 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44539 N0 = DAG.getBitcast(MVT::i8, N0);
44540 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44541 }
44542 } else {
44543 // If we're bitcasting from iX to vXi1, see if the integer originally
44544 // began as a vXi1 and whether we can remove the bitcast entirely.
44545 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44546 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44547 if (SDValue V =
44548 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44549 return V;
44550 }
44551 }
44552
44553 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44554 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44555 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44556 // we can help with known bits propagation from the vXi1 domain to the
44557 // scalar domain.
44558 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44559 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44560 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44561 isNullConstant(N0.getOperand(1)))
44562 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44563 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44564
44565 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44566 // and the vbroadcast_load are both integer or both fp. In some cases this
44567 // will remove the bitcast entirely.
44568 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44569 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44570 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44571 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44572 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44573 // Don't swap i8/i16 since we don't have fp types of that size.
44574 if (MemSize >= 32) {
44575 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44576 : MVT::getIntegerVT(MemSize);
44577 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44578 : MVT::getIntegerVT(SrcVTSize);
44579 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44580
44581 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44582 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44583 SDValue ResNode =
44584 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44585 MemVT, BCast->getMemOperand());
44586 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44587 return DAG.getBitcast(VT, ResNode);
44588 }
44589 }
44590
44591 // Since MMX types are special and don't usually play with other vector types,
44592 // it's better to handle them early to be sure we emit efficient code by
44593 // avoiding store-load conversions.
44594 if (VT == MVT::x86mmx) {
44595 // Detect MMX constant vectors.
44596 APInt UndefElts;
44597 SmallVector<APInt, 1> EltBits;
44598 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44599 SDLoc DL(N0);
44600 // Handle zero-extension of i32 with MOVD.
44601 if (EltBits[0].countl_zero() >= 32)
44602 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44603 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44604 // Else, bitcast to a double.
44605 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44606 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44607 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44608 }
44609
44610 // Detect bitcasts to x86mmx low word.
44611 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44612 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44613 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44614 bool LowUndef = true, AllUndefOrZero = true;
44615 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44616 SDValue Op = N0.getOperand(i);
44617 LowUndef &= Op.isUndef() || (i >= e/2);
44618 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44619 }
44620 if (AllUndefOrZero) {
44621 SDValue N00 = N0.getOperand(0);
44622 SDLoc dl(N00);
44623 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44624 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44625 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44626 }
44627 }
44628
44629 // Detect bitcasts of 64-bit build vectors and convert to a
44630 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44631 // lowest element.
44632 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44633 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44634 SrcVT == MVT::v8i8))
44635 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44636
44637 // Detect bitcasts between element or subvector extraction to x86mmx.
44638 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44639 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44640 isNullConstant(N0.getOperand(1))) {
44641 SDValue N00 = N0.getOperand(0);
44642 if (N00.getValueType().is128BitVector())
44643 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44644 DAG.getBitcast(MVT::v2i64, N00));
44645 }
44646
44647 // Detect bitcasts from FP_TO_SINT to x86mmx.
44648 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44649 SDLoc DL(N0);
44650 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44651 DAG.getUNDEF(MVT::v2i32));
44652 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44653 DAG.getBitcast(MVT::v2i64, Res));
44654 }
44655 }
44656
44657 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44658 // most of these to scalar anyway.
44659 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44660 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44661 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44662 return combinevXi1ConstantToInteger(N0, DAG);
44663 }
44664
44665 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44666 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44667 isa<ConstantSDNode>(N0)) {
44668 auto *C = cast<ConstantSDNode>(N0);
44669 if (C->isAllOnes())
44670 return DAG.getConstant(1, SDLoc(N0), VT);
44671 if (C->isZero())
44672 return DAG.getConstant(0, SDLoc(N0), VT);
44673 }
44674
44675 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44676 // Turn it into a sign bit compare that produces a k-register. This avoids
44677 // a trip through a GPR.
44678 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44679 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44680 isPowerOf2_32(VT.getVectorNumElements())) {
44681 unsigned NumElts = VT.getVectorNumElements();
44682 SDValue Src = N0;
44683
44684 // Peek through truncate.
44685 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44686 Src = N0.getOperand(0);
44687
44688 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44689 SDValue MovmskIn = Src.getOperand(0);
44690 MVT MovmskVT = MovmskIn.getSimpleValueType();
44691 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44692
44693 // We allow extra bits of the movmsk to be used since they are known zero.
44694 // We can't convert a VPMOVMSKB without avx512bw.
44695 if (MovMskElts <= NumElts &&
44696 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44697 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44698 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44699 SDLoc dl(N);
44700 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44701 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44702 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44703 if (EVT(CmpVT) == VT)
44704 return Cmp;
44705
44706 // Pad with zeroes up to original VT to replace the zeroes that were
44707 // being used from the MOVMSK.
44708 unsigned NumConcats = NumElts / MovMskElts;
44709 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44710 Ops[0] = Cmp;
44711 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44712 }
44713 }
44714 }
44715
44716 // Try to remove bitcasts from input and output of mask arithmetic to
44717 // remove GPR<->K-register crossings.
44718 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44719 return V;
44720
44721 // Convert a bitcasted integer logic operation that has one bitcasted
44722 // floating-point operand into a floating-point logic operation. This may
44723 // create a load of a constant, but that is cheaper than materializing the
44724 // constant in an integer register and transferring it to an SSE register or
44725 // transferring the SSE operand to integer register and back.
44726 unsigned FPOpcode;
44727 switch (N0.getOpcode()) {
44728 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44729 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44730 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44731 default: return SDValue();
44732 }
44733
44734 // Check if we have a bitcast from another integer type as well.
44735 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44736 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44737 (Subtarget.hasFP16() && VT == MVT::f16) ||
44738 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44739 TLI.isTypeLegal(VT))))
44740 return SDValue();
44741
44742 SDValue LogicOp0 = N0.getOperand(0);
44743 SDValue LogicOp1 = N0.getOperand(1);
44744 SDLoc DL0(N0);
44745
44746 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44747 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44748 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44749 LogicOp0.getOperand(0).getValueType() == VT &&
44750 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44751 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44752 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44753 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44754 }
44755 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44756 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44757 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44758 LogicOp1.getOperand(0).getValueType() == VT &&
44759 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44760 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44761 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44762 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44763 }
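// Illustrative example (editorial note, not part of the LLVM source): assuming
// SSE1 and a non-constant X : f32, the first fold above turns
//   bitcast f32 (and i32 (bitcast i32 X), Y)
// into
//   FAND f32 X, (bitcast f32 Y)
// so the integer logic op becomes an SSE logic op and no GPR<->XMM transfer of
// X is needed; if Y is a constant, the bitcast may become a constant-pool load.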
44764
44765 return SDValue();
44766}
44767
44768// (mul (zext a), (sext b))
44769static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44770 SDValue &Op1) {
44771 Op0 = Mul.getOperand(0);
44772 Op1 = Mul.getOperand(1);
44773
44774 // Canonicalize so that the sign-extended operand ends up in Op1.
44775 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44776 std::swap(Op0, Op1);
44777
44778 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44779 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44780 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44781 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44782 return true;
44783
44784 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44785 return (BV && BV->isConstant());
44786 };
44787
44788 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned
44789 // value, so check that Op0 is a zero-extended value. Op1 must be a signed
44790 // value, so it is enough to check its significant (sign) bits.
44791 if ((IsFreeTruncation(Op0) &&
44792 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44793 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44794 return true;
44795
44796 return false;
44797}
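// Illustrative note (editorial, not part of the LLVM source): the accepted
// shape is e.g.
//   (mul (zext vXi8 A to vXi32), (sext vXi8 B to vXi32))
// where A supplies the unsigned operand (at most 8 known-active bits) and B
// the signed operand (at most 8 significant bits), matching VPDPBUSD's
// unsigned-i8 x signed-i8 inputs.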
44798
44799// Given an ABS node, detect the following pattern:
44800// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44801// This is useful as it is the input into a SAD pattern.
44802static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44803 SDValue AbsOp1 = Abs->getOperand(0);
44804 if (AbsOp1.getOpcode() != ISD::SUB)
44805 return false;
44806
44807 Op0 = AbsOp1.getOperand(0);
44808 Op1 = AbsOp1.getOperand(1);
44809
44810 // Check if the operands of the sub are zero-extended from vectors of i8.
44811 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44812 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44813 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44814 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44815 return false;
44816
44817 return true;
44818}
44819
44820static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44821 unsigned &LogBias, const SDLoc &DL,
44822 const X86Subtarget &Subtarget) {
44823 // Extend or truncate to MVT::i8 first.
44824 MVT Vi8VT =
44825 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44826 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44827 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44828
44829 // VPDPBUSD(<4 x i32> C, <16 x i8> A, <16 x i8> B): for each dst element,
44830 // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
44831 // The source A/B element type is i8, but the dst C element type is i32.
44832 // The reduction stage count is computed from the vXi8 source type, so a
44833 // LogBias of 2 is needed to skip the 2 stages that VPDPBUSD already performs.
44834 LogBias = 2;
44835
44836 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44837 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44838 RegSize = std::max(512u, RegSize);
44839
44840 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44841 // fill in the missing vector elements with 0.
44842 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44843 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44844 Ops[0] = LHS;
44845 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44846 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44847 Ops[0] = RHS;
44848 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44849
44850 // Actually build the DotProduct, split as 256/512 bits for
44851 // AVXVNNI/AVX512VNNI.
44852 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44853 ArrayRef<SDValue> Ops) {
44854 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44855 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44856 };
44857 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44858 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44859
44860 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44861 DpBuilder, false);
44862}
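// Illustrative example (editorial, not part of the LLVM source): for a
// reduction over 16 i8 products, a plain shuffle+add tree would need
// log2(16) = 4 stages, but each VPDPBUSD i32 lane already sums a group of 4
// byte products, so the caller only performs 4 - LogBias = 2 remaining
// shuffle+add stages.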
44863
44864// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44865// to these zexts.
44866static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44867 const SDValue &Zext1, const SDLoc &DL,
44868 const X86Subtarget &Subtarget) {
44869 // Find the appropriate width for the PSADBW.
44870 EVT InVT = Zext0.getOperand(0).getValueType();
44871 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44872
44873 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44874 // fill in the missing vector elements with 0.
44875 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44876 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44877 Ops[0] = Zext0.getOperand(0);
44878 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44879 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44880 Ops[0] = Zext1.getOperand(0);
44881 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44882
44883 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44884 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44885 ArrayRef<SDValue> Ops) {
44886 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44887 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44888 };
44889 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44890 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44891 PSADBWBuilder);
44892}
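// Illustrative example (editorial, not part of the LLVM source): for zexts of
// v4i8 inputs, RegSize = max(128, 32) = 128 and NumConcat = 128 / 32 = 4, so
// each v4i8 operand is concatenated with three v4i8 zero vectors into a v16i8
// before the v2i64 PSADBW is built; the zero lanes contribute nothing to the
// sums.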
44893
44894// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44895// PHMINPOSUW.
44896static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44897 const X86Subtarget &Subtarget) {
44898 // Bail without SSE41.
44899 if (!Subtarget.hasSSE41())
44900 return SDValue();
44901
44902 EVT ExtractVT = Extract->getValueType(0);
44903 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44904 return SDValue();
44905
44906 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44907 ISD::NodeType BinOp;
44908 SDValue Src = DAG.matchBinOpReduction(
44909 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44910 if (!Src)
44911 return SDValue();
44912
44913 EVT SrcVT = Src.getValueType();
44914 EVT SrcSVT = SrcVT.getScalarType();
44915 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44916 return SDValue();
44917
44918 SDLoc DL(Extract);
44919 SDValue MinPos = Src;
44920
44921 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44922 while (SrcVT.getSizeInBits() > 128) {
44923 SDValue Lo, Hi;
44924 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44925 SrcVT = Lo.getValueType();
44926 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44927 }
44928 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44929 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44930 "Unexpected value type");
44931
44932 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
44933 // to flip the value accordingly.
44934 SDValue Mask;
44935 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44936 if (BinOp == ISD::SMAX)
44937 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44938 else if (BinOp == ISD::SMIN)
44939 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44940 else if (BinOp == ISD::UMAX)
44941 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44942
44943 if (Mask)
44944 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
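// Illustrative note (editorial, not part of the LLVM source): the XOR masks
// map the other reductions onto the unsigned-min that PHMINPOSUW computes;
// e.g. for i16 elements:
//   SMIN: x ^ 0x8000 flips the sign bit, so signed order becomes unsigned.
//   SMAX: x ^ 0x7FFF keeps the sign bit and inverts the magnitude bits,
//         mapping a signed max onto an unsigned min.
//   UMAX: x ^ 0xFFFF (a NOT) turns an unsigned max into an unsigned min.
// The matching XOR after the PHMINPOS below undoes the mapping on the result.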
44945
44946 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44947 // shuffling each upper element down and inserting zeros. This means that the
44948 // v16i8 UMIN will leave the upper byte of each pair as zero, performing the
44949 // zero-extension ready for the PHMINPOS.
44950 if (ExtractVT == MVT::i8) {
44951 SDValue Upper = DAG.getVectorShuffle(
44952 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44953 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44954 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44955 }
44956
44957 // Perform the PHMINPOS on a v8i16 vector.
44958 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44959 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44960 MinPos = DAG.getBitcast(SrcVT, MinPos);
44961
44962 if (Mask)
44963 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44964
44965 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44966 DAG.getIntPtrConstant(0, DL));
44967}
44968
44969// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44970static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44971 const X86Subtarget &Subtarget) {
44972 // Bail without SSE2.
44973 if (!Subtarget.hasSSE2())
44974 return SDValue();
44975
44976 EVT ExtractVT = Extract->getValueType(0);
44977 unsigned BitWidth = ExtractVT.getSizeInBits();
44978 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44979 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44980 return SDValue();
44981
44982 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44983 ISD::NodeType BinOp;
44984 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44985 if (!Match && ExtractVT == MVT::i1)
44986 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44987 if (!Match)
44988 return SDValue();
44989
44990 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44991 // which we can't support here for now.
44992 if (Match.getScalarValueSizeInBits() != BitWidth)
44993 return SDValue();
44994
44995 SDValue Movmsk;
44996 SDLoc DL(Extract);
44997 EVT MatchVT = Match.getValueType();
44998 unsigned NumElts = MatchVT.getVectorNumElements();
44999 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45001 LLVMContext &Ctx = *DAG.getContext();
45002
45003 if (ExtractVT == MVT::i1) {
45004 // Special case for (pre-legalization) vXi1 reductions.
45005 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45006 return SDValue();
45007 if (Match.getOpcode() == ISD::SETCC) {
45008 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45009 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45010 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45011 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45012 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45013 X86::CondCode X86CC;
45014 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45015 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45016 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45017 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45018 DAG, X86CC))
45019 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45020 getSETCC(X86CC, V, DL, DAG));
45021 }
45022 }
45023 if (TLI.isTypeLegal(MatchVT)) {
45024 // If this is a legal AVX512 predicate type then we can just bitcast.
45025 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45026 Movmsk = DAG.getBitcast(MovmskVT, Match);
45027 } else {
45028 // Use combineBitcastvxi1 to create the MOVMSK.
45029 while (NumElts > MaxElts) {
45030 SDValue Lo, Hi;
45031 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45032 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45033 NumElts /= 2;
45034 }
45035 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45036 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45037 }
45038 if (!Movmsk)
45039 return SDValue();
45040 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45041 } else {
45042 // FIXME: Better handling of k-registers or 512-bit vectors?
45043 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45044 if (!(MatchSizeInBits == 128 ||
45045 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45046 return SDValue();
45047
45048 // Make sure this isn't a vector of 1 element. The perf win from using
45050 // MOVMSK diminishes with fewer elements in the reduction, but it is
45050 // generally better to get the comparison over to the GPRs as soon as
45051 // possible to reduce the number of vector ops.
45052 if (Match.getValueType().getVectorNumElements() < 2)
45053 return SDValue();
45054
45055 // Check that we are extracting a reduction of all sign bits.
45056 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45057 return SDValue();
45058
45059 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45060 SDValue Lo, Hi;
45061 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45062 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45063 MatchSizeInBits = Match.getValueSizeInBits();
45064 }
45065
45066 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45067 MVT MaskSrcVT;
45068 if (64 == BitWidth || 32 == BitWidth)
45069 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45070 MatchSizeInBits / BitWidth);
45071 else
45072 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45073
45074 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45075 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45076 NumElts = MaskSrcVT.getVectorNumElements();
45077 }
45078 assert((NumElts <= 32 || NumElts == 64) &&
45079 "Not expecting more than 64 elements");
45080
45081 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45082 if (BinOp == ISD::XOR) {
45083 // parity -> (PARITY(MOVMSK X))
45084 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45085 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45086 }
45087
45088 SDValue CmpC;
45089 ISD::CondCode CondCode;
45090 if (BinOp == ISD::OR) {
45091 // any_of -> MOVMSK != 0
45092 CmpC = DAG.getConstant(0, DL, CmpVT);
45093 CondCode = ISD::CondCode::SETNE;
45094 } else {
45095 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45096 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45097 DL, CmpVT);
45098 CondCode = ISD::CondCode::SETEQ;
45099 }
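// Illustrative example (editorial, not part of the LLVM source): for a v16i8
// compare input, MOVMSK yields one bit per element, so the reductions become:
//   any_of: MOVMSK != 0
//   all_of: MOVMSK == (1 << 16) - 1 = 0xFFFF
// which is exactly the APInt::getLowBitsSet(32, 16) constant built above.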
45100
45101 // The setcc produces an i8 of 0/1, so extend that to the result width and
45102 // negate to get the final 0/-1 mask value.
45103 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45104 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45105 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45106 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45107 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45108}
45109
45110static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45111 const X86Subtarget &Subtarget) {
45112 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45113 return SDValue();
45114
45115 EVT ExtractVT = Extract->getValueType(0);
45116 // Verify the type we're extracting is i32, as the output element type of
45117 // vpdpbusd is i32.
45118 if (ExtractVT != MVT::i32)
45119 return SDValue();
45120
45121 EVT VT = Extract->getOperand(0).getValueType();
45122 if (!isPowerOf2_32(VT.getVectorNumElements()))
45123 return SDValue();
45124
45125 // Match shuffle + add pyramid.
45126 ISD::NodeType BinOp;
45127 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45128
45129 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45130 // done by vpdpbusd computes a signed 16-bit product that will be sign-extended
45131 // before adding into the accumulator.
45132 // TODO:
45133 // We also need to verify that the multiply has at least 2x the number of bits
45134 // of the input. We shouldn't match
45135 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
45136 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45137 // Root = Root.getOperand(0);
45138
45139 // If there was a match, we want Root to be a mul.
45140 if (!Root || Root.getOpcode() != ISD::MUL)
45141 return SDValue();
45142
45143 // Check whether we have an extend and mul pattern
45144 SDValue LHS, RHS;
45145 if (!detectExtMul(DAG, Root, LHS, RHS))
45146 return SDValue();
45147
45148 // Create the dot product instruction.
45149 SDLoc DL(Extract);
45150 unsigned StageBias;
45151 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45152
45153 // If the original vector was wider than 4 elements, sum over the results
45154 // in the DP vector.
45155 unsigned Stages = Log2_32(VT.getVectorNumElements());
45156 EVT DpVT = DP.getValueType();
45157
45158 if (Stages > StageBias) {
45159 unsigned DpElems = DpVT.getVectorNumElements();
45160
45161 for (unsigned i = Stages - StageBias; i > 0; --i) {
45162 SmallVector<int, 16> Mask(DpElems, -1);
45163 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45164 Mask[j] = MaskEnd + j;
45165
45166 SDValue Shuffle =
45167 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45168 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45169 }
45170 }
45171
45172 // Return the lowest ExtractSizeInBits bits.
45173 EVT ResVT =
45174 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45175 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45176 DP = DAG.getBitcast(ResVT, DP);
45177 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45178 Extract->getOperand(1));
45179}
45180
45181static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45182 const X86Subtarget &Subtarget) {
45183 // PSADBW is only supported on SSE2 and up.
45184 if (!Subtarget.hasSSE2())
45185 return SDValue();
45186
45187 EVT ExtractVT = Extract->getValueType(0);
45188 // Verify the type we're extracting is either i32 or i64.
45189 // FIXME: Could support other types, but this is what we have coverage for.
45190 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45191 return SDValue();
45192
45193 EVT VT = Extract->getOperand(0).getValueType();
45194 if (!isPowerOf2_32(VT.getVectorNumElements()))
45195 return SDValue();
45196
45197 // Match shuffle + add pyramid.
45198 ISD::NodeType BinOp;
45199 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45200
45201 // The operand is expected to be zero-extended from i8
45202 // (verified in detectZextAbsDiff).
45203 // To convert to i64 and above, an additional any/zero/sign
45204 // extend is expected.
45205 // The zero extend from 32 bits has no mathematical effect on the result.
45206 // The sign extend also behaves like a zero extend here
45207 // (it extends the sign bit, which is zero).
45208 // So it is correct to skip the sign/zero extend instruction.
45209 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45210 Root.getOpcode() == ISD::ZERO_EXTEND ||
45211 Root.getOpcode() == ISD::ANY_EXTEND))
45212 Root = Root.getOperand(0);
45213
45214 // If there was a match, we want Root to be the ABS at the root of an
45215 // abs-diff pattern.
45216 if (!Root || Root.getOpcode() != ISD::ABS)
45217 return SDValue();
45218
45219 // Check whether we have an abs-diff pattern feeding into the ABS.
45220 SDValue Zext0, Zext1;
45221 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45222 return SDValue();
45223
45224 // Create the SAD instruction.
45225 SDLoc DL(Extract);
45226 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45227
45228 // If the original vector was wider than 8 elements, sum over the results
45229 // in the SAD vector.
45230 unsigned Stages = Log2_32(VT.getVectorNumElements());
45231 EVT SadVT = SAD.getValueType();
45232 if (Stages > 3) {
45233 unsigned SadElems = SadVT.getVectorNumElements();
45234
45235 for(unsigned i = Stages - 3; i > 0; --i) {
45236 SmallVector<int, 16> Mask(SadElems, -1);
45237 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45238 Mask[j] = MaskEnd + j;
45239
45240 SDValue Shuffle =
45241 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45242 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45243 }
45244 }
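// Illustrative example (editorial, not part of the LLVM source): assuming AVX2
// and 32-element vectors (v32i8 abs-diff inputs), the PSADBW result is v4i64
// and Stages = log2(32) = 5, so the loop runs for i = 2 and i = 1 with masks
// {2,3,-1,-1} and {1,-1,-1,-1}, folding the upper partial sums into element 0
// before the final extract.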
45245
45246 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45247 // Return the lowest ExtractSizeInBits bits.
45248 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45249 SadVT.getSizeInBits() / ExtractSizeInBits);
45250 SAD = DAG.getBitcast(ResVT, SAD);
45251 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45252 Extract->getOperand(1));
45253}
45254
45255// Attempt to peek through a target shuffle and extract the scalar from the
45256// source.
45257static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45258 TargetLowering::DAGCombinerInfo &DCI,
45259 const X86Subtarget &Subtarget) {
45260 if (DCI.isBeforeLegalizeOps())
45261 return SDValue();
45262
45263 SDLoc dl(N);
45264 SDValue Src = N->getOperand(0);
45265 SDValue Idx = N->getOperand(1);
45266
45267 EVT VT = N->getValueType(0);
45268 EVT SrcVT = Src.getValueType();
45269 EVT SrcSVT = SrcVT.getVectorElementType();
45270 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45271 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45272
45273 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45274 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45275 return SDValue();
45276
45277 const APInt &IdxC = N->getConstantOperandAPInt(1);
45278 if (IdxC.uge(NumSrcElts))
45279 return SDValue();
45280
45281 SDValue SrcBC = peekThroughBitcasts(Src);
45282
45283 // Handle extract(bitcast(broadcast(scalar_value))).
45284 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45285 SDValue SrcOp = SrcBC.getOperand(0);
45286 EVT SrcOpVT = SrcOp.getValueType();
45287 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45288 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45289 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45290 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45291 // TODO support non-zero offsets.
45292 if (Offset == 0) {
45293 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45294 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45295 return SrcOp;
45296 }
45297 }
45298 }
45299
45300 // If we're extracting a single element from a broadcast load and there are
45301 // no other users, just create a single load.
45302 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45303 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45304 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45305 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45306 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45307 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45308 MemIntr->getBasePtr(),
45309 MemIntr->getPointerInfo(),
45310 MemIntr->getOriginalAlign(),
45311 MemIntr->getMemOperand()->getFlags());
45312 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45313 return Load;
45314 }
45315 }
45316
45317 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45318 // TODO: Move to DAGCombine?
45319 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45320 SrcBC.getValueType().isInteger() &&
45321 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45322 SrcBC.getScalarValueSizeInBits() ==
45323 SrcBC.getOperand(0).getValueSizeInBits()) {
45324 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45325 if (IdxC.ult(Scale)) {
45326 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45327 SDValue Scl = SrcBC.getOperand(0);
45328 EVT SclVT = Scl.getValueType();
45329 if (Offset) {
45330 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45331 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45332 }
45333 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45334 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45335 return Scl;
45336 }
45337 }
45338
45339 // Handle extract(truncate(x)) for 0'th index.
45340 // TODO: Treat this as a faux shuffle?
45341 // TODO: When can we use this for general indices?
45342 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45343 (SrcVT.getSizeInBits() % 128) == 0) {
45344 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45345 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45346 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45347 Idx);
45348 }
45349
45350 // We can only legally extract other elements from 128-bit vectors, and only
45351 // in certain circumstances depending on the SSE level.
45352 // TODO: Investigate float/double extraction if it will be just stored.
45353 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45354 unsigned Idx) {
45355 EVT VecSVT = VecVT.getScalarType();
45356 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45357 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45358 VecSVT == MVT::i64)) {
45359 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45360 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45361 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45362 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45363 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45364 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45365 Idx &= (NumEltsPerLane - 1);
45366 }
45367 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45368 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45369 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45370 DAG.getBitcast(VecVT, Vec),
45371 DAG.getIntPtrConstant(Idx, dl));
45372 }
45373 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45374 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45375 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45376 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45377 DAG.getTargetConstant(Idx, dl, MVT::i8));
45378 }
45379 return SDValue();
45380 };
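// Illustrative example (editorial, not part of the LLVM source): assuming a
// 256-bit v16i16 source (with Vec also using 16-bit elements) and Idx = 9:
// NumEltsPerLane = 8, so the upper 128-bit lane is extracted, Idx becomes
// 9 & 7 = 1, and the element is then read from the v8i16 lane with PEXTRW.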
45381
45382 // Resolve the target shuffle inputs and mask.
45383 SmallVector<int, 16> Mask;
45384 SmallVector<SDValue, 2> Ops;
45385 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45386 return SDValue();
45387
45388 // Shuffle inputs must be the same size as the result.
45389 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45390 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45391 }))
45392 return SDValue();
45393
45394 // Attempt to narrow/widen the shuffle mask to the correct size.
45395 if (Mask.size() != NumSrcElts) {
45396 if ((NumSrcElts % Mask.size()) == 0) {
45397 SmallVector<int, 16> ScaledMask;
45398 int Scale = NumSrcElts / Mask.size();
45399 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45400 Mask = std::move(ScaledMask);
45401 } else if ((Mask.size() % NumSrcElts) == 0) {
45402 // Simplify Mask based on demanded element.
45403 int ExtractIdx = (int)IdxC.getZExtValue();
45404 int Scale = Mask.size() / NumSrcElts;
45405 int Lo = Scale * ExtractIdx;
45406 int Hi = Scale * (ExtractIdx + 1);
45407 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45408 if (i < Lo || Hi <= i)
45409 Mask[i] = SM_SentinelUndef;
45410
45411 SmallVector<int, 16> WidenedMask;
45412 while (Mask.size() > NumSrcElts &&
45413 canWidenShuffleElements(Mask, WidenedMask))
45414 Mask = std::move(WidenedMask);
45415 }
45416 }
45417
45418 // If narrowing/widening failed, see if we can extract+zero-extend.
45419 int ExtractIdx;
45420 EVT ExtractVT;
45421 if (Mask.size() == NumSrcElts) {
45422 ExtractIdx = Mask[IdxC.getZExtValue()];
45423 ExtractVT = SrcVT;
45424 } else {
45425 unsigned Scale = Mask.size() / NumSrcElts;
45426 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45427 return SDValue();
45428 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45429 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45430 return SDValue();
45431 ExtractIdx = Mask[ScaledIdx];
45432 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45433 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45434 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45435 "Failed to widen vector type");
45436 }
45437
45438 // If the shuffle source element is undef/zero then we can just accept it.
45439 if (ExtractIdx == SM_SentinelUndef)
45440 return DAG.getUNDEF(VT);
45441
45442 if (ExtractIdx == SM_SentinelZero)
45443 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45444 : DAG.getConstant(0, dl, VT);
45445
45446 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45447 ExtractIdx = ExtractIdx % Mask.size();
45448 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45449 return DAG.getZExtOrTrunc(V, dl, VT);
45450
45451 return SDValue();
45452}
45453
45454/// Extracting a scalar FP value from vector element 0 is free, so extract each
45455/// operand first, then perform the math as a scalar op.
45456static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45457 const X86Subtarget &Subtarget) {
45458 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45459 SDValue Vec = ExtElt->getOperand(0);
45460 SDValue Index = ExtElt->getOperand(1);
45461 EVT VT = ExtElt->getValueType(0);
45462 EVT VecVT = Vec.getValueType();
45463
45464 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45465 // non-zero element because the shuffle+scalar op will be cheaper?
45466 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45467 return SDValue();
45468
45469 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45470 // extract, the condition code), so deal with those as a special-case.
45471 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45472 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45473 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45474 return SDValue();
45475
45476 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45477 SDLoc DL(ExtElt);
45478 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45479 Vec.getOperand(0), Index);
45480 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45481 Vec.getOperand(1), Index);
45482 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45483 }
45484
45485 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45486 VT != MVT::f64)
45487 return SDValue();
45488
45489 // Vector FP selects don't fit the pattern of FP math ops (because the
45490 // condition has a different type and we have to change the opcode), so deal
45491 // with those here.
45492 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45493 // has i1 elements. If we loosen this we need to convert vector bool to a
45494 // scalar bool.
45495 if (Vec.getOpcode() == ISD::VSELECT &&
45496 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45497 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45498 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45499 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45500 SDLoc DL(ExtElt);
45501 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45502 Vec.getOperand(0).getValueType().getScalarType(),
45503 Vec.getOperand(0), Index);
45504 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45505 Vec.getOperand(1), Index);
45506 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45507 Vec.getOperand(2), Index);
45508 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45509 }
45510
45511 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45512 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45513 // missed load folding and fma+fneg combining.
45514 switch (Vec.getOpcode()) {
45515 case ISD::FMA: // Begin 3 operands
45516 case ISD::FMAD:
45517 case ISD::FADD: // Begin 2 operands
45518 case ISD::FSUB:
45519 case ISD::FMUL:
45520 case ISD::FDIV:
45521 case ISD::FREM:
45522 case ISD::FCOPYSIGN:
45523 case ISD::FMINNUM:
45524 case ISD::FMAXNUM:
45525 case ISD::FMINNUM_IEEE:
45526 case ISD::FMAXNUM_IEEE:
45527 case ISD::FMAXIMUM:
45528 case ISD::FMINIMUM:
45529 case X86ISD::FMAX:
45530 case X86ISD::FMIN:
45531 case ISD::FABS: // Begin 1 operand
45532 case ISD::FSQRT:
45533 case ISD::FRINT:
45534 case ISD::FCEIL:
45535 case ISD::FTRUNC:
45536 case ISD::FNEARBYINT:
45537 case ISD::FROUND:
45538 case ISD::FFLOOR:
45539 case X86ISD::FRCP:
45540 case X86ISD::FRSQRT: {
45541 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45542 SDLoc DL(ExtElt);
45543 SmallVector<SDValue, 4> ExtOps;
45544 for (SDValue Op : Vec->ops())
45545 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45546 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45547 }
45548 default:
45549 return SDValue();
45550 }
45551 llvm_unreachable("All opcodes should return within switch");
45552}
45553
45554/// Try to convert a vector reduction sequence composed of binops and shuffles
45555/// into horizontal ops.
45556static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45557 const X86Subtarget &Subtarget) {
45558 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45559
45560 // We need at least SSE2 to do anything here.
45561 if (!Subtarget.hasSSE2())
45562 return SDValue();
45563
45564 ISD::NodeType Opc;
45565 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45566 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45567 if (!Rdx)
45568 return SDValue();
45569
45570 SDValue Index = ExtElt->getOperand(1);
45571 assert(isNullConstant(Index) &&
45572 "Reduction doesn't end in an extract from index 0");
45573
45574 EVT VT = ExtElt->getValueType(0);
45575 EVT VecVT = Rdx.getValueType();
45576 if (VecVT.getScalarType() != VT)
45577 return SDValue();
45578
45579 SDLoc DL(ExtElt);
45580 unsigned NumElts = VecVT.getVectorNumElements();
45581 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45582
45583 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45584 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45585 if (V.getValueType() == MVT::v4i8) {
45586 if (ZeroExtend && Subtarget.hasSSE41()) {
45587 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45588 DAG.getConstant(0, DL, MVT::v4i32),
45589 DAG.getBitcast(MVT::i32, V),
45590 DAG.getIntPtrConstant(0, DL));
45591 return DAG.getBitcast(MVT::v16i8, V);
45592 }
45593 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45594 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45595 : DAG.getUNDEF(MVT::v4i8));
45596 }
45597 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45598 DAG.getUNDEF(MVT::v8i8));
45599 };
45600
45601 // vXi8 mul reduction - promote to vXi16 mul reduction.
45602 if (Opc == ISD::MUL) {
45603 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45604 return SDValue();
45605 if (VecVT.getSizeInBits() >= 128) {
45606 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45607 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45608 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45609 Lo = DAG.getBitcast(WideVT, Lo);
45610 Hi = DAG.getBitcast(WideVT, Hi);
45611 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45612 while (Rdx.getValueSizeInBits() > 128) {
45613 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45614 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45615 }
45616 } else {
45617 Rdx = WidenToV16I8(Rdx, false);
45618 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45619 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45620 }
45621 if (NumElts >= 8)
45622 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45623 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45624 {4, 5, 6, 7, -1, -1, -1, -1}));
45625 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45626 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45627 {2, 3, -1, -1, -1, -1, -1, -1}));
45628 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45629 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45630 {1, -1, -1, -1, -1, -1, -1, -1}));
45631 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45632 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45633 }
45634
45635 // vXi8 add reduction - sub-128-bit vector.
45636 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45637 Rdx = WidenToV16I8(Rdx, true);
45638 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45639 DAG.getConstant(0, DL, MVT::v16i8));
45640 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45641 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45642 }
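// Illustrative note (editorial, not part of the LLVM source): PSADBW against
// an all-zero vector computes the sum of absolute differences |x - 0| = x per
// 8-byte group, i.e. a horizontal byte sum into each i64 lane, which is what
// makes it usable as a small vXi8 ADD reduction here.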
45643
45644 // Must be a >=128-bit vector with pow2 elements.
45645 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45646 return SDValue();
45647
45648 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45649 if (VT == MVT::i8) {
45650 while (Rdx.getValueSizeInBits() > 128) {
45651 SDValue Lo, Hi;
45652 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45653 VecVT = Lo.getValueType();
45654 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45655 }
45656 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45657
45658 SDValue Hi = DAG.getVectorShuffle(
45659 MVT::v16i8, DL, Rdx, Rdx,
45660 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45661 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45662 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45663 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45664 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45665 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45666 }
45667
45668 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45669 // If the source vector values are 0-255, then we can use PSADBW to
45670 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45671 // TODO: See if it's worth avoiding vXi16/i32 truncations?
45672 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45673 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45674 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45675 Subtarget.hasAVX512())) {
45676 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45677 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45678 if (ByteVT.getSizeInBits() < 128)
45679 Rdx = WidenToV16I8(Rdx, true);
45680
45681 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45682 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45683 ArrayRef<SDValue> Ops) {
45684 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45685 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45686 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45687 };
45688 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45689 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45690
45691 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45692 while (Rdx.getValueSizeInBits() > 128) {
45693 SDValue Lo, Hi;
45694 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45695 VecVT = Lo.getValueType();
45696 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45697 }
45698 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45699
45700 if (NumElts > 8) {
45701 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45702 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45703 }
45704
45705 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45706 Rdx = DAG.getBitcast(VecVT, Rdx);
45707 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45708 }
45709
45710 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
45711 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45712 return SDValue();
45713
45714 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45715
45716 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45717 // across the whole vector, so we need an extract + hop preliminary stage.
45718 // This is the only step where the operands of the hop are not the same value.
45719 // TODO: We could extend this to handle 512-bit or even longer vectors.
45720 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45721 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45722 unsigned NumElts = VecVT.getVectorNumElements();
45723 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45724 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45725 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45726 VecVT = Rdx.getValueType();
45727 }
45728 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45729 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45730 return SDValue();
45731
45732 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45733 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45734 for (unsigned i = 0; i != ReductionSteps; ++i)
45735 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45736
45737 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45738}
45739
45740/// Detect vector gather/scatter index generation and convert it from being a
45741/// bunch of shuffles and extracts into a somewhat faster sequence.
45742/// For i686, the best sequence is apparently storing the value and loading
45743/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45744static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45745 TargetLowering::DAGCombinerInfo &DCI,
45746 const X86Subtarget &Subtarget) {
45747 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45748 return NewOp;
45749
45750 SDValue InputVector = N->getOperand(0);
45751 SDValue EltIdx = N->getOperand(1);
45752 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45753
45754 EVT SrcVT = InputVector.getValueType();
45755 EVT VT = N->getValueType(0);
45756 SDLoc dl(InputVector);
45757 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45758 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45759 unsigned NumEltBits = VT.getScalarSizeInBits();
45760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45761
45762 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45763 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45764
45765 // Integer Constant Folding.
45766 if (CIdx && VT.isInteger()) {
45767 APInt UndefVecElts;
45768 SmallVector<APInt, 16> EltBits;
45769 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45770 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45771 EltBits, true, false)) {
45772 uint64_t Idx = CIdx->getZExtValue();
45773 if (UndefVecElts[Idx])
45774 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45775 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45776 }
45777
45778 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45779 // Improves lowering of bool masks in Rust, which splits them into a byte array.
45780 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45781 SDValue Src = peekThroughBitcasts(InputVector);
45782 if (Src.getValueType().getScalarType() == MVT::i1 &&
45783 TLI.isTypeLegal(Src.getValueType())) {
45784 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45785 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45786 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45787 return DAG.getBitcast(VT, Sub);
45788 }
45789 }
45790 }
45791
45792 if (IsPextr) {
45793 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45794 DCI))
45795 return SDValue(N, 0);
45796
45797 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45798 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45799 InputVector.getOpcode() == X86ISD::PINSRW) &&
45800 InputVector.getOperand(2) == EltIdx) {
45801 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45802 "Vector type mismatch");
45803 SDValue Scl = InputVector.getOperand(1);
45804 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45805 return DAG.getZExtOrTrunc(Scl, dl, VT);
45806 }
45807
45808 // TODO - Remove this once we can handle the implicit zero-extension of
45809 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45810 // combineBasicSADPattern.
45811 return SDValue();
45812 }
45813
45814 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
45815 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
45816 InputVector.getOpcode() == ISD::BITCAST &&
45817 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45818 isNullConstant(EltIdx) && InputVector.hasOneUse())
45819 return DAG.getBitcast(VT, InputVector);
45820
45821 // Detect mmx to i32 conversion through a v2i32 elt extract.
45822 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
45823 InputVector.getOpcode() == ISD::BITCAST &&
45824 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45825 isNullConstant(EltIdx) && InputVector.hasOneUse())
45826 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
45827 InputVector.getOperand(0));
45828
45829 // Check whether this extract is the root of a sum of absolute differences
45830 // pattern. This has to be done here because we really want it to happen
45831 // pre-legalization.
45832 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45833 return SAD;
45834
45835 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45836 return VPDPBUSD;
45837
45838 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45839 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45840 return Cmp;
45841
45842 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45843 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45844 return MinMax;
45845
45846 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45847 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45848 return V;
45849
45850 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45851 return V;
45852
45853 // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
45854 // and then testing the relevant element.
45855 //
45856 // Note that we only combine extracts on the *same* result number, i.e.
45857 // t0 = merge_values a0, a1, a2, a3
45858 // i1 = extract_vector_elt t0, Constant:i64<2>
45859 // i1 = extract_vector_elt t0, Constant:i64<3>
45860 // but not
45861 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45862 // since the latter would need its own MOVMSK.
45863 if (SrcVT.getScalarType() == MVT::i1) {
45864 bool IsVar = !CIdx;
45865 SmallVector<SDNode *, 16> BoolExtracts;
45866 unsigned ResNo = InputVector.getResNo();
45867 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45868 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45869 Use->getOperand(0).getResNo() == ResNo &&
45870 Use->getValueType(0) == MVT::i1) {
45871 BoolExtracts.push_back(Use);
45872 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45873 return true;
45874 }
45875 return false;
45876 };
45877 // TODO: Can we drop the oneuse check for constant extracts?
45878 if (all_of(InputVector->uses(), IsBoolExtract) &&
45879 (IsVar || BoolExtracts.size() > 1)) {
45880 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45881 if (SDValue BC =
45882 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45883 for (SDNode *Use : BoolExtracts) {
45884 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45885 // Mask = 1 << MaskIdx
45886 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45887 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45888 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45889 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45890 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45891 DCI.CombineTo(Use, Res);
45892 }
45893 return SDValue(N, 0);
45894 }
45895 }
45896 }
45897
45898 // If this extract is from a loaded vector value and will be used as an
45899 // integer, that requires a potentially expensive XMM -> GPR transfer.
45900 // Additionally, if we can convert to a scalar integer load, that will likely
45901 // be folded into a subsequent integer op.
45902 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45903 // to a single use of the loaded vector. For the reasons above, we
45904 // expect this to be profitable even if it creates an extra load.
45905 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45906 return Use->getOpcode() == ISD::STORE ||
45907 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45908 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45909 });
45910 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45911 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45912 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45913 !LikelyUsedAsVector && LoadVec->isSimple()) {
45914 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45915 SDValue NewPtr =
45916 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45917 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45918 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45919 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45920 SDValue Load =
45921 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45922 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45923 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45924 return Load;
45925 }
45926
45927 return SDValue();
45928}
45929
45930// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45931// This is more or less the reverse of combineBitcastvxi1.
45932static SDValue combineToExtendBoolVectorInReg(
45933 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45934 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45935 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45936 Opcode != ISD::ANY_EXTEND)
45937 return SDValue();
45938 if (!DCI.isBeforeLegalizeOps())
45939 return SDValue();
45940 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45941 return SDValue();
45942
45943 EVT SVT = VT.getScalarType();
45944 EVT InSVT = N0.getValueType().getScalarType();
45945 unsigned EltSizeInBits = SVT.getSizeInBits();
45946
45947 // Input type must be extending a bool vector (bit-casted from a scalar
45948 // integer) to legal integer types.
45949 if (!VT.isVector())
45950 return SDValue();
45951 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45952 return SDValue();
45953 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45954 return SDValue();
45955
45956 SDValue N00 = N0.getOperand(0);
45957 EVT SclVT = N00.getValueType();
45958 if (!SclVT.isScalarInteger())
45959 return SDValue();
45960
45961 SDValue Vec;
45962 SmallVector<int> ShuffleMask;
45963 unsigned NumElts = VT.getVectorNumElements();
45964 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45965
45966 // Broadcast the scalar integer to the vector elements.
45967 if (NumElts > EltSizeInBits) {
45968 // If the scalar integer is greater than the vector element size, then we
45969 // must split it down into sub-sections for broadcasting. For example:
45970 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45971 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45972 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45973 unsigned Scale = NumElts / EltSizeInBits;
45974 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45975 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45976 Vec = DAG.getBitcast(VT, Vec);
45977
45978 for (unsigned i = 0; i != Scale; ++i)
45979 ShuffleMask.append(EltSizeInBits, i);
45980 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45981 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45982 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45983 // If we have register broadcast instructions, use the scalar size as the
45984 // element type for the shuffle. Then cast to the wider element type. The
45985 // widened bits won't be used, and this might allow the use of a broadcast
45986 // load.
45987 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45988 unsigned Scale = EltSizeInBits / NumElts;
45989 EVT BroadcastVT =
45990 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45991 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45992 ShuffleMask.append(NumElts * Scale, 0);
45993 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45994 Vec = DAG.getBitcast(VT, Vec);
45995 } else {
45996 // For a smaller scalar integer, we can simply any-extend it to the vector
45997 // element size (we don't care about the upper bits) and broadcast it to all
45998 // elements.
45999 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46000 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46001 ShuffleMask.append(NumElts, 0);
46002 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46003 }
46004
46005 // Now, mask the relevant bit in each element.
46006 SmallVector<SDValue, 32> Bits;
46007 for (unsigned i = 0; i != NumElts; ++i) {
46008 int BitIdx = (i % EltSizeInBits);
46009 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46010 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46011 }
46012 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46013 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46014
46015 // Compare against the bitmask and extend the result.
46016 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46017 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46018 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46019
46020 // For SEXT, this is now done, otherwise shift the result down for
46021 // zero-extension.
46022 if (Opcode == ISD::SIGN_EXTEND)
46023 return Vec;
46024 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46025 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46026}
46027
46028/// If a vector select has an operand that is -1 or 0, try to simplify the
46029/// select to a bitwise logic operation.
46030/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46031static SDValue
46032combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46033 TargetLowering::DAGCombinerInfo &DCI,
46034 const X86Subtarget &Subtarget) {
46035 SDValue Cond = N->getOperand(0);
46036 SDValue LHS = N->getOperand(1);
46037 SDValue RHS = N->getOperand(2);
46038 EVT VT = LHS.getValueType();
46039 EVT CondVT = Cond.getValueType();
46040 SDLoc DL(N);
46041 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46042
46043 if (N->getOpcode() != ISD::VSELECT)
46044 return SDValue();
46045
46046 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46047
46048 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46049 // TODO: Can we assert that both operands are not zeros (because that should
46050 // get simplified at node creation time)?
46051 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46052 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46053
46054 // If both inputs are 0/undef, create a complete zero vector.
46055 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46056 if (TValIsAllZeros && FValIsAllZeros) {
46057 if (VT.isFloatingPoint())
46058 return DAG.getConstantFP(0.0, DL, VT);
46059 return DAG.getConstant(0, DL, VT);
46060 }
46061
46062 // To use the condition operand as a bitwise mask, it must have elements that
46063 // are the same size as the select elements. Ie, the condition operand must
46064 // have already been promoted from the IR select condition type <N x i1>.
46065 // Don't check if the types themselves are equal because that excludes
46066 // vector floating-point selects.
46067 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46068 return SDValue();
46069
46070 // Try to invert the condition if true value is not all 1s and false value is
46071 // not all 0s. Only do this if the condition has one use.
46072 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46073 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46074 // Check if the selector will be produced by CMPP*/PCMP*.
46075 Cond.getOpcode() == ISD::SETCC &&
46076 // Check if SETCC has already been promoted.
46077 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46078 CondVT) {
46079 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46080
46081 if (TValIsAllZeros || FValIsAllOnes) {
46082 SDValue CC = Cond.getOperand(2);
46083 ISD::CondCode NewCC = ISD::getSetCCInverse(
46084 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46085 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46086 NewCC);
46087 std::swap(LHS, RHS);
46088 TValIsAllOnes = FValIsAllOnes;
46089 FValIsAllZeros = TValIsAllZeros;
46090 }
46091 }
46092
46093 // Cond value must be 'sign splat' to be converted to a logical op.
46094 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46095 return SDValue();
46096
46097 // vselect Cond, 111..., 000... -> Cond
46098 if (TValIsAllOnes && FValIsAllZeros)
46099 return DAG.getBitcast(VT, Cond);
46100
46101 if (!TLI.isTypeLegal(CondVT))
46102 return SDValue();
46103
46104 // vselect Cond, 111..., X -> or Cond, X
46105 if (TValIsAllOnes) {
46106 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46107 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46108 return DAG.getBitcast(VT, Or);
46109 }
46110
46111 // vselect Cond, X, 000... -> and Cond, X
46112 if (FValIsAllZeros) {
46113 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46114 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46115 return DAG.getBitcast(VT, And);
46116 }
46117
46118 // vselect Cond, 000..., X -> andn Cond, X
46119 if (TValIsAllZeros) {
46120 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46121 SDValue AndN;
46122 // The canonical form differs for i1 vectors - x86andnp is not used
46123 if (CondVT.getScalarType() == MVT::i1)
46124 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46125 CastRHS);
46126 else
46127 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46128 return DAG.getBitcast(VT, AndN);
46129 }
46130
46131 return SDValue();
46132}
46133
46134/// If both arms of a vector select are concatenated vectors, split the select,
46135/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46136/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46137/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46138static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46139 const X86Subtarget &Subtarget) {
46140 unsigned Opcode = N->getOpcode();
46141 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46142 return SDValue();
46143
46144 // TODO: Split 512-bit vectors too?
46145 EVT VT = N->getValueType(0);
46146 if (!VT.is256BitVector())
46147 return SDValue();
46148
46149 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46150 SDValue Cond = N->getOperand(0);
46151 SDValue TVal = N->getOperand(1);
46152 SDValue FVal = N->getOperand(2);
46153 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46154 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46155 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46156 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46157 return SDValue();
46158
46159 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46160 ArrayRef<SDValue> Ops) {
46161 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46162 };
46163 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46164 makeBlend, /*CheckBWI*/ false);
46165}
46166
46167static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46168 SDValue Cond = N->getOperand(0);
46169 SDValue LHS = N->getOperand(1);
46170 SDValue RHS = N->getOperand(2);
46171 SDLoc DL(N);
46172
46173 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46174 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46175 if (!TrueC || !FalseC)
46176 return SDValue();
46177
46178 // Don't do this for crazy integer types.
46179 EVT VT = N->getValueType(0);
46180 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46181 return SDValue();
46182
46183 // We're going to use the condition bit in math or logic ops. We could allow
46184 // this with a wider condition value (post-legalization it becomes an i8),
46185 // but if nothing is creating selects that late, it doesn't matter.
46186 if (Cond.getValueType() != MVT::i1)
46187 return SDValue();
46188
46189 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46190 // 3, 5, or 9 with i32/i64, so those get transformed too.
46191 // TODO: For constants that overflow or do not differ by power-of-2 or small
46192 // multiplier, convert to 'and' + 'add'.
46193 const APInt &TrueVal = TrueC->getAPIntValue();
46194 const APInt &FalseVal = FalseC->getAPIntValue();
46195
46196 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46197 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46198 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46199 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46200 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46201 return SDValue();
46202 }
46203
46204 bool OV;
46205 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46206 if (OV)
46207 return SDValue();
46208
46209 APInt AbsDiff = Diff.abs();
46210 if (AbsDiff.isPowerOf2() ||
46211 ((VT == MVT::i32 || VT == MVT::i64) &&
46212 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46213
46214 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46215 // of the condition can usually be folded into a compare predicate, but even
46216 // without that, the sequence should be cheaper than a CMOV alternative.
46217 if (TrueVal.slt(FalseVal)) {
46218 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46219 std::swap(TrueC, FalseC);
46220 }
46221
46222 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
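// e.g. select Cond, 5, 2 --> (zext(Cond) * 3) + 2, where the multiply by 3
// can lower to an LEA instead of a CMOV (illustrative).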
46223 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46224
46225 // Multiply condition by the difference if non-one.
46226 if (!AbsDiff.isOne())
46227 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46228
46229 // Add the base if non-zero.
46230 if (!FalseC->isZero())
46231 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46232
46233 return R;
46234 }
46235
46236 return SDValue();
46237}
46238
46239/// If this is a *dynamic* select (non-constant condition) and we can match
46240/// this node with one of the variable blend instructions, restructure the
46241/// condition so that blends can use the high (sign) bit of each element.
46242/// This function will also call SimplifyDemandedBits on already created
46243/// BLENDV to perform additional simplifications.
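/// For example (sketch): a vselect whose condition is a lane-wise compare
/// result (all-ones or zero per element) can use BLENDVPS/PBLENDVB, which
/// only inspect the sign bit of each condition element.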
46244static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46245 TargetLowering::DAGCombinerInfo &DCI,
46246 const X86Subtarget &Subtarget) {
46247 SDValue Cond = N->getOperand(0);
46248 if ((N->getOpcode() != ISD::VSELECT &&
46249 N->getOpcode() != X86ISD::BLENDV) ||
46250 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46251 return SDValue();
46252
46253 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46254 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46255 EVT VT = N->getValueType(0);
46256
46257 // We can only handle the cases where VSELECT is directly legal on the
46258 // subtarget. We custom lower VSELECT nodes with constant conditions and
46259 // this makes it hard to see whether a dynamic VSELECT will correctly
46260 // lower, so we both check the operation's status and explicitly handle the
46261 // cases where a *dynamic* blend will fail even though a constant-condition
46262 // blend could be custom lowered.
46263 // FIXME: We should find a better way to handle this class of problems.
46264 // Potentially, we should combine constant-condition vselect nodes
46265 // pre-legalization into shuffles and not mark as many types as custom
46266 // lowered.
46267 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46268 return SDValue();
46269 // FIXME: We don't support i16-element blends currently. We could and
46270 // should support them by making *all* the bits in the condition be set
46271 // rather than just the high bit and using an i8-element blend.
46272 if (VT.getVectorElementType() == MVT::i16)
46273 return SDValue();
46274 // Dynamic blending was only available from SSE4.1 onward.
46275 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46276 return SDValue();
46277 // Byte blends are only available in AVX2
46278 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46279 return SDValue();
46280 // There are no 512-bit blend instructions that use sign bits.
46281 if (VT.is512BitVector())
46282 return SDValue();
46283
46284 // Don't optimize before the condition has been transformed to a legal type
46285 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46286 if (BitWidth < 8 || BitWidth > 64)
46287 return SDValue();
46288
46289 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46290 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46291 UI != UE; ++UI)
46292 if ((UI->getOpcode() != ISD::VSELECT &&
46293 UI->getOpcode() != X86ISD::BLENDV) ||
46294 UI.getOperandNo() != 0)
46295 return false;
46296
46297 return true;
46298 };
46299
46300 APInt DemandedBits(APInt::getSignMask(BitWidth));
46301
46302 if (OnlyUsedAsSelectCond(Cond)) {
46303 KnownBits Known;
46304 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46305 !DCI.isBeforeLegalizeOps());
46306 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46307 return SDValue();
46308
46309 // If we changed the computation somewhere in the DAG, this change will
46310 // affect all users of Cond. Update all the nodes so that we do not use
46311 // the generic VSELECT anymore. Otherwise, we may perform wrong
46312 // optimizations as we messed with the actual expectation for the vector
46313 // boolean values.
46314 for (SDNode *U : Cond->uses()) {
46315 if (U->getOpcode() == X86ISD::BLENDV)
46316 continue;
46317
46318 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46319 Cond, U->getOperand(1), U->getOperand(2));
46320 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46321 DCI.AddToWorklist(U);
46322 }
46323 DCI.CommitTargetLoweringOpt(TLO);
46324 return SDValue(N, 0);
46325 }
46326
46327 // Otherwise we can still at least try to simplify multiple use bits.
46328 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46329 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46330 N->getOperand(1), N->getOperand(2));
46331
46332 return SDValue();
46333}
46334
46335// Try to match:
46336// (or (and (M, (sub 0, X)), (pandn M, X)))
46337// which is a special case of:
46338// (select M, (sub 0, X), X)
46339// Per:
46340// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46341// We know that, if fNegate is 0 or 1:
46342// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46343//
46344// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46345// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46346// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46347// This lets us transform our vselect to:
46348// (add (xor X, M), (and M, 1))
46349// And further to:
46350// (sub (xor X, M), M)
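// Sanity check (illustrative): with M = all-ones, (xor X, M) - M = ~X + 1 = -X;
// with M = 0, (xor X, 0) - 0 = X.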
46351static SDValue combineLogicBlendIntoConditionalNegate(
46352 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46353 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46354 EVT MaskVT = Mask.getValueType();
46355 assert(MaskVT.isInteger() &&
46356 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46357 "Mask must be zero/all-bits");
46358
46359 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46360 return SDValue();
46361 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46362 return SDValue();
46363
46364 auto IsNegV = [](SDNode *N, SDValue V) {
46365 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46366 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46367 };
46368
46369 SDValue V;
46370 if (IsNegV(Y.getNode(), X))
46371 V = X;
46372 else if (IsNegV(X.getNode(), Y))
46373 V = Y;
46374 else
46375 return SDValue();
46376
46377 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46378 SDValue SubOp2 = Mask;
46379
46380 // If the negate was on the false side of the select, then
46381 // the operands of the SUB need to be swapped. PR 27251.
46382 // This is because the pattern being matched above is
46383 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46384 // but if the pattern matched was
46385 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46386 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46387 // pattern also needs to be a negation of the replacement pattern above.
46388 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46389 // sub accomplishes the negation of the replacement pattern.
46390 if (V == Y)
46391 std::swap(SubOp1, SubOp2);
46392
46393 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46394 return DAG.getBitcast(VT, Res);
46395}
46396
46397/// Do target-specific dag combines on SELECT and VSELECT nodes.
46398static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46399 TargetLowering::DAGCombinerInfo &DCI,
46400 const X86Subtarget &Subtarget) {
46401 SDLoc DL(N);
46402 SDValue Cond = N->getOperand(0);
46403 SDValue LHS = N->getOperand(1);
46404 SDValue RHS = N->getOperand(2);
46405
46406 // Try simplification again because we use this function to optimize
46407 // BLENDV nodes that are not handled by the generic combiner.
46408 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46409 return V;
46410
46411 EVT VT = LHS.getValueType();
46412 EVT CondVT = Cond.getValueType();
46413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46414 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46415
46416 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46417 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46418 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46419 if (CondVT.isVector() && CondVT.isInteger() &&
46420 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46421 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46422 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46423 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46424 DL, DAG, Subtarget))
46425 return V;
46426
46427 // Convert vselects with constant condition into shuffles.
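// e.g. (vselect <-1,0,-1,0>, X, Y) -> shuffle X, Y, <0,5,2,7> (illustrative).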
46428 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46429 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46430 SmallVector<int, 64> Mask;
46431 if (createShuffleMaskFromVSELECT(Mask, Cond,
46432 N->getOpcode() == X86ISD::BLENDV))
46433 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46434 }
46435
46436 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46437 // by forcing the unselected elements to zero.
46438 // TODO: Can we handle more shuffles with this?
46439 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46440 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46441 LHS.hasOneUse() && RHS.hasOneUse()) {
46442 MVT SimpleVT = VT.getSimpleVT();
46443 SmallVector<SDValue, 1> LHSOps, RHSOps;
46444 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46445 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46446 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46447 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46448 int NumElts = VT.getVectorNumElements();
46449 for (int i = 0; i != NumElts; ++i) {
46450 // getConstVector sets negative shuffle mask values as undef, so ensure
46451 // we hardcode SM_SentinelZero values to zero (0x80).
46452 if (CondMask[i] < NumElts) {
46453 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46454 RHSMask[i] = 0x80;
46455 } else {
46456 LHSMask[i] = 0x80;
46457 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46458 }
46459 }
46460 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46461 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46462 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46463 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46464 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46465 }
46466 }
46467
46468 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46469 // instructions match the semantics of the common C idiom x<y?x:y but not
46470 // x<=y?x:y, because of how they handle negative zero (which can be
46471 // ignored in unsafe-math mode).
46472 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46473 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46474 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46475 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46476 (Subtarget.hasSSE2() ||
46477 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46478 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46479
46480 unsigned Opcode = 0;
46481 // Check for x CC y ? x : y.
46482 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46483 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46484 switch (CC) {
46485 default: break;
46486 case ISD::SETULT:
46487 // Converting this to a min would handle NaNs incorrectly, and swapping
46488 // the operands would cause it to handle comparisons between positive
46489 // and negative zero incorrectly.
46490 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46491 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46492 !(DAG.isKnownNeverZeroFloat(LHS) ||
46493 DAG.isKnownNeverZeroFloat(RHS)))
46494 break;
46495 std::swap(LHS, RHS);
46496 }
46497 Opcode = X86ISD::FMIN;
46498 break;
46499 case ISD::SETOLE:
46500 // Converting this to a min would handle comparisons between positive
46501 // and negative zero incorrectly.
46502 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46503 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46504 break;
46505 Opcode = X86ISD::FMIN;
46506 break;
46507 case ISD::SETULE:
46508 // Converting this to a min would handle both negative zeros and NaNs
46509 // incorrectly, but we can swap the operands to fix both.
46510 std::swap(LHS, RHS);
46511 [[fallthrough]];
46512 case ISD::SETOLT:
46513 case ISD::SETLT:
46514 case ISD::SETLE:
46515 Opcode = X86ISD::FMIN;
46516 break;
46517
46518 case ISD::SETOGE:
46519 // Converting this to a max would handle comparisons between positive
46520 // and negative zero incorrectly.
46521 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46522 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46523 break;
46524 Opcode = X86ISD::FMAX;
46525 break;
46526 case ISD::SETUGT:
46527 // Converting this to a max would handle NaNs incorrectly, and swapping
46528 // the operands would cause it to handle comparisons between positive
46529 // and negative zero incorrectly.
46530 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46531 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46532 !(DAG.isKnownNeverZeroFloat(LHS) ||
46533 DAG.isKnownNeverZeroFloat(RHS)))
46534 break;
46535 std::swap(LHS, RHS);
46536 }
46537 Opcode = X86ISD::FMAX;
46538 break;
46539 case ISD::SETUGE:
46540 // Converting this to a max would handle both negative zeros and NaNs
46541 // incorrectly, but we can swap the operands to fix both.
46542 std::swap(LHS, RHS);
46543 [[fallthrough]];
46544 case ISD::SETOGT:
46545 case ISD::SETGT:
46546 case ISD::SETGE:
46547 Opcode = X86ISD::FMAX;
46548 break;
46549 }
46550 // Check for x CC y ? y : x -- a min/max with reversed arms.
46551 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46552 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46553 switch (CC) {
46554 default: break;
46555 case ISD::SETOGE:
46556 // Converting this to a min would handle comparisons between positive
46557 // and negative zero incorrectly, and swapping the operands would
46558 // cause it to handle NaNs incorrectly.
46559 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46560 !(DAG.isKnownNeverZeroFloat(LHS) ||
46561 DAG.isKnownNeverZeroFloat(RHS))) {
46562 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46563 break;
46564 std::swap(LHS, RHS);
46565 }
46566 Opcode = X86ISD::FMIN;
46567 break;
46568 case ISD::SETUGT:
46569 // Converting this to a min would handle NaNs incorrectly.
46570 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46571 break;
46572 Opcode = X86ISD::FMIN;
46573 break;
46574 case ISD::SETUGE:
46575 // Converting this to a min would handle both negative zeros and NaNs
46576 // incorrectly, but we can swap the operands to fix both.
46577 std::swap(LHS, RHS);
46578 [[fallthrough]];
46579 case ISD::SETOGT:
46580 case ISD::SETGT:
46581 case ISD::SETGE:
46582 Opcode = X86ISD::FMIN;
46583 break;
46584
46585 case ISD::SETULT:
46586 // Converting this to a max would handle NaNs incorrectly.
46587 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46588 break;
46589 Opcode = X86ISD::FMAX;
46590 break;
46591 case ISD::SETOLE:
46592 // Converting this to a max would handle comparisons between positive
46593 // and negative zero incorrectly, and swapping the operands would
46594 // cause it to handle NaNs incorrectly.
46595 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46596 !DAG.isKnownNeverZeroFloat(LHS) &&
46597 !DAG.isKnownNeverZeroFloat(RHS)) {
46598 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46599 break;
46600 std::swap(LHS, RHS);
46601 }
46602 Opcode = X86ISD::FMAX;
46603 break;
46604 case ISD::SETULE:
46605 // Converting this to a max would handle both negative zeros and NaNs
46606 // incorrectly, but we can swap the operands to fix both.
46607 std::swap(LHS, RHS);
46608 [[fallthrough]];
46609 case ISD::SETOLT:
46610 case ISD::SETLT:
46611 case ISD::SETLE:
46612 Opcode = X86ISD::FMAX;
46613 break;
46614 }
46615 }
46616
46617 if (Opcode)
46618 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46619 }
46620
46621 // Some mask scalar intrinsics rely on checking if only one bit is set
46622 // and implement it in C code like this:
46623 // A[0] = (U & 1) ? A[0] : W[0];
46624 // This creates some redundant instructions that break pattern matching.
46625 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46626 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46627 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46628 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46629 SDValue AndNode = Cond.getOperand(0);
46630 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46631 isNullConstant(Cond.getOperand(1)) &&
46632 isOneConstant(AndNode.getOperand(1))) {
46633 // LHS and RHS swapped due to
46634 // setcc outputting 1 when AND resulted in 0 and vice versa.
46635 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46636 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46637 }
46638 }
46639
46640 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46641 // lowering on KNL. In this case we convert it to
46642 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46643 // The same situation applies to all vectors of i8 and i16 without BWI.
46644 // Make sure we extend these even before type legalization gets a chance to
46645 // split wide vectors.
46646 // Since SKX these selects have a proper lowering.
46647 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46648 CondVT.getVectorElementType() == MVT::i1 &&
46649 (VT.getVectorElementType() == MVT::i8 ||
46650 VT.getVectorElementType() == MVT::i16)) {
46651 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46652 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46653 }
46654
46655 // AVX512 - Extend select with zero to merge with target shuffle.
46656 // select(mask, extract_subvector(shuffle(x)), zero) -->
46657 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46658 // TODO - support non target shuffles as well.
46659 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46660 CondVT.getVectorElementType() == MVT::i1) {
46661 auto SelectableOp = [&TLI](SDValue Op) {
46662 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46663 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46664 isNullConstant(Op.getOperand(1)) &&
46665 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46666 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46667 };
46668
46669 bool SelectableLHS = SelectableOp(LHS);
46670 bool SelectableRHS = SelectableOp(RHS);
46671 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46672 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46673
46674 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46675 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46676 : RHS.getOperand(0).getValueType();
46677 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46678 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46679 VT.getSizeInBits());
46680 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46681 VT.getSizeInBits());
46682 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46683 DAG.getUNDEF(SrcCondVT), Cond,
46684 DAG.getIntPtrConstant(0, DL));
46685 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46686 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46687 }
46688 }
46689
46690 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46691 return V;
46692
46693 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46694 Cond.hasOneUse()) {
46695 EVT CondVT = Cond.getValueType();
46696 SDValue Cond0 = Cond.getOperand(0);
46697 SDValue Cond1 = Cond.getOperand(1);
46698 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46699
46700 // Canonicalize min/max:
46701 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46702 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46703 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46704 // the need for an extra compare against zero. e.g.
46705 // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
46706 // subl %esi, %edi
46707 // testl %edi, %edi
46708 // movl $0, %eax
46709 // cmovgl %edi, %eax
46710 // =>
46711 // xorl %eax, %eax
46712 // subl %esi, $edi
46713 // cmovsl %eax, %edi
46714 //
46715 // We can also canonicalize
46716 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46717 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46718 // This allows the use of a test instruction for the compare.
46719 if (LHS == Cond0 && RHS == Cond1) {
46720 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46721 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46722 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46723 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46724 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46725 }
46726 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46727 ISD::CondCode NewCC = ISD::SETUGE;
46728 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46729 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46730 }
46731 }
46732
46733 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46734 // fold eq + gt/lt nested selects into ge/le selects
46735 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46736 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46737 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46738 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46739 // .. etc ..
46740 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46741 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46742 SDValue InnerSetCC = RHS.getOperand(0);
46743 ISD::CondCode InnerCC =
46744 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46745 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46746 Cond0 == InnerSetCC.getOperand(0) &&
46747 Cond1 == InnerSetCC.getOperand(1)) {
46748 ISD::CondCode NewCC;
46749 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46750 case ISD::SETGT: NewCC = ISD::SETGE; break;
46751 case ISD::SETLT: NewCC = ISD::SETLE; break;
46752 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46753 case ISD::SETULT: NewCC = ISD::SETULE; break;
46754 default: NewCC = ISD::SETCC_INVALID; break;
46755 }
46756 if (NewCC != ISD::SETCC_INVALID) {
46757 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46758 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46759 }
46760 }
46761 }
46762 }
46763
46764 // Check if the first operand is all zeros and Cond type is vXi1.
46765 // If this an avx512 target we can improve the use of zero masking by
46766 // swapping the operands and inverting the condition.
46767 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46768 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46769 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46770 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46771 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46772 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46773 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46774 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46775 }
46776
46777 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46778 // get split by legalization.
46779 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46780 CondVT.getVectorElementType() == MVT::i1 &&
46781 TLI.isTypeLegal(VT.getScalarType())) {
46782 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46783 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46784 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46785 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46786 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46787 }
46788 }
46789
46790 // Early exit check
46791 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46792 return SDValue();
46793
46794 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46795 return V;
46796
46797 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46798 return V;
46799
46800 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46801 return V;
46802
46803 // select(~Cond, X, Y) -> select(Cond, Y, X)
46804 if (CondVT.getScalarType() != MVT::i1) {
46805 if (SDValue CondNot = IsNOT(Cond, DAG))
46806 return DAG.getNode(N->getOpcode(), DL, VT,
46807 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46808
46809 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46810 // signbit.
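// (pcmpgt(X, -1) is the "X >= 0" mask while pcmpgt(0, X) is the "X < 0"
// mask, so the select operands are swapped below to compensate.)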
46811 if (Cond.getOpcode() == X86ISD::PCMPGT &&
46812 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
46813 Cond.hasOneUse()) {
46814 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46815 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46816 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46817 }
46818 }
46819
46820 // Try to optimize vXi1 selects if both operands are either all constants or
46821 // bitcasts from scalar integer type. In that case we can convert the operands
46822 // to integer and use an integer select which will be converted to a CMOV.
46823 // We need to take a little bit of care to avoid creating an i64 type after
46824 // type legalization.
46825 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46826 VT.getVectorElementType() == MVT::i1 &&
46827 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46828 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46829 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
46830 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46831 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46832
46833 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
46834 LHS.getOperand(0).getValueType() == IntVT)) &&
46835 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
46836 RHS.getOperand(0).getValueType() == IntVT))) {
46837 if (LHSIsConst)
46838 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46839 else
46840 LHS = LHS.getOperand(0);
46841
46842 if (RHSIsConst)
46843 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46844 else
46845 RHS = RHS.getOperand(0);
46846
46847 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46848 return DAG.getBitcast(VT, Select);
46849 }
46850 }
46851 }
46852
46853 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46854 // single bits, then invert the predicate and swap the select operands.
46855 // This can lower using a vector shift bit-hack rather than mask and compare.
46856 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46857 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46858 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46859 Cond.getOperand(0).getOpcode() == ISD::AND &&
46860 isNullOrNullSplat(Cond.getOperand(1)) &&
46861 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46862 Cond.getOperand(0).getValueType() == VT) {
46863 // The 'and' mask must be composed of power-of-2 constants.
46864 SDValue And = Cond.getOperand(0);
46865 auto *C = isConstOrConstSplat(And.getOperand(1));
46866 if (C && C->getAPIntValue().isPowerOf2()) {
46867 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46868 SDValue NotCond =
46869 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46870 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46871 }
46872
46873 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46874 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46875 // 16-bit lacks a proper blendv.
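// For example (sketch): for an i32 lane whose mask constant is 4, the shift
// amount is 32 - 1 - log2(4) = 29; (shl X, 29) < 0 then tests exactly bit 2,
// and the sign-bit result feeds a blend directly.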
46876 unsigned EltBitWidth = VT.getScalarSizeInBits();
46877 bool CanShiftBlend =
46878 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46879 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46880 (Subtarget.hasXOP()));
46881 if (CanShiftBlend &&
46882 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46883 return C->getAPIntValue().isPowerOf2();
46884 })) {
46885 // Create a left-shift constant to get the mask bits over to the sign-bit.
46886 SDValue Mask = And.getOperand(1);
46887 SmallVector<int, 32> ShlVals;
46888 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46889 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46890 ShlVals.push_back(EltBitWidth - 1 -
46891 MaskVal->getAPIntValue().exactLogBase2());
46892 }
46893 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46894 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46895 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46896 SDValue NewCond =
46897 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46898 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46899 }
46900 }
46901
46902 return SDValue();
46903}
46904
46905/// Combine:
46906/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46907/// to:
46908/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46909/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46910/// Note that this is only legal for some op/cc combinations.
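/// For example (sketch): rather than a "lock xadd" followed by a separate sign
/// test of the returned value, this lets us emit a plain "lock add" and use
/// COND_LE on its EFLAGS; the add's flag semantics (including signed overflow)
/// keep the two forms equivalent.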
46911static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46912 SelectionDAG &DAG,
46913 const X86Subtarget &Subtarget) {
46914 // This combine only operates on CMP-like nodes.
46915 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46916 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46917 return SDValue();
46918
46919 // Can't replace the cmp if it has more uses than the one we're looking at.
46920 // FIXME: We would like to be able to handle this, but would need to make sure
46921 // all uses were updated.
46922 if (!Cmp.hasOneUse())
46923 return SDValue();
46924
46925 // This only applies to variations of the common case:
46926 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46927 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46928 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46929 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46930 // Using the proper condcodes (see below), overflow is checked for.
46931
46932 // FIXME: We can generalize both constraints:
46933 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46934 // - LHS != 1
46935 // if the result is compared.
46936
46937 SDValue CmpLHS = Cmp.getOperand(0);
46938 SDValue CmpRHS = Cmp.getOperand(1);
46939 EVT CmpVT = CmpLHS.getValueType();
46940
46941 if (!CmpLHS.hasOneUse())
46942 return SDValue();
46943
46944 unsigned Opc = CmpLHS.getOpcode();
46945 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46946 return SDValue();
46947
46948 SDValue OpRHS = CmpLHS.getOperand(2);
46949 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46950 if (!OpRHSC)
46951 return SDValue();
46952
46953 APInt Addend = OpRHSC->getAPIntValue();
46954 if (Opc == ISD::ATOMIC_LOAD_SUB)
46955 Addend = -Addend;
46956
46957 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46958 if (!CmpRHSC)
46959 return SDValue();
46960
46961 APInt Comparison = CmpRHSC->getAPIntValue();
46962 APInt NegAddend = -Addend;
46963
46964 // See if we can adjust the CC to make the comparison match the negated
46965 // addend.
46966 if (Comparison != NegAddend) {
46967 APInt IncComparison = Comparison + 1;
46968 if (IncComparison == NegAddend) {
46969 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46970 Comparison = IncComparison;
46971 CC = X86::COND_AE;
46972 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46973 Comparison = IncComparison;
46974 CC = X86::COND_L;
46975 }
46976 }
46977 APInt DecComparison = Comparison - 1;
46978 if (DecComparison == NegAddend) {
46979 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46980 Comparison = DecComparison;
46981 CC = X86::COND_A;
46982 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46983 Comparison = DecComparison;
46984 CC = X86::COND_LE;
46985 }
46986 }
46987 }
46988
46989 // If the addend is the negation of the comparison value, then we can do
46990 // a full comparison by emitting the atomic arithmetic as a locked sub.
46991 if (Comparison == NegAddend) {
46992 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46993 // atomic sub.
46994 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46995 auto AtomicSub = DAG.getAtomic(
46996 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46997 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46998 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46999 AN->getMemOperand());
47000 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47001 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47002 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47003 return LockOp;
47004 }
47005
47006 // We can handle comparisons with zero in a number of cases by manipulating
47007 // the CC used.
47008 if (!Comparison.isZero())
47009 return SDValue();
47010
47011 if (CC == X86::COND_S && Addend == 1)
47012 CC = X86::COND_LE;
47013 else if (CC == X86::COND_NS && Addend == 1)
47014 CC = X86::COND_G;
47015 else if (CC == X86::COND_G && Addend == -1)
47016 CC = X86::COND_GE;
47017 else if (CC == X86::COND_LE && Addend == -1)
47018 CC = X86::COND_L;
47019 else
47020 return SDValue();
47021
47022 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47023 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47024 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47025 return LockOp;
47026}
47027
47028// Check whether a boolean test is testing a boolean value generated by
47029// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47030// code.
47031//
47032// Simplify the following patterns:
47033// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47034// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47035// to (Op EFLAGS Cond)
47036//
47037// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47038// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47039// to (Op EFLAGS !Cond)
47040//
47041// where Op could be BRCOND or CMOV.
47042//
47043static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47044 // This combine only operates on CMP-like nodes.
47045 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47046 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47047 return SDValue();
47048
47049 // Quit if not used as a boolean value.
47050 if (CC != X86::COND_E && CC != X86::COND_NE)
47051 return SDValue();
47052
47053 // Check CMP operands. One of them should be 0 or 1 and the other should be
47054 // an SetCC or extended from it.
47055 SDValue Op1 = Cmp.getOperand(0);
47056 SDValue Op2 = Cmp.getOperand(1);
47057
47058 SDValue SetCC;
47059 const ConstantSDNode* C = nullptr;
47060 bool needOppositeCond = (CC == X86::COND_E);
47061 bool checkAgainstTrue = false; // Is it a comparison against 1?
47062
47063 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47064 SetCC = Op2;
47065 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47066 SetCC = Op1;
47067 else // Quit if all operands are not constants.
47068 return SDValue();
47069
47070 if (C->getZExtValue() == 1) {
47071 needOppositeCond = !needOppositeCond;
47072 checkAgainstTrue = true;
47073 } else if (C->getZExtValue() != 0)
47074 // Quit if the constant is neither 0 or 1.
47075 return SDValue();
47076
47077 bool truncatedToBoolWithAnd = false;
47078 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47079 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47080 SetCC.getOpcode() == ISD::TRUNCATE ||
47081 SetCC.getOpcode() == ISD::AND) {
47082 if (SetCC.getOpcode() == ISD::AND) {
47083 int OpIdx = -1;
47084 if (isOneConstant(SetCC.getOperand(0)))
47085 OpIdx = 1;
47086 if (isOneConstant(SetCC.getOperand(1)))
47087 OpIdx = 0;
47088 if (OpIdx < 0)
47089 break;
47090 SetCC = SetCC.getOperand(OpIdx);
47091 truncatedToBoolWithAnd = true;
47092 } else
47093 SetCC = SetCC.getOperand(0);
47094 }
47095
47096 switch (SetCC.getOpcode()) {
47097 case X86ISD::SETCC_CARRY:
47098 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47099 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47100 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47101 // truncated to i1 using 'and'.
47102 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47103 break;
47104 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47105 "Invalid use of SETCC_CARRY!");
47106 [[fallthrough]];
47107 case X86ISD::SETCC:
47108 // Set the condition code or opposite one if necessary.
47109 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47110 if (needOppositeCond)
47111 CC = X86::GetOppositeBranchCondition(CC);
47112 return SetCC.getOperand(1);
47113 case X86ISD::CMOV: {
47114 // Check whether false/true value has canonical one, i.e. 0 or 1.
47115 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47116 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47117 // Quit if true value is not a constant.
47118 if (!TVal)
47119 return SDValue();
47120 // Quit if false value is not a constant.
47121 if (!FVal) {
47122 SDValue Op = SetCC.getOperand(0);
47123 // Skip 'zext' or 'trunc' node.
47124 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47125 Op.getOpcode() == ISD::TRUNCATE)
47126 Op = Op.getOperand(0);
47127 // A special case for rdrand/rdseed, where 0 is set if false cond is
47128 // found.
47129 if ((Op.getOpcode() != X86ISD::RDRAND &&
47130 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47131 return SDValue();
47132 }
47133 // Quit if false value is not the constant 0 or 1.
47134 bool FValIsFalse = true;
47135 if (FVal && FVal->getZExtValue() != 0) {
47136 if (FVal->getZExtValue() != 1)
47137 return SDValue();
47138 // If FVal is 1, opposite cond is needed.
47139 needOppositeCond = !needOppositeCond;
47140 FValIsFalse = false;
47141 }
47142 // Quit if TVal is not the constant opposite of FVal.
47143 if (FValIsFalse && TVal->getZExtValue() != 1)
47144 return SDValue();
47145 if (!FValIsFalse && TVal->getZExtValue() != 0)
47146 return SDValue();
47147 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47148 if (needOppositeCond)
47149 CC = X86::GetOppositeBranchCondition(CC);
47150 return SetCC.getOperand(3);
47151 }
47152 }
47153
47154 return SDValue();
47155}
47156
47157/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47158/// Match:
47159/// (X86or (X86setcc) (X86setcc))
47160/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47161static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47162 X86::CondCode &CC1, SDValue &Flags,
47163 bool &isAnd) {
47164 if (Cond->getOpcode() == X86ISD::CMP) {
47165 if (!isNullConstant(Cond->getOperand(1)))
47166 return false;
47167
47168 Cond = Cond->getOperand(0);
47169 }
47170
47171 isAnd = false;
47172
47173 SDValue SetCC0, SetCC1;
47174 switch (Cond->getOpcode()) {
47175 default: return false;
47176 case ISD::AND:
47177 case X86ISD::AND:
47178 isAnd = true;
47179 [[fallthrough]];
47180 case ISD::OR:
47181 case X86ISD::OR:
47182 SetCC0 = Cond->getOperand(0);
47183 SetCC1 = Cond->getOperand(1);
47184 break;
47185 };
47186
47187 // Make sure we have SETCC nodes, using the same flags value.
47188 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47189 SetCC1.getOpcode() != X86ISD::SETCC ||
47190 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47191 return false;
47192
47193 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47194 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47195 Flags = SetCC0->getOperand(1);
47196 return true;
47197}
47198
47199 // When legalizing carry, we create carries via add X, -1.
47200 // If that comes from an actual carry, via setcc, we use the
47201 // carry directly.
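// For example (sketch): (add (setcc COND_B, EFLAGS), -1) produces a carry-out
// exactly when the setcc produced 1, so the original EFLAGS value can be
// reused instead of re-materializing the boolean.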
47202static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47203 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47204 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47205 bool FoundAndLSB = false;
47206 SDValue Carry = EFLAGS.getOperand(0);
47207 while (Carry.getOpcode() == ISD::TRUNCATE ||
47208 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47209 (Carry.getOpcode() == ISD::AND &&
47210 isOneConstant(Carry.getOperand(1)))) {
47211 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47212 Carry = Carry.getOperand(0);
47213 }
47214 if (Carry.getOpcode() == X86ISD::SETCC ||
47215 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47216 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47217 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47218 SDValue CarryOp1 = Carry.getOperand(1);
47219 if (CarryCC == X86::COND_B)
47220 return CarryOp1;
47221 if (CarryCC == X86::COND_A) {
47222 // Try to convert COND_A into COND_B in an attempt to facilitate
47223 // materializing "setb reg".
47224 //
47225 // Do not flip "e > c", where "c" is a constant, because Cmp
47226 // instruction cannot take an immediate as its first operand.
47227 //
47228 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47229 CarryOp1.getNode()->hasOneUse() &&
47230 CarryOp1.getValueType().isInteger() &&
47231 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47232 SDValue SubCommute =
47233 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47234 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47235 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47236 }
47237 }
47238 // If this is a check of the z flag of an add with 1, switch to the
47239 // C flag.
47240 if (CarryCC == X86::COND_E &&
47241 CarryOp1.getOpcode() == X86ISD::ADD &&
47242 isOneConstant(CarryOp1.getOperand(1)))
47243 return CarryOp1;
47244 } else if (FoundAndLSB) {
47245 SDLoc DL(Carry);
47246 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47247 if (Carry.getOpcode() == ISD::SRL) {
47248 BitNo = Carry.getOperand(1);
47249 Carry = Carry.getOperand(0);
47250 }
47251 return getBT(Carry, BitNo, DL, DAG);
47252 }
47253 }
47254 }
47255
47256 return SDValue();
47257}
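
As a standalone illustrative sketch (not part of this file): the reason the combine above can reuse the original SETCC carry is that, for a boolean carry X in {0, 1}, the carry-out of add X, -1 is exactly X again.

#include <cstdint>

// Carry-out (CF) of the 64-bit addition a + b, detected via unsigned wrap.
constexpr bool carryOut(uint64_t a, uint64_t b) { return a + b < a; }

// ADD X, -1 with X in {0, 1} produces CF == X.
static_assert(carryOut(0, UINT64_MAX) == false, "X == 0 -> CF == 0");
static_assert(carryOut(1, UINT64_MAX) == true, "X == 1 -> CF == 1");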
47258
47259/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47260/// to avoid the inversion.
47261static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47262 SelectionDAG &DAG,
47263 const X86Subtarget &Subtarget) {
47264 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47265 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47266 EFLAGS.getOpcode() != X86ISD::TESTP)
47267 return SDValue();
47268
47269 // PTEST/TESTP sets EFLAGS as:
47270 // TESTZ: ZF = (Op0 & Op1) == 0
47271 // TESTC: CF = (~Op0 & Op1) == 0
47272 // TESTNZC: ZF == 0 && CF == 0
47273 EVT VT = EFLAGS.getValueType();
47274 SDValue Op0 = EFLAGS.getOperand(0);
47275 SDValue Op1 = EFLAGS.getOperand(1);
47276 EVT OpVT = Op0.getValueType();
47277
47278 // TEST*(~X,Y) == TEST*(X,Y)
47279 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47280 X86::CondCode InvCC;
47281 switch (CC) {
47282 case X86::COND_B:
47283 // testc -> testz.
47284 InvCC = X86::COND_E;
47285 break;
47286 case X86::COND_AE:
47287 // !testc -> !testz.
47288 InvCC = X86::COND_NE;
47289 break;
47290 case X86::COND_E:
47291 // testz -> testc.
47292 InvCC = X86::COND_B;
47293 break;
47294 case X86::COND_NE:
47295 // !testz -> !testc.
47296 InvCC = X86::COND_AE;
47297 break;
47298 case X86::COND_A:
47299 case X86::COND_BE:
47300 // testnzc -> testnzc (no change).
47301 InvCC = CC;
47302 break;
47303 default:
47304 InvCC = X86::COND_INVALID;
47305 break;
47306 }
47307
47308 if (InvCC != X86::COND_INVALID) {
47309 CC = InvCC;
47310 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47311 DAG.getBitcast(OpVT, NotOp0), Op1);
47312 }
47313 }
47314
47315 if (CC == X86::COND_B || CC == X86::COND_AE) {
47316 // TESTC(X,~X) == TESTC(X,-1)
47317 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47318 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47319 SDLoc DL(EFLAGS);
47320 return DAG.getNode(EFLAGS.getOpcode(), DL, VT,
47321 DAG.getBitcast(OpVT, NotOp1),
47322 DAG.getAllOnesConstant(DL, OpVT));
47323 }
47324 }
47325 }
47326
47327 if (CC == X86::COND_E || CC == X86::COND_NE) {
47328 // TESTZ(X,~Y) == TESTC(Y,X)
47329 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47330 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47331 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47332 DAG.getBitcast(OpVT, NotOp1), Op0);
47333 }
47334
47335 if (Op0 == Op1) {
47336 SDValue BC = peekThroughBitcasts(Op0);
47337 EVT BCVT = BC.getValueType();
47338
47339 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47340 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47341 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47342 DAG.getBitcast(OpVT, BC.getOperand(0)),
47343 DAG.getBitcast(OpVT, BC.getOperand(1)));
47344 }
47345
47346 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47347 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47348 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47349 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47350 DAG.getBitcast(OpVT, BC.getOperand(0)),
47351 DAG.getBitcast(OpVT, BC.getOperand(1)));
47352 }
47353
47354 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47355 // to more efficiently extract the sign bits and compare that.
47356 // TODO: Handle TESTC with comparison inversion.
47357 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47358    // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
47359 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47360 unsigned EltBits = BCVT.getScalarSizeInBits();
47361 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47362 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47363 APInt SignMask = APInt::getSignMask(EltBits);
47364 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47365 if (SDValue Res =
47366 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47367            // For vXi16 cases we need to use pmovmskb and extract every other
47368 // sign bit.
47369 SDLoc DL(EFLAGS);
47370 if (EltBits == 32 || EltBits == 64) {
47371 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47372 MVT FloatVT =
47373 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47374 Res = DAG.getBitcast(FloatVT, Res);
47375 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47376 } else if (EltBits == 16) {
47377 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47378 Res = DAG.getBitcast(MovmskVT, Res);
47379 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47380 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47381 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47382 } else {
47383 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47384 }
47385 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47386 DAG.getConstant(0, DL, MVT::i32));
47387 }
47388 }
47389 }
47390 }
47391
47392 // TESTZ(-1,X) == TESTZ(X,X)
47393 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47394 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47395
47396 // TESTZ(X,-1) == TESTZ(X,X)
47397 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47398 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47399
47400 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47401 // TODO: Add COND_NE handling?
47402 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47403 SDValue Src0 = peekThroughBitcasts(Op0);
47404 SDValue Src1 = peekThroughBitcasts(Op1);
47405 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47406 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47407 peekThroughBitcasts(Src0.getOperand(1)), true);
47408 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47409 peekThroughBitcasts(Src1.getOperand(1)), true);
47410 if (Src0 && Src1)
47411 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47412 DAG.getBitcast(MVT::v4i64, Src0),
47413 DAG.getBitcast(MVT::v4i64, Src1));
47414 }
47415 }
47416 }
47417
47418 return SDValue();
47419}
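
A minimal scalar model of the PTEST/TESTP flag semantics noted above (ZF = (Op0 & Op1) == 0, CF = (~Op0 & Op1) == 0), written as an editor's sketch to spot-check the TESTZ(X,~Y) == TESTC(Y,X) identity; the names here are ad hoc, not LLVM APIs.

#include <cstdint>

struct PTestFlags { bool ZF, CF; };

// Scalar model: treat each "vector" operand as a 64-bit bag of bits.
constexpr PTestFlags ptest(uint64_t Op0, uint64_t Op1) {
  return {(Op0 & Op1) == 0, (~Op0 & Op1) == 0};
}

// TESTZ(X, ~Y) == TESTC(Y, X): ZF of the first equals CF of the second.
constexpr bool testzNotEqualsTestc(uint64_t X, uint64_t Y) {
  return ptest(X, ~Y).ZF == ptest(Y, X).CF;
}

static_assert(testzNotEqualsTestc(0x0F, 0xF0), "");
static_assert(testzNotEqualsTestc(0xFF, 0x0F), "");
static_assert(testzNotEqualsTestc(0x00, 0x00), "");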
47420
47421// Attempt to simplify the MOVMSK input based on the comparison type.
47422static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47423 SelectionDAG &DAG,
47424 const X86Subtarget &Subtarget) {
47425 // Handle eq/ne against zero (any_of).
47426 // Handle eq/ne against -1 (all_of).
47427 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47428 return SDValue();
47429 if (EFLAGS.getValueType() != MVT::i32)
47430 return SDValue();
47431 unsigned CmpOpcode = EFLAGS.getOpcode();
47432 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47433 return SDValue();
47434 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47435 if (!CmpConstant)
47436 return SDValue();
47437 const APInt &CmpVal = CmpConstant->getAPIntValue();
47438
47439 SDValue CmpOp = EFLAGS.getOperand(0);
47440 unsigned CmpBits = CmpOp.getValueSizeInBits();
47441 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47442
47443 // Peek through any truncate.
47444 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47445 CmpOp = CmpOp.getOperand(0);
47446
47447 // Bail if we don't find a MOVMSK.
47448 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47449 return SDValue();
47450
47451 SDValue Vec = CmpOp.getOperand(0);
47452 MVT VecVT = Vec.getSimpleValueType();
47453 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47454        "Unexpected MOVMSK operand");
47455 unsigned NumElts = VecVT.getVectorNumElements();
47456 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47457
47458 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47459 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47460 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47461 if (!IsAnyOf && !IsAllOf)
47462 return SDValue();
47463
47464  // TODO: Check more combining cases.
47465  // Here we check the CMP's use count to decide whether to combine or not.
47466  // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines,
47467  // covered by two existing tests, are known to fit this constraint.
47468 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47469
47470 // See if we can peek through to a vector with a wider element type, if the
47471 // signbits extend down to all the sub-elements as well.
47472 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47473 // potential SimplifyDemandedBits/Elts cases.
47474  // If we looked through a truncate that discards bits, we can't do this
47475 // transform.
47476 // FIXME: We could do this transform for truncates that discarded bits by
47477 // inserting an AND mask between the new MOVMSK and the CMP.
47478 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47479 SDValue BC = peekThroughBitcasts(Vec);
47480 MVT BCVT = BC.getSimpleValueType();
47481 unsigned BCNumElts = BCVT.getVectorNumElements();
47482 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47483 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47484 BCNumEltBits > NumEltBits &&
47485 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47486 SDLoc DL(EFLAGS);
47487 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47488 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47489 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47490 DAG.getConstant(CmpMask, DL, MVT::i32));
47491 }
47492 }
47493
47494 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47495 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47496 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47497 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47498 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47499 SmallVector<SDValue> Ops;
47500 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47501 Ops.size() == 2) {
47502 SDLoc DL(EFLAGS);
47503 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47504 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47505 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47506 DAG.getBitcast(SubVT, Ops[0]),
47507 DAG.getBitcast(SubVT, Ops[1]));
47508 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47509 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47510 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47511 DAG.getConstant(CmpMask, DL, MVT::i32));
47512 }
47513 }
47514
47515 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47516 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47517 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47518 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
47519 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47520 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47521 SDValue BC = peekThroughBitcasts(Vec);
47522 // Ensure MOVMSK was testing every signbit of BC.
47523 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47524 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47525 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47526 BC.getOperand(0), BC.getOperand(1));
47527 V = DAG.getBitcast(TestVT, V);
47528 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47529 }
47530 // Check for 256-bit split vector cases.
47531 if (BC.getOpcode() == ISD::AND &&
47532 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47533 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47534 SDValue LHS = BC.getOperand(0);
47535 SDValue RHS = BC.getOperand(1);
47536 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47537 LHS.getOperand(0), LHS.getOperand(1));
47538 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47539 RHS.getOperand(0), RHS.getOperand(1));
47540 LHS = DAG.getBitcast(TestVT, LHS);
47541 RHS = DAG.getBitcast(TestVT, RHS);
47542 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47543 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47544 }
47545 }
47546 }
47547
47548 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47549 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47550 // sign bits prior to the comparison with zero unless we know that
47551 // the vXi16 splats the sign bit down to the lower i8 half.
47552 // TODO: Handle all_of patterns.
47553 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47554 SDValue VecOp0 = Vec.getOperand(0);
47555 SDValue VecOp1 = Vec.getOperand(1);
47556 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47557 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47558 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47559 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47560 SDLoc DL(EFLAGS);
47561 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47562 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47563 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47564 if (!SignExt0) {
47565 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47566 DAG.getConstant(0xAAAA, DL, MVT::i16));
47567 }
47568 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47569 DAG.getConstant(0, DL, MVT::i16));
47570 }
47571 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47572 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47573 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47574 (IsAnyOf || (SignExt0 && SignExt1))) {
47575 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47576 SDLoc DL(EFLAGS);
47577 SDValue Result = peekThroughBitcasts(Src);
47578 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47579 Result.getValueType().getVectorNumElements() <= NumElts) {
47580 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47581 Result.getOperand(0), Result.getOperand(1));
47582 V = DAG.getBitcast(MVT::v4i64, V);
47583 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47584 }
47585 Result = DAG.getBitcast(MVT::v32i8, Result);
47586 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47587 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47588 if (!SignExt0 || !SignExt1) {
47589 assert(IsAnyOf &&
47590        "Only perform v16i16 signmasks for any_of patterns");
47591 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47592 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47593 }
47594 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47595 DAG.getConstant(CmpMask, DL, MVT::i32));
47596 }
47597 }
47598 }
47599
47600 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47601 SmallVector<int, 32> ShuffleMask;
47602 SmallVector<SDValue, 2> ShuffleInputs;
47603 if (NumElts <= CmpBits &&
47604 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47605 ShuffleMask, DAG) &&
47606 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47607 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47608 unsigned NumShuffleElts = ShuffleMask.size();
47609 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47610 for (int M : ShuffleMask) {
47611 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47612 DemandedElts.setBit(M);
47613 }
47614 if (DemandedElts.isAllOnes()) {
47615 SDLoc DL(EFLAGS);
47616 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47617 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47618 Result =
47619 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47620 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47621 EFLAGS.getOperand(1));
47622 }
47623 }
47624
47625 return SDValue();
47626}
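
The CONCAT folds above can be sanity-checked with a scalar model in which each MOVMSK result is just a bitmask of per-element sign bits; this is an illustrative sketch with made-up helper names, not LLVM code.

#include <cstdint>

// movmsk(concat(X, Y)) == (maskY << N) | maskX, for N elements per half.
constexpr unsigned concatMask(unsigned maskX, unsigned maskY, unsigned N) {
  return (maskY << N) | maskX;
}

// any_of: MOVMSK(CONCAT(X,Y)) == 0  <=>  MOVMSK(OR(X,Y)) == 0.
constexpr bool anyOfEquiv(unsigned mx, unsigned my, unsigned N) {
  return (concatMask(mx, my, N) == 0) == ((mx | my) == 0);
}

// all_of: MOVMSK(CONCAT(X,Y)) is all ones  <=>  MOVMSK(AND(X,Y)) is all ones.
constexpr bool allOfEquiv(unsigned mx, unsigned my, unsigned N) {
  unsigned full = (1u << N) - 1;
  return (concatMask(mx, my, N) == ((full << N) | full)) == ((mx & my) == full);
}

static_assert(anyOfEquiv(0b0000, 0b0100, 4) && allOfEquiv(0b1111, 0b1011, 4), "");
static_assert(anyOfEquiv(0b0000, 0b0000, 4) && allOfEquiv(0b1111, 0b1111, 4), "");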
47627
47628/// Optimize an EFLAGS definition used according to the condition code \p CC
47629/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47630/// uses of chain values.
47631static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47632 SelectionDAG &DAG,
47633 const X86Subtarget &Subtarget) {
47634 if (CC == X86::COND_B)
47635 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47636 return Flags;
47637
47638 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47639 return R;
47640
47641 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47642 return R;
47643
47644 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47645 return R;
47646
47647 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47648}
47649
47650/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47651static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47652 TargetLowering::DAGCombinerInfo &DCI,
47653 const X86Subtarget &Subtarget) {
47654 SDLoc DL(N);
47655
47656 SDValue FalseOp = N->getOperand(0);
47657 SDValue TrueOp = N->getOperand(1);
47658 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47659 SDValue Cond = N->getOperand(3);
47660
47661 // cmov X, X, ?, ? --> X
47662 if (TrueOp == FalseOp)
47663 return TrueOp;
47664
47665 // Try to simplify the EFLAGS and condition code operands.
47666 // We can't always do this as FCMOV only supports a subset of X86 cond.
47667 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47668 if (!(FalseOp.getValueType() == MVT::f80 ||
47669 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47670 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47671 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47672 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47673 Flags};
47674 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47675 }
47676 }
47677
47678 // If this is a select between two integer constants, try to do some
47679 // optimizations. Note that the operands are ordered the opposite of SELECT
47680 // operands.
47681 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47682 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47683 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47684 // larger than FalseC (the false value).
47685 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47686 CC = X86::GetOppositeBranchCondition(CC);
47687 std::swap(TrueC, FalseC);
47688 std::swap(TrueOp, FalseOp);
47689 }
47690
47691 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47692 // This is efficient for any integer data type (including i8/i16) and
47693 // shift amount.
47694 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47695 Cond = getSETCC(CC, Cond, DL, DAG);
47696
47697 // Zero extend the condition if needed.
47698 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47699
47700 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47701 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47702 DAG.getConstant(ShAmt, DL, MVT::i8));
47703 return Cond;
47704 }
47705
47706      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
47707 // for any integer data type, including i8/i16.
47708 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47709 Cond = getSETCC(CC, Cond, DL, DAG);
47710
47711 // Zero extend the condition if needed.
47712 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47713 FalseC->getValueType(0), Cond);
47714 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47715 SDValue(FalseC, 0));
47716 return Cond;
47717 }
47718
47719 // Optimize cases that will turn into an LEA instruction. This requires
47720 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47721 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47722 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47723 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47724        "Implicit constant truncation");
47725
47726 bool isFastMultiplier = false;
47727 if (Diff.ult(10)) {
47728 switch (Diff.getZExtValue()) {
47729 default: break;
47730 case 1: // result = add base, cond
47731 case 2: // result = lea base( , cond*2)
47732 case 3: // result = lea base(cond, cond*2)
47733 case 4: // result = lea base( , cond*4)
47734 case 5: // result = lea base(cond, cond*4)
47735 case 8: // result = lea base( , cond*8)
47736 case 9: // result = lea base(cond, cond*8)
47737 isFastMultiplier = true;
47738 break;
47739 }
47740 }
47741
47742 if (isFastMultiplier) {
47743 Cond = getSETCC(CC, Cond, DL ,DAG);
47744 // Zero extend the condition if needed.
47745 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47746 Cond);
47747 // Scale the condition by the difference.
47748 if (Diff != 1)
47749 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47750 DAG.getConstant(Diff, DL, Cond.getValueType()));
47751
47752 // Add the base if non-zero.
47753 if (FalseC->getAPIntValue() != 0)
47754 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47755 SDValue(FalseC, 0));
47756 return Cond;
47757 }
47758 }
47759 }
47760 }
47761
47762 // Handle these cases:
47763  // (select (x != c), e, c) -> (select (x != c), e, x),
47764  // (select (x == c), c, e) -> (select (x == c), x, e)
47765  // where c is an integer constant, and the "select" is the combination
47766 // of CMOV and CMP.
47767 //
47768 // The rationale for this change is that the conditional-move from a constant
47769 // needs two instructions, however, conditional-move from a register needs
47770 // only one instruction.
47771 //
47772 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47773 // some instruction-combining opportunities. This opt needs to be
47774 // postponed as late as possible.
47775 //
47776 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47777 // the DCI.xxxx conditions are provided to postpone the optimization as
47778 // late as possible.
47779
47780 ConstantSDNode *CmpAgainst = nullptr;
47781 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47782 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47783 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47784
47785 if (CC == X86::COND_NE &&
47786 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47787 CC = X86::GetOppositeBranchCondition(CC);
47788 std::swap(TrueOp, FalseOp);
47789 }
47790
47791 if (CC == X86::COND_E &&
47792 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47793 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47794 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47795 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47796 }
47797 }
47798 }
47799
47800 // Transform:
47801 //
47802 // (cmov 1 T (uge T 2))
47803 //
47804 // to:
47805 //
47806 // (adc T 0 (sub T 1))
47807 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
47808 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47809 SDValue Cond0 = Cond.getOperand(0);
47810 if (Cond0.getOpcode() == ISD::TRUNCATE)
47811 Cond0 = Cond0.getOperand(0);
47812 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
47813 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47814 EVT CondVT = Cond->getValueType(0);
47815 EVT OuterVT = N->getValueType(0);
47816 // Subtract 1 and generate a carry.
47817 SDValue NewSub =
47818 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47819 DAG.getConstant(1, DL, CondVT));
47820 SDValue EFLAGS(NewSub.getNode(), 1);
47821 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
47822 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
47823 }
47824 }
47825
47826 // Fold and/or of setcc's to double CMOV:
47827 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47828 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47829 //
47830 // This combine lets us generate:
47831 // cmovcc1 (jcc1 if we don't have CMOV)
47832 // cmovcc2 (same)
47833 // instead of:
47834 // setcc1
47835 // setcc2
47836 // and/or
47837 // cmovne (jne if we don't have CMOV)
47838 // When we can't use the CMOV instruction, it might increase branch
47839 // mispredicts.
47840 // When we can use CMOV, or when there is no mispredict, this improves
47841 // throughput and reduces register pressure.
47842 //
47843 if (CC == X86::COND_NE) {
47844 SDValue Flags;
47845 X86::CondCode CC0, CC1;
47846 bool isAndSetCC;
47847 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47848 if (isAndSetCC) {
47849 std::swap(FalseOp, TrueOp);
47850 CC0 = X86::GetOppositeBranchCondition(CC0);
47851 CC1 = X86::GetOppositeBranchCondition(CC1);
47852 }
47853
47854 SDValue LOps[] = {FalseOp, TrueOp,
47855 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47856 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47857 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47858 Flags};
47859 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47860 return CMOV;
47861 }
47862 }
47863
47864 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47865 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47866 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47867 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47868 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47869 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47870 SDValue Add = TrueOp;
47871 SDValue Const = FalseOp;
47872 // Canonicalize the condition code for easier matching and output.
47873 if (CC == X86::COND_E)
47874 std::swap(Add, Const);
47875
47876 // We might have replaced the constant in the cmov with the LHS of the
47877 // compare. If so change it to the RHS of the compare.
47878 if (Const == Cond.getOperand(0))
47879 Const = Cond.getOperand(1);
47880
47881 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47882 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47883 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47884 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47885 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47886 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47887 EVT VT = N->getValueType(0);
47888 // This should constant fold.
47889 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47890 SDValue CMov =
47891 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47892 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47893 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47894 }
47895 }
47896
47897 return SDValue();
47898}
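
The (cmov 1, T, (uge T 2)) -> (adc T, 0, (sub T, 1)) transform handled above rests on a small unsigned identity; a standalone sketch (not LLVM code) that checks it:

#include <cstdint>

// Original form: select T when T >= 2 (unsigned), otherwise 1.
constexpr uint32_t cmovForm(uint32_t T) { return T >= 2 ? T : 1; }

// ADC form: SUB T, 1 sets CF exactly when T == 0, and ADC T, 0 adds that carry.
constexpr uint32_t adcForm(uint32_t T) {
  bool CF = T < 1; // borrow out of T - 1, i.e. T == 0
  return T + (CF ? 1 : 0);
}

static_assert(cmovForm(0) == adcForm(0), "");
static_assert(cmovForm(1) == adcForm(1), "");
static_assert(cmovForm(2) == adcForm(2), "");
static_assert(cmovForm(123) == adcForm(123), "");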
47899
47900/// Different mul shrinking modes.
47901enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47902
47903static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47904 EVT VT = N->getOperand(0).getValueType();
47905 if (VT.getScalarSizeInBits() != 32)
47906 return false;
47907
47908 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47909 unsigned SignBits[2] = {1, 1};
47910 bool IsPositive[2] = {false, false};
47911 for (unsigned i = 0; i < 2; i++) {
47912 SDValue Opd = N->getOperand(i);
47913
47914 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47915 IsPositive[i] = DAG.SignBitIsZero(Opd);
47916 }
47917
47918 bool AllPositive = IsPositive[0] && IsPositive[1];
47919 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47920 // When ranges are from -128 ~ 127, use MULS8 mode.
47921 if (MinSignBits >= 25)
47922 Mode = ShrinkMode::MULS8;
47923 // When ranges are from 0 ~ 255, use MULU8 mode.
47924 else if (AllPositive && MinSignBits >= 24)
47925 Mode = ShrinkMode::MULU8;
47926 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47927 else if (MinSignBits >= 17)
47928 Mode = ShrinkMode::MULS16;
47929 // When ranges are from 0 ~ 65535, use MULU16 mode.
47930 else if (AllPositive && MinSignBits >= 16)
47931 Mode = ShrinkMode::MULU16;
47932 else
47933 return false;
47934 return true;
47935}
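
The MinSignBits thresholds used above (25, 24, 17, 16) correspond to how many redundant sign/zero bits a 32-bit lane carries when its value fits in i8, u8, i16 or u16. A standalone sketch with a simple sign-bit counter (an assumed stand-in for ComputeNumSignBits, not the DAG API):

#include <cstdint>

// Number of leading bits equal to the sign bit, including the sign bit itself.
constexpr unsigned numSignBits(int32_t v) {
  uint32_t x = static_cast<uint32_t>(v);
  uint32_t sign = x >> 31;
  unsigned n = 0;
  for (int bit = 31; bit >= 0 && ((x >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

// i8 range [-128, 127] leaves at least 25 sign bits (MULS8).
static_assert(numSignBits(127) >= 25 && numSignBits(-128) >= 25, "");
// u8 range [0, 255] leaves at least 24, with the sign bit known zero (MULU8).
static_assert(numSignBits(255) >= 24, "");
// i16 range [-32768, 32767] leaves at least 17 (MULS16).
static_assert(numSignBits(32767) >= 17 && numSignBits(-32768) >= 17, "");
// u16 range [0, 65535] leaves at least 16 (MULU16).
static_assert(numSignBits(65535) >= 16, "");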
47936
47937/// When the operands of vector mul are extended from smaller size values,
47938/// like i8 and i16, the type of mul may be shrunk to generate more
47939/// efficient code. Two typical patterns are handled:
47940/// Pattern1:
47941/// %2 = sext/zext <N x i8> %1 to <N x i32>
47942/// %4 = sext/zext <N x i8> %3 to <N x i32>
47943/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47944/// %5 = mul <N x i32> %2, %4
47945///
47946/// Pattern2:
47947/// %2 = zext/sext <N x i16> %1 to <N x i32>
47948/// %4 = zext/sext <N x i16> %3 to <N x i32>
47949/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47950/// %5 = mul <N x i32> %2, %4
47951///
47952/// There are four mul shrinking modes:
47953/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47954/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47955/// generate pmullw+sext32 for it (MULS8 mode).
47956/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47957/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47958/// generate pmullw+zext32 for it (MULU8 mode).
47959/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47960/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47961/// generate pmullw+pmulhw for it (MULS16 mode).
47962/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47963/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47964/// generate pmullw+pmulhuw for it (MULU16 mode).
47965static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
47966 const X86Subtarget &Subtarget) {
47967 // Check for legality
47968  // pmullw/pmulhw are not available without SSE2.
47969 if (!Subtarget.hasSSE2())
47970 return SDValue();
47971
47972 // Check for profitability
47973 // pmulld is supported since SSE41. It is better to use pmulld
47974 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47975 // the expansion.
47976 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47977 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47978 return SDValue();
47979
47980 ShrinkMode Mode;
47981 if (!canReduceVMulWidth(N, DAG, Mode))
47982 return SDValue();
47983
47984 SDLoc DL(N);
47985 SDValue N0 = N->getOperand(0);
47986 SDValue N1 = N->getOperand(1);
47987 EVT VT = N->getOperand(0).getValueType();
47988 unsigned NumElts = VT.getVectorNumElements();
47989 if ((NumElts % 2) != 0)
47990 return SDValue();
47991
47992 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47993
47994 // Shrink the operands of mul.
47995 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47996 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47997
47998 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47999 // lower part is needed.
48000 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48001 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48002 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48003 : ISD::SIGN_EXTEND,
48004 DL, VT, MulLo);
48005
48006 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48007 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48008 // the higher part is also needed.
48009 SDValue MulHi =
48010 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48011 ReducedVT, NewN0, NewN1);
48012
48013 // Repack the lower part and higher part result of mul into a wider
48014 // result.
48015 // Generate shuffle functioning as punpcklwd.
48016 SmallVector<int, 16> ShuffleMask(NumElts);
48017 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48018 ShuffleMask[2 * i] = i;
48019 ShuffleMask[2 * i + 1] = i + NumElts;
48020 }
48021 SDValue ResLo =
48022 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48023 ResLo = DAG.getBitcast(ResVT, ResLo);
48024 // Generate shuffle functioning as punpckhwd.
48025 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48026 ShuffleMask[2 * i] = i + NumElts / 2;
48027 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48028 }
48029 SDValue ResHi =
48030 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48031 ResHi = DAG.getBitcast(ResVT, ResHi);
48032 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48033}
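
Per lane, the repack above rebuilds each 32-bit product from the 16-bit halves produced by pmullw and pmulhw/pmulhuw; an illustrative scalar sketch of that identity for the unsigned (MULU16) case:

#include <cstdint>

// 32-bit product rebuilt from 16-bit low/high halves (MULU16 flavour).
constexpr uint32_t mulViaHalvesU16(uint16_t a, uint16_t b) {
  uint32_t full = static_cast<uint32_t>(a) * b;
  uint16_t lo = static_cast<uint16_t>(full);        // pmullw lane
  uint16_t hi = static_cast<uint16_t>(full >> 16);  // pmulhuw lane
  return (static_cast<uint32_t>(hi) << 16) | lo;    // punpck{l,h}wd interleave
}

static_assert(mulViaHalvesU16(0xFFFF, 0xFFFF) == 0xFFFFu * 0xFFFFu, "");
static_assert(mulViaHalvesU16(1234, 5678) == 1234u * 5678u, "");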
48034
48035static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48036 EVT VT, const SDLoc &DL) {
48037
48038 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48039 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48040 DAG.getConstant(Mult, DL, VT));
48041 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48042 DAG.getConstant(Shift, DL, MVT::i8));
48043 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48044 N->getOperand(0));
48045 return Result;
48046 };
48047
48048 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48049 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48050 DAG.getConstant(Mul1, DL, VT));
48051 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48052 DAG.getConstant(Mul2, DL, VT));
48053 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48054 N->getOperand(0));
48055 return Result;
48056 };
48057
48058 switch (MulAmt) {
48059 default:
48060 break;
48061 case 11:
48062 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48063 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48064 case 21:
48065 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48066 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48067 case 41:
48068 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48069 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48070 case 22:
48071 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48072 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48073 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48074 case 19:
48075 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48076 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48077 case 37:
48078 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48079 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48080 case 73:
48081 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48082 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48083 case 13:
48084 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48085 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48086 case 23:
48087 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48088 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48089 case 26:
48090 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48091 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48092 case 28:
48093 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48094 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48095 case 29:
48096 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48097 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48098 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48099 }
48100
48101  // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
48102  // by a single LEA.
48103  // First check if this is a sum of two powers of 2 because that's easy. Then
48104  // count how many trailing zeros there are up to the first set bit.
48105 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48106 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48107 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48108 if (ScaleShift >= 1 && ScaleShift < 4) {
48109 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48110 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48111 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48112 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48113 DAG.getConstant(ScaleShift, DL, MVT::i8));
48114 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48115 }
48116 }
48117
48118 return SDValue();
48119}
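
The decompositions chosen in combineMulSpecial above are ordinary arithmetic identities; a standalone sketch (not LLVM code) that spot-checks a few of them:

#include <cstdint>

constexpr bool checkMul11(uint64_t x) { return ((x * 5) << 1) + x == x * 11; }
constexpr bool checkMul21(uint64_t x) { return ((x * 5) << 2) + x == x * 21; }
constexpr bool checkMul23(uint64_t x) { return ((x * 3) << 3) - x == x * 23; }
constexpr bool checkMul26(uint64_t x) { return ((x * 5) * 5) + x == x * 26; }
constexpr bool checkMul29(uint64_t x) { return ((x * 9) * 3) + x + x == x * 29; }

static_assert(checkMul11(7) && checkMul21(7) && checkMul23(7) &&
              checkMul26(7) && checkMul29(7), "");
static_assert(checkMul11(0xDEADBEEF) && checkMul23(0xDEADBEEF), "");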
48120
48121// If the upper 17 bits of either element are zero and the upper bits of the
48122// other element are all zero/sign bits, then we can use PMADDWD, which is
48123// always at least as quick as PMULLD, except on KNL.
48124static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48125 const X86Subtarget &Subtarget) {
48126 if (!Subtarget.hasSSE2())
48127 return SDValue();
48128
48129 if (Subtarget.isPMADDWDSlow())
48130 return SDValue();
48131
48132 EVT VT = N->getValueType(0);
48133
48134 // Only support vXi32 vectors.
48135 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48136 return SDValue();
48137
48138 // Make sure the type is legal or can split/widen to a legal type.
48139 // With AVX512 but without BWI, we would need to split v32i16.
48140 unsigned NumElts = VT.getVectorNumElements();
48141 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48142 return SDValue();
48143
48144 // With AVX512 but without BWI, we would need to split v32i16.
48145 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48146 return SDValue();
48147
48148 SDValue N0 = N->getOperand(0);
48149 SDValue N1 = N->getOperand(1);
48150
48151  // If we are zero/sign extending in two steps without SSE4.1, it's better to
48152 // reduce the vmul width instead.
48153 if (!Subtarget.hasSSE41() &&
48154 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48155 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48156 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48157 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48158 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48159 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48160 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48161 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48162 return SDValue();
48163
48164  // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48165 // the vmul width instead.
48166 if (!Subtarget.hasSSE41() &&
48167 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48168 N0.getOperand(0).getValueSizeInBits() > 128) &&
48169 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48170 N1.getOperand(0).getValueSizeInBits() > 128))
48171 return SDValue();
48172
48173 // Sign bits must extend down to the lowest i16.
48174 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48175 DAG.ComputeMaxSignificantBits(N0) > 16)
48176 return SDValue();
48177
48178 // At least one of the elements must be zero in the upper 17 bits, or can be
48179 // safely made zero without altering the final result.
48180 auto GetZeroableOp = [&](SDValue Op) {
48181 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48182 if (DAG.MaskedValueIsZero(Op, Mask17))
48183 return Op;
48184 // Mask off upper 16-bits of sign-extended constants.
48185 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48186 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48187 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48188 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48189 SDValue Src = Op.getOperand(0);
48190 // Convert sext(vXi16) to zext(vXi16).
48191 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48192 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48193 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48194 // which will expand the extension.
48195 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48196 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48197 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48198 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48199 }
48200 }
48201    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
48202 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48203 N->isOnlyUserOf(Op.getNode())) {
48204 SDValue Src = Op.getOperand(0);
48205 if (Src.getScalarValueSizeInBits() == 16)
48206 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48207 }
48208 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48209 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48210 N->isOnlyUserOf(Op.getNode())) {
48211 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48212 Op.getOperand(1));
48213 }
48214 return SDValue();
48215 };
48216 SDValue ZeroN0 = GetZeroableOp(N0);
48217 SDValue ZeroN1 = GetZeroableOp(N1);
48218 if (!ZeroN0 && !ZeroN1)
48219 return SDValue();
48220 N0 = ZeroN0 ? ZeroN0 : N0;
48221 N1 = ZeroN1 ? ZeroN1 : N1;
48222
48223 // Use SplitOpsAndApply to handle AVX splitting.
48224 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48225 ArrayRef<SDValue> Ops) {
48226 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48227 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48228 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48229 DAG.getBitcast(OpVT, Ops[0]),
48230 DAG.getBitcast(OpVT, Ops[1]));
48231 };
48232 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48233 PMADDWDBuilder);
48234}
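
A scalar sketch of why the upper-17-bit requirement above makes PMADDWD usable: when everything above bit 15 of each 32-bit lane is zero, the per-lane dot product collapses to the single low-half multiply. The pmaddwdLane helper is an illustrative model, not an LLVM API.

#include <cstdint>

// One PMADDWD lane: treat a 32-bit lane as two signed i16 halves and
// accumulate the two products.
constexpr int32_t pmaddwdLane(uint32_t a, uint32_t b) {
  int32_t a0 = static_cast<int16_t>(a), a1 = static_cast<int16_t>(a >> 16);
  int32_t b0 = static_cast<int16_t>(b), b1 = static_cast<int16_t>(b >> 16);
  return a0 * b0 + a1 * b1;
}

// If the upper 17 bits of both lanes are zero (values in [0, 0x7FFF]),
// the PMADDWD lane equals the ordinary 32-bit multiply.
constexpr bool matchesMul(uint32_t a, uint32_t b) {
  return pmaddwdLane(a, b) == static_cast<int32_t>(a * b);
}

static_assert(matchesMul(0x7FFF, 0x7FFF), "");
static_assert(matchesMul(123, 4567), "");
static_assert(matchesMul(0, 0x7FFF), "");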
48235
48236static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48237 const X86Subtarget &Subtarget) {
48238 if (!Subtarget.hasSSE2())
48239 return SDValue();
48240
48241 EVT VT = N->getValueType(0);
48242
48243 // Only support vXi64 vectors.
48244 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48245 VT.getVectorNumElements() < 2 ||
48246 !isPowerOf2_32(VT.getVectorNumElements()))
48247 return SDValue();
48248
48249 SDValue N0 = N->getOperand(0);
48250 SDValue N1 = N->getOperand(1);
48251
48252  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48253 // 32-bits. We can lower with this if the sign bits stretch that far.
48254 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48255 DAG.ComputeNumSignBits(N1) > 32) {
48256 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48257 ArrayRef<SDValue> Ops) {
48258 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48259 };
48260 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48261 PMULDQBuilder, /*CheckBWI*/false);
48262 }
48263
48264 // If the upper bits are zero we can use a single pmuludq.
48265 APInt Mask = APInt::getHighBitsSet(64, 32);
48266 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48267 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48268 ArrayRef<SDValue> Ops) {
48269 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48270 };
48271 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48272 PMULUDQBuilder, /*CheckBWI*/false);
48273 }
48274
48275 return SDValue();
48276}
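
An illustrative scalar model of the PMULUDQ case above: when the high 32 bits of both i64 lanes are known zero, the full 64-bit product equals the unsigned 32x32->64 multiply of the low halves. Sketch only; the helper name is ad hoc.

#include <cstdint>

// Model of one PMULUDQ lane: unsigned multiply of the low 32 bits of each
// operand, producing a 64-bit result.
constexpr uint64_t pmuludqLane(uint64_t a, uint64_t b) {
  return static_cast<uint64_t>(static_cast<uint32_t>(a)) *
         static_cast<uint32_t>(b);
}

// Precondition of the combine: the upper 32 bits of both operands are zero.
constexpr bool matchesFullMul(uint64_t a, uint64_t b) {
  return pmuludqLane(a, b) == a * b;
}

static_assert(matchesFullMul(0xFFFFFFFFull, 0xFFFFFFFFull), "");
static_assert(matchesFullMul(123456789ull, 987654321ull), "");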
48277
48278static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48279 TargetLowering::DAGCombinerInfo &DCI,
48280 const X86Subtarget &Subtarget) {
48281 EVT VT = N->getValueType(0);
48282
48283 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48284 return V;
48285
48286 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48287 return V;
48288
48289 if (DCI.isBeforeLegalize() && VT.isVector())
48290 return reduceVMULWidth(N, DAG, Subtarget);
48291
48292 // Optimize a single multiply with constant into two operations in order to
48293 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
48294 if (!MulConstantOptimization)
48295 return SDValue();
48296
48297 // An imul is usually smaller than the alternative sequence.
48298 if (DAG.getMachineFunction().getFunction().hasMinSize())
48299 return SDValue();
48300
48301 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48302 return SDValue();
48303
48304 if (VT != MVT::i64 && VT != MVT::i32)
48305 return SDValue();
48306
48307 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48308 if (!C)
48309 return SDValue();
48310 if (isPowerOf2_64(C->getZExtValue()))
48311 return SDValue();
48312
48313 int64_t SignMulAmt = C->getSExtValue();
48314 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48315 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48316
48317 SDLoc DL(N);
48318 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48319 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48320 DAG.getConstant(AbsMulAmt, DL, VT));
48321 if (SignMulAmt < 0)
48322 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48323 NewMul);
48324
48325 return NewMul;
48326 }
48327
48328 uint64_t MulAmt1 = 0;
48329 uint64_t MulAmt2 = 0;
48330 if ((AbsMulAmt % 9) == 0) {
48331 MulAmt1 = 9;
48332 MulAmt2 = AbsMulAmt / 9;
48333 } else if ((AbsMulAmt % 5) == 0) {
48334 MulAmt1 = 5;
48335 MulAmt2 = AbsMulAmt / 5;
48336 } else if ((AbsMulAmt % 3) == 0) {
48337 MulAmt1 = 3;
48338 MulAmt2 = AbsMulAmt / 3;
48339 }
48340
48341 SDValue NewMul;
48342 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48343 if (MulAmt2 &&
48344 (isPowerOf2_64(MulAmt2) ||
48345 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48346
48347 if (isPowerOf2_64(MulAmt2) &&
48348 !(SignMulAmt >= 0 && N->hasOneUse() &&
48349 N->use_begin()->getOpcode() == ISD::ADD))
48350      // If the second multiplier is a power of 2, issue it first. We want the multiply by
48351 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
48352 // is an add. Only do this for positive multiply amounts since the
48353 // negate would prevent it from being used as an address mode anyway.
48354 std::swap(MulAmt1, MulAmt2);
48355
48356 if (isPowerOf2_64(MulAmt1))
48357 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48358 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48359 else
48360 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48361 DAG.getConstant(MulAmt1, DL, VT));
48362
48363 if (isPowerOf2_64(MulAmt2))
48364 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48365 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48366 else
48367 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48368 DAG.getConstant(MulAmt2, DL, VT));
48369
48370 // Negate the result.
48371 if (SignMulAmt < 0)
48372 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48373 NewMul);
48374 } else if (!Subtarget.slowLEA())
48375 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48376
48377 if (!NewMul) {
48378 assert(C->getZExtValue() != 0 &&
48379        C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48380        "Both cases that could cause potential overflows should have "
48381        "already been handled.");
48382 if (isPowerOf2_64(AbsMulAmt - 1)) {
48383 // (mul x, 2^N + 1) => (add (shl x, N), x)
48384 NewMul = DAG.getNode(
48385 ISD::ADD, DL, VT, N->getOperand(0),
48386 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48387 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48388 MVT::i8)));
48389 // To negate, subtract the number from zero
48390 if (SignMulAmt < 0)
48391 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48392 DAG.getConstant(0, DL, VT), NewMul);
48393 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48394 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48395 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48396 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48397 DL, MVT::i8));
48398 // To negate, reverse the operands of the subtract.
48399 if (SignMulAmt < 0)
48400 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48401 else
48402 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48403 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48404 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48405 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48406 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48407 DL, MVT::i8));
48408 NewMul = DAG.getNode(
48409 ISD::ADD, DL, VT, NewMul,
48410 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48411 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48412 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48413 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48414 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48415 DL, MVT::i8));
48416 NewMul = DAG.getNode(
48417 ISD::SUB, DL, VT, NewMul,
48418 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48419 }
48420 }
48421
48422 return NewMul;
48423}
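
The shift-and-add/sub forms used above for 2^N +/- 1 multipliers, and the negation via subtraction from zero, are plain identities modulo 2^64; a spot-check sketch (not LLVM code):

#include <cstdint>

// (mul x, 2^N + 1) => (add (shl x, N), x)
constexpr bool checkPow2Plus1(uint64_t x, unsigned N) {
  return (x << N) + x == x * ((1ull << N) + 1);
}
// (mul x, 2^N - 1) => (sub (shl x, N), x)
constexpr bool checkPow2Minus1(uint64_t x, unsigned N) {
  return (x << N) - x == x * ((1ull << N) - 1);
}
// Negative amounts: negate the result by subtracting it from zero.
constexpr bool checkNegate(uint64_t x, uint64_t amt) {
  return 0 - x * amt == x * (0 - amt);
}

static_assert(checkPow2Plus1(0xABCDull, 5) && checkPow2Minus1(0xABCDull, 5), "");
static_assert(checkNegate(0xABCDull, 33), "");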
48424
48425// Try to form a MULHU or MULHS node by looking for
48426// (srl (mul ext, ext), 16)
48427// TODO: This is X86 specific because we want to be able to handle wide types
48428// before type legalization. But we can only do it if the vector will be
48429// legalized via widening/splitting. Type legalization can't handle promotion
48430// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48431// combiner.
48432static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48433 const X86Subtarget &Subtarget) {
48434 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48435        "SRL or SRA node is required here!");
48436 SDLoc DL(N);
48437
48438 if (!Subtarget.hasSSE2())
48439 return SDValue();
48440
48441 // The operation feeding into the shift must be a multiply.
48442 SDValue ShiftOperand = N->getOperand(0);
48443 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48444 return SDValue();
48445
48446 // Input type should be at least vXi32.
48447 EVT VT = N->getValueType(0);
48448 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48449 return SDValue();
48450
48451 // Need a shift by 16.
48452 APInt ShiftAmt;
48453 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48454 ShiftAmt != 16)
48455 return SDValue();
48456
48457 SDValue LHS = ShiftOperand.getOperand(0);
48458 SDValue RHS = ShiftOperand.getOperand(1);
48459
48460 unsigned ExtOpc = LHS.getOpcode();
48461 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48462 RHS.getOpcode() != ExtOpc)
48463 return SDValue();
48464
48465 // Peek through the extends.
48466 LHS = LHS.getOperand(0);
48467 RHS = RHS.getOperand(0);
48468
48469 // Ensure the input types match.
48470 EVT MulVT = LHS.getValueType();
48471 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48472 return SDValue();
48473
48474 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48475 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48476
48477 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48478 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48479}
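
A scalar sketch of the pattern matched above: shifting the widened product right by 16 recovers the high half of the 16-bit multiply, i.e. MULHU for zero-extended inputs and MULHS for sign-extended inputs. The signed case assumes an arithmetic right shift, which the compilers and targets involved provide.

#include <cstdint>

// Model of MULHU on i16: the high 16 bits of the 32-bit unsigned product.
constexpr uint16_t mulhu16(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
}
// Model of MULHS on i16: the high 16 bits of the 32-bit signed product.
// Assumes arithmetic right shift for signed values.
constexpr int16_t mulhs16(int16_t a, int16_t b) {
  return static_cast<int16_t>((static_cast<int32_t>(a) * b) >> 16);
}

// (srl (mul (zext a), (zext b)), 16) == zext(mulhu16(a, b))
constexpr bool checkMulhu(uint16_t a, uint16_t b) {
  return ((static_cast<uint32_t>(a) * b) >> 16) == mulhu16(a, b);
}
// (sra (mul (sext a), (sext b)), 16) == sext(mulhs16(a, b))
constexpr bool checkMulhs(int16_t a, int16_t b) {
  return ((static_cast<int32_t>(a) * b) >> 16) == mulhs16(a, b);
}

static_assert(checkMulhu(0xFFFF, 0xFFFF) && checkMulhu(1234, 5678), "");
static_assert(checkMulhs(-32768, 32767) && checkMulhs(-5, 7), "");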
48480
48481static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48482 SDValue N0 = N->getOperand(0);
48483 SDValue N1 = N->getOperand(1);
48484 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48485 EVT VT = N0.getValueType();
48486
48487 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48488 // since the result of setcc_c is all zero's or all ones.
48489 if (VT.isInteger() && !VT.isVector() &&
48490 N1C && N0.getOpcode() == ISD::AND &&
48491 N0.getOperand(1).getOpcode() == ISD::Constant) {
48492 SDValue N00 = N0.getOperand(0);
48493 APInt Mask = N0.getConstantOperandAPInt(1);
48494 Mask <<= N1C->getAPIntValue();
48495 bool MaskOK = false;
48496 // We can handle cases concerning bit-widening nodes containing setcc_c if
48497 // we carefully interrogate the mask to make sure we are semantics
48498 // preserving.
48499 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48500 // of the underlying setcc_c operation if the setcc_c was zero extended.
48501 // Consider the following example:
48502 // zext(setcc_c) -> i32 0x0000FFFF
48503 // c1 -> i32 0x0000FFFF
48504 // c2 -> i32 0x00000001
48505 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48506 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48507 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48508 MaskOK = true;
48509 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48510 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48511 MaskOK = true;
48512 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48513 N00.getOpcode() == ISD::ANY_EXTEND) &&
48514 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48515 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48516 }
48517 if (MaskOK && Mask != 0) {
48518 SDLoc DL(N);
48519 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48520 }
48521 }
48522
48523 return SDValue();
48524}
48525
48526static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48527 const X86Subtarget &Subtarget) {
48528 SDValue N0 = N->getOperand(0);
48529 SDValue N1 = N->getOperand(1);
48530 EVT VT = N0.getValueType();
48531 unsigned Size = VT.getSizeInBits();
48532
48533 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48534 return V;
48535
48536 // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
48537 // into (shl (sext_in_reg a), [56,48,32,24,16] - SarConst) or
48538 // into (ashr (sext_in_reg a), SarConst - [56,48,32,24,16])
48539 // depending on the sign of (SarConst - [56,48,32,24,16]).
48540
48541 // sexts on X86 are MOVs. The MOVs have the same code size
48542 // as the SHIFTs above (only a SHIFT by 1 has a smaller encoding).
48543 // However, the MOVs have two advantages over a SHIFT:
48544 // 1. MOVs can write to a register that differs from the source.
48545 // 2. MOVs accept memory operands.
48546
48547 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48548 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48549 N0.getOperand(1).getOpcode() != ISD::Constant)
48550 return SDValue();
48551
48552 SDValue N00 = N0.getOperand(0);
48553 SDValue N01 = N0.getOperand(1);
48554 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48555 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48556 EVT CVT = N1.getValueType();
48557
48558 if (SarConst.isNegative())
48559 return SDValue();
48560
48561 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48562 unsigned ShiftSize = SVT.getSizeInBits();
48563 // Skip types without a corresponding sext/zext and ShlConst values
48564 // that are not one of [56,48,32,24,16].
48565 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48566 continue;
48567 SDLoc DL(N);
48568 SDValue NN =
48569 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48570 SarConst = SarConst - (Size - ShiftSize);
48571 if (SarConst == 0)
48572 return NN;
48573 if (SarConst.isNegative())
48574 return DAG.getNode(ISD::SHL, DL, VT, NN,
48575 DAG.getConstant(-SarConst, DL, CVT));
48576 return DAG.getNode(ISD::SRA, DL, VT, NN,
48577 DAG.getConstant(SarConst, DL, CVT));
48578 }
48579 return SDValue();
48580}
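// A worked example of the fold above (illustrative only), taking i64 values
// and ShlConst == 56 so that SVT == i8:
//   (sra (shl X, 56), 58) --> (sra (sext_in_reg X, i8), 2)
//   (sra (shl X, 56), 40) --> (shl (sext_in_reg X, i8), 16)
// The sign extension of the low byte is made explicit, and the remaining
// shift amount is |SarConst - 56| with its direction given by the sign of
// that difference.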
48581
48582static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48583 TargetLowering::DAGCombinerInfo &DCI,
48584 const X86Subtarget &Subtarget) {
48585 SDValue N0 = N->getOperand(0);
48586 SDValue N1 = N->getOperand(1);
48587 EVT VT = N0.getValueType();
48588
48589 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48590 return V;
48591
48592 // Only do this on the last DAG combine as it can interfere with other
48593 // combines.
48594 if (!DCI.isAfterLegalizeDAG())
48595 return SDValue();
48596
48597 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48598 // TODO: This is a generic DAG combine that became an x86-only combine to
48599 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48600 // and-not ('andn').
48601 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48602 return SDValue();
48603
48604 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48605 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48606 if (!ShiftC || !AndC)
48607 return SDValue();
48608
48609 // If we can shrink the constant mask below 8 bits or 32 bits, then this
48610 // transform should reduce code size. It may also enable secondary transforms
48611 // from improved known-bits analysis or instruction selection.
48612 APInt MaskVal = AndC->getAPIntValue();
48613
48614 // If this can be matched by a zero extend, don't optimize.
48615 if (MaskVal.isMask()) {
48616 unsigned TO = MaskVal.countr_one();
48617 if (TO >= 8 && isPowerOf2_32(TO))
48618 return SDValue();
48619 }
48620
48621 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48622 unsigned OldMaskSize = MaskVal.getSignificantBits();
48623 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48624 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48625 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48626 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48627 SDLoc DL(N);
48628 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48629 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48630 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48631 }
48632 return SDValue();
48633}
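// A worked example of the mask-shrinking fold above (illustrative only):
//   srl (and X, 0xFF0), 4 --> and (srl X, 4), 0xFF
// The original mask needs 12 significant bits while the shifted mask fits in
// 8, so the AND gets a shorter encoding; 0xFF0 is not a low-bits mask, so
// the zero-extend bail-out above does not fire.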
48634
48635static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48636 const X86Subtarget &Subtarget) {
48637 unsigned Opcode = N->getOpcode();
48638 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48639
48640 SDLoc DL(N);
48641 EVT VT = N->getValueType(0);
48642 SDValue N0 = N->getOperand(0);
48643 SDValue N1 = N->getOperand(1);
48644 EVT SrcVT = N0.getValueType();
48645
48646 SDValue BC0 =
48647 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48648 SDValue BC1 =
48649 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48650
48651 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48652 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
48653 // truncation trees that help us avoid lane-crossing shuffles.
48654 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48655 // TODO: We don't handle vXf64 shuffles yet.
48656 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48657 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48658 SmallVector<SDValue> ShuffleOps;
48659 SmallVector<int> ShuffleMask, ScaledMask;
48660 SDValue Vec = peekThroughBitcasts(BCSrc);
48661 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48662 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48663 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48664 // shuffle to a v4X64 width - we can probably relax this in the future.
48665 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48666 ShuffleOps[0].getValueType().is256BitVector() &&
48667 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48668 SDValue Lo, Hi;
48669 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48670 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48671 Lo = DAG.getBitcast(SrcVT, Lo);
48672 Hi = DAG.getBitcast(SrcVT, Hi);
48673 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48674 Res = DAG.getBitcast(ShufVT, Res);
48675 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48676 return DAG.getBitcast(VT, Res);
48677 }
48678 }
48679 }
48680 }
48681
48682 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48683 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48684 // If either/both ops are a shuffle that can scale to v2x64,
48685 // then see if we can perform this as a v4x32 post shuffle.
48686 SmallVector<SDValue> Ops0, Ops1;
48687 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48688 bool IsShuf0 =
48689 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48690 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48691 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48692 bool IsShuf1 =
48693 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48694 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48695 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48696 if (IsShuf0 || IsShuf1) {
48697 if (!IsShuf0) {
48698 Ops0.assign({BC0});
48699 ScaledMask0.assign({0, 1});
48700 }
48701 if (!IsShuf1) {
48702 Ops1.assign({BC1});
48703 ScaledMask1.assign({0, 1});
48704 }
48705
48706 SDValue LHS, RHS;
48707 int PostShuffle[4] = {-1, -1, -1, -1};
48708 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48709 if (M < 0)
48710 return true;
48711 Idx = M % 2;
48712 SDValue Src = Ops[M / 2];
48713 if (!LHS || LHS == Src) {
48714 LHS = Src;
48715 return true;
48716 }
48717 if (!RHS || RHS == Src) {
48718 Idx += 2;
48719 RHS = Src;
48720 return true;
48721 }
48722 return false;
48723 };
48724 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48725 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48726 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48727 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48728 LHS = DAG.getBitcast(SrcVT, LHS);
48729 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48730 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48731 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48732 Res = DAG.getBitcast(ShufVT, Res);
48733 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48734 return DAG.getBitcast(VT, Res);
48735 }
48736 }
48737 }
48738
48739 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48740 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48741 SmallVector<int> Mask0, Mask1;
48742 SmallVector<SDValue> Ops0, Ops1;
48743 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48744 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48745 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48746 !Ops0.empty() && !Ops1.empty() &&
48747 all_of(Ops0,
48748 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48749 all_of(Ops1,
48750 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48751 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48752 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48753 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48754 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48755 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48756 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48757 if ((Op00 == Op11) && (Op01 == Op10)) {
48758 std::swap(Op10, Op11);
48759 ShuffleVectorSDNode::commuteMask(ScaledMask1);
48760 }
48761 if ((Op00 == Op10) && (Op01 == Op11)) {
48762 const int Map[4] = {0, 2, 1, 3};
48763 SmallVector<int, 4> ShuffleMask(
48764 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48765 Map[ScaledMask1[1]]});
48766 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48767 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48768 DAG.getBitcast(SrcVT, Op01));
48769 Res = DAG.getBitcast(ShufVT, Res);
48770 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48771 return DAG.getBitcast(VT, Res);
48772 }
48773 }
48774 }
48775
48776 return SDValue();
48777}
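// An illustrative example of the HOP(SHUFFLE,SHUFFLE) -> SHUFFLE(HOP)
// rewrites above, for v4i32 HADD where both inputs only permute whole
// 64-bit halves:
//   HADD(PSHUFD<2,3,0,1>(X), PSHUFD<2,3,0,1>(Y)) == PSHUFD<1,0,3,2>(HADD(X,Y))
// since HADD(X,Y) = [X0+X1, X2+X3, Y0+Y1, Y2+Y3] and swapping the 64-bit
// halves of each input merely swaps the corresponding pairs of sums.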
48778
48779static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48780 TargetLowering::DAGCombinerInfo &DCI,
48781 const X86Subtarget &Subtarget) {
48782 unsigned Opcode = N->getOpcode();
48783 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48784 "Unexpected pack opcode");
48785
48786 EVT VT = N->getValueType(0);
48787 SDValue N0 = N->getOperand(0);
48788 SDValue N1 = N->getOperand(1);
48789 unsigned NumDstElts = VT.getVectorNumElements();
48790 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48791 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48792 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48793 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48794 "Unexpected PACKSS/PACKUS input type");
48795
48796 bool IsSigned = (X86ISD::PACKSS == Opcode);
48797
48798 // Constant Folding.
48799 APInt UndefElts0, UndefElts1;
48800 SmallVector<APInt, 32> EltBits0, EltBits1;
48801 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48802 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48803 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48804 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48805 unsigned NumLanes = VT.getSizeInBits() / 128;
48806 unsigned NumSrcElts = NumDstElts / 2;
48807 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48808 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48809
48810 APInt Undefs(NumDstElts, 0);
48811 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48812 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48813 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48814 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48815 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48816 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48817
48818 if (UndefElts[SrcIdx]) {
48819 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48820 continue;
48821 }
48822
48823 APInt &Val = EltBits[SrcIdx];
48824 if (IsSigned) {
48825 // PACKSS: Truncate signed value with signed saturation.
48826 // Source values less than dst minint are saturated to minint.
48827 // Source values greater than dst maxint are saturated to maxint.
48828 if (Val.isSignedIntN(DstBitsPerElt))
48829 Val = Val.trunc(DstBitsPerElt);
48830 else if (Val.isNegative())
48831 Val = APInt::getSignedMinValue(DstBitsPerElt);
48832 else
48833 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48834 } else {
48835 // PACKUS: Truncate signed value with unsigned saturation.
48836 // Source values less than zero are saturated to zero.
48837 // Source values greater than dst maxuint are saturated to maxuint.
48838 if (Val.isIntN(DstBitsPerElt))
48839 Val = Val.trunc(DstBitsPerElt);
48840 else if (Val.isNegative())
48841 Val = APInt::getZero(DstBitsPerElt);
48842 else
48843 Val = APInt::getAllOnes(DstBitsPerElt);
48844 }
48845 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48846 }
48847 }
48848
48849 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48850 }
48851
48852 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48853 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48854 return V;
48855
48856 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48857 // truncate to create a larger truncate.
48858 if (Subtarget.hasAVX512() &&
48859 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48860 N0.getOperand(0).getValueType() == MVT::v8i32) {
48861 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48862 (!IsSigned &&
48863 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48864 if (Subtarget.hasVLX())
48865 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48866
48867 // Widen input to v16i32 so we can truncate that.
48868 SDLoc dl(N);
48869 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48870 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48871 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48872 }
48873 }
48874
48875 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48876 if (VT.is128BitVector()) {
48877 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48878 SDValue Src0, Src1;
48879 if (N0.getOpcode() == ExtOpc &&
48880 N0.getOperand(0).getValueType().is64BitVector() &&
48881 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48882 Src0 = N0.getOperand(0);
48883 }
48884 if (N1.getOpcode() == ExtOpc &&
48885 N1.getOperand(0).getValueType().is64BitVector() &&
48886 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48887 Src1 = N1.getOperand(0);
48888 }
48889 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48890 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48891 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48892 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48893 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48894 }
48895
48896 // Try again with pack(*_extend_vector_inreg, undef).
48897 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48898 : ISD::ZERO_EXTEND_VECTOR_INREG;
48899 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48900 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48901 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48902 DAG);
48903 }
48904
48905 // Attempt to combine as shuffle.
48906 SDValue Op(N, 0);
48907 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48908 return Res;
48909
48910 return SDValue();
48911}
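// A minimal scalar model of the per-element saturation performed by the
// constant folding above (an illustrative sketch only; packSat16To8 is a
// hypothetical helper, assuming <cstdint> types): PACKSS clamps each signed
// i16 to [-128, 127], PACKUS clamps it to [0, 255], then both truncate to
// 8 bits.
static inline uint8_t packSat16To8(int16_t V, bool IsSigned) {
  if (IsSigned) {
    // PACKSS: signed saturation; the truncation keeps the 8-bit pattern.
    int16_t C = V < -128 ? int16_t(-128) : (V > 127 ? int16_t(127) : V);
    return static_cast<uint8_t>(C);
  }
  // PACKUS: unsigned saturation of a signed source value.
  return static_cast<uint8_t>(V < 0 ? 0 : (V > 255 ? 255 : V));
}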
48912
48913static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48914 TargetLowering::DAGCombinerInfo &DCI,
48915 const X86Subtarget &Subtarget) {
48916 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48917 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48918 "Unexpected horizontal add/sub opcode");
48919
48920 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48921 MVT VT = N->getSimpleValueType(0);
48922 SDValue LHS = N->getOperand(0);
48923 SDValue RHS = N->getOperand(1);
48924
48925 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48926 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48927 LHS.getOpcode() == RHS.getOpcode() &&
48928 LHS.getValueType() == RHS.getValueType() &&
48929 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48930 SDValue LHS0 = LHS.getOperand(0);
48931 SDValue LHS1 = LHS.getOperand(1);
48932 SDValue RHS0 = RHS.getOperand(0);
48933 SDValue RHS1 = RHS.getOperand(1);
48934 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48935 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48936 SDLoc DL(N);
48937 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48938 LHS0.isUndef() ? LHS1 : LHS0,
48939 RHS0.isUndef() ? RHS1 : RHS0);
48940 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48941 Res = DAG.getBitcast(ShufVT, Res);
48942 SDValue NewLHS =
48943 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48944 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48945 SDValue NewRHS =
48946 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48947 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48948 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48949 DAG.getBitcast(VT, NewRHS));
48950 }
48951 }
48952 }
48953
48954 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48955 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48956 return V;
48957
48958 return SDValue();
48959}
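// An illustrative example of the HOP(HOP'(X,X),HOP'(Y,Y)) rewrite above, for
// v4i32: HADD(X,Y) = [X0+X1, X2+X3, Y0+Y1, Y2+Y3], so
//   PSHUFD<0,1,0,1>(HADD(X,Y)) reproduces HADD(X,X) and
//   PSHUFD<2,3,2,3>(HADD(X,Y)) reproduces HADD(Y,Y).
// Computing HADD(X,Y) once and recreating both operands with PSHUFDs leaves
// two horizontal ops instead of three, which is preferable when horizontal
// ops are slow on the target.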
48960
48961static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48962 TargetLowering::DAGCombinerInfo &DCI,
48963 const X86Subtarget &Subtarget) {
48964 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48965 X86ISD::VSRL == N->getOpcode()) &&
48966 "Unexpected shift opcode");
48967 EVT VT = N->getValueType(0);
48968 SDValue N0 = N->getOperand(0);
48969 SDValue N1 = N->getOperand(1);
48970
48971 // Shift zero -> zero.
48972 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48973 return DAG.getConstant(0, SDLoc(N), VT);
48974
48975 // Detect constant shift amounts.
48976 APInt UndefElts;
48977 SmallVector<APInt, 32> EltBits;
48978 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
48979 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48980 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48981 EltBits[0].getZExtValue(), DAG);
48982 }
48983
48984 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48985 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48986 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48987 return SDValue(N, 0);
48988
48989 return SDValue();
48990}
48991
48992static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48993 TargetLowering::DAGCombinerInfo &DCI,
48994 const X86Subtarget &Subtarget) {
48995 unsigned Opcode = N->getOpcode();
48996 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48997 X86ISD::VSRLI == Opcode) &&
48998 "Unexpected shift opcode");
48999 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49000 EVT VT = N->getValueType(0);
49001 SDValue N0 = N->getOperand(0);
49002 SDValue N1 = N->getOperand(1);
49003 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49004 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49005 "Unexpected value type");
49006 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49007
49008 // (shift undef, X) -> 0
49009 if (N0.isUndef())
49010 return DAG.getConstant(0, SDLoc(N), VT);
49011
49012 // Out of range logical bit shifts are guaranteed to be zero.
49013 // Out of range arithmetic bit shifts splat the sign bit.
49014 unsigned ShiftVal = N->getConstantOperandVal(1);
49015 if (ShiftVal >= NumBitsPerElt) {
49016 if (LogicalShift)
49017 return DAG.getConstant(0, SDLoc(N), VT);
49018 ShiftVal = NumBitsPerElt - 1;
49019 }
49020
49021 // (shift X, 0) -> X
49022 if (!ShiftVal)
49023 return N0;
49024
49025 // (shift 0, C) -> 0
49026 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49027 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49028 // result are all zeros, not undef.
49029 return DAG.getConstant(0, SDLoc(N), VT);
49030
49031 // (VSRAI -1, C) -> -1
49032 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49033 // N0 is all ones or undef. We guarantee that the bits shifted into the
49034 // result are all ones, not undef.
49035 return DAG.getConstant(-1, SDLoc(N), VT);
49036
49037 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49038 unsigned NewShiftVal = Amt0 + Amt1;
49039 if (NewShiftVal >= NumBitsPerElt) {
49040 // Out of range logical bit shifts are guaranteed to be zero.
49041 // Out of range arithmetic bit shifts splat the sign bit.
49042 if (LogicalShift)
49043 return DAG.getConstant(0, SDLoc(N), VT);
49044 NewShiftVal = NumBitsPerElt - 1;
49045 }
49046 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49047 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49048 };
49049
49050 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49051 if (Opcode == N0.getOpcode())
49052 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49053
49054 // (shl (add X, X), C) -> (shl X, (C + 1))
49055 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49056 N0.getOperand(0) == N0.getOperand(1))
49057 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49058
49059 // We can decode 'whole byte' logical bit shifts as shuffles.
49060 if (LogicalShift && (ShiftVal % 8) == 0) {
49061 SDValue Op(N, 0);
49062 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49063 return Res;
49064 }
49065
49066 auto TryConstantFold = [&](SDValue V) {
49067 APInt UndefElts;
49068 SmallVector<APInt, 32> EltBits;
49069 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49070 return SDValue();
49071 assert(EltBits.size() == VT.getVectorNumElements() &&
49072 "Unexpected shift value type");
49073 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49074 // created an undef input due to no input bits being demanded, but the user
49075 // still expects 0 in the other bits.
49076 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49077 APInt &Elt = EltBits[i];
49078 if (UndefElts[i])
49079 Elt = 0;
49080 else if (X86ISD::VSHLI == Opcode)
49081 Elt <<= ShiftVal;
49082 else if (X86ISD::VSRAI == Opcode)
49083 Elt.ashrInPlace(ShiftVal);
49084 else
49085 Elt.lshrInPlace(ShiftVal);
49086 }
49087 // Reset undef elements since they were zeroed above.
49088 UndefElts = 0;
49089 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49090 };
49091
49092 // Constant Folding.
49093 if (N->isOnlyUserOf(N0.getNode())) {
49094 if (SDValue C = TryConstantFold(N0))
49095 return C;
49096
49097 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49098 // Don't break NOT patterns.
49099 SDValue BC = peekThroughOneUseBitcasts(N0);
49100 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49101 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49102 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49103 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49104 SDLoc DL(N);
49105 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49106 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49107 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49108 }
49109 }
49110 }
49111
49112 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49113 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49114 DCI))
49115 return SDValue(N, 0);
49116
49117 return SDValue();
49118}
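// Worked examples of the constant-amount merges above (illustrative only),
// for v8i16 where NumBitsPerElt == 16:
//   (VSRLI (VSRLI X, 3), 7) --> (VSRLI X, 10)
//   (VSRLI (VSRLI X, 9), 9) --> all-zeros      (logical shift out of range)
//   (VSRAI (VSRAI X, 9), 9) --> (VSRAI X, 15)  (arithmetic shift clamps)
//   (VSHLI (add X, X), 2)   --> (VSHLI X, 3)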
49119
49120static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49121 TargetLowering::DAGCombinerInfo &DCI,
49122 const X86Subtarget &Subtarget) {
49123 EVT VT = N->getValueType(0);
49124 unsigned Opcode = N->getOpcode();
49125 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49126 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49127 Opcode == ISD::INSERT_VECTOR_ELT) &&
49128 "Unexpected vector insertion");
49129
49130 SDValue Vec = N->getOperand(0);
49131 SDValue Scl = N->getOperand(1);
49132 SDValue Idx = N->getOperand(2);
49133
49134 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49135 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49136 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49137
49138 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49139 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49140 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49141 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49142 APInt::getAllOnes(NumBitsPerElt), DCI))
49143 return SDValue(N, 0);
49144 }
49145
49146 // Attempt to combine insertion patterns to a shuffle.
49147 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49148 SDValue Op(N, 0);
49149 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49150 return Res;
49151 }
49152
49153 return SDValue();
49154}
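// An illustrative example of the simplest fold above:
//   insert_vector_elt(undef v4i32, x, 0) --> scalar_to_vector(x)
// Inserting into lane 0 of an undef vector needs no blend at all.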
49155
49156/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49157/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49158/// OR -> CMPNEQSS.
49159static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49160 TargetLowering::DAGCombinerInfo &DCI,
49161 const X86Subtarget &Subtarget) {
49162 unsigned opcode;
49163
49164 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49165 // we're requiring SSE2 for both.
49166 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49167 SDValue N0 = N->getOperand(0);
49168 SDValue N1 = N->getOperand(1);
49169 SDValue CMP0 = N0.getOperand(1);
49170 SDValue CMP1 = N1.getOperand(1);
49171 SDLoc DL(N);
49172
49173 // The SETCCs should both refer to the same CMP.
49174 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49175 return SDValue();
49176
49177 SDValue CMP00 = CMP0->getOperand(0);
49178 SDValue CMP01 = CMP0->getOperand(1);
49179 EVT VT = CMP00.getValueType();
49180
49181 if (VT == MVT::f32 || VT == MVT::f64 ||
49182 (VT == MVT::f16 && Subtarget.hasFP16())) {
49183 bool ExpectingFlags = false;
49184 // Check for any users that want flags:
49185 for (const SDNode *U : N->uses()) {
49186 if (ExpectingFlags)
49187 break;
49188
49189 switch (U->getOpcode()) {
49190 default:
49191 case ISD::BR_CC:
49192 case ISD::BRCOND:
49193 case ISD::SELECT:
49194 ExpectingFlags = true;
49195 break;
49196 case ISD::CopyToReg:
49197 case ISD::SIGN_EXTEND:
49198 case ISD::ZERO_EXTEND:
49199 case ISD::ANY_EXTEND:
49200 break;
49201 }
49202 }
49203
49204 if (!ExpectingFlags) {
49205 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49206 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49207
49208 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49209 X86::CondCode tmp = cc0;
49210 cc0 = cc1;
49211 cc1 = tmp;
49212 }
49213
49214 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49215 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49216 // FIXME: need symbolic constants for these magic numbers.
49217 // See X86ATTInstPrinter.cpp:printSSECC().
49218 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49219 if (Subtarget.hasAVX512()) {
49220 SDValue FSetCC =
49221 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49222 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49223 // Need to fill with zeros to ensure the bitcast will produce zeroes
49224 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49225 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49226 DAG.getConstant(0, DL, MVT::v16i1),
49227 FSetCC, DAG.getIntPtrConstant(0, DL));
49228 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49229 N->getSimpleValueType(0));
49230 }
49231 SDValue OnesOrZeroesF =
49232 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49233 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49234
49235 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49236 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49237
49238 if (is64BitFP && !Subtarget.is64Bit()) {
49239 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49240 // 64-bit integer, since that's not a legal type. Since
49241 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49242 // bits, but can do this little dance to extract the lowest 32 bits
49243 // and work with those going forward.
49244 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49245 OnesOrZeroesF);
49246 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49247 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49248 Vector32, DAG.getIntPtrConstant(0, DL));
49249 IntVT = MVT::i32;
49250 }
49251
49252 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49253 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49254 DAG.getConstant(1, DL, IntVT));
49255 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49256 ANDed);
49257 return OneBitOfTruth;
49258 }
49259 }
49260 }
49261 }
49262 return SDValue();
49263}
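// An illustrative example of the fold above (SSE path, illustrative only):
// the DAG for an ordered floating-point equality,
//   and (setcc_e (fcmp x, y)), (setcc_np (fcmp x, y)),
// becomes a single CMPEQSS producing an all-ones/all-zeros mask, followed by
// an AND with 1 and a truncate to extract the low bit, instead of
// materializing two flag-based setcc results from one UCOMISS.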
49264
49265/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49266static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49267 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49268
49269 MVT VT = N->getSimpleValueType(0);
49270 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49271 return SDValue();
49272
49273 SDValue X, Y;
49274 SDValue N0 = N->getOperand(0);
49275 SDValue N1 = N->getOperand(1);
49276
49277 if (SDValue Not = IsNOT(N0, DAG)) {
49278 X = Not;
49279 Y = N1;
49280 } else if (SDValue Not = IsNOT(N1, DAG)) {
49281 X = Not;
49282 Y = N0;
49283 } else
49284 return SDValue();
49285
49286 X = DAG.getBitcast(VT, X);
49287 Y = DAG.getBitcast(VT, Y);
49288 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49289}
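// An illustrative example of the fold above, for v4i32:
//   and (xor X, splat(-1)), Y --> X86ISD::ANDNP X, Y
// which selects to a single PANDN instead of a PXOR with all-ones followed
// by a PAND.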
49290
49291/// Try to fold:
49292/// and (vector_shuffle<Z,...,Z>
49293/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49294/// ->
49295/// andnp (vector_shuffle<Z,...,Z>
49296/// (insert_vector_elt undef, X, Z), undef), Y
49297static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49298 const X86Subtarget &Subtarget) {
49299 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49300
49301 EVT VT = N->getValueType(0);
49302 // Do not split 256-bit and 512-bit vectors with SSE2, as doing so overwrites
49303 // the original value and requires extra moves.
49304 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49305 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49306 return SDValue();
49307
49308 auto GetNot = [&DAG](SDValue V) {
49309 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49310 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49311 // end-users are ISD::AND, including cases such as
49312 // (and(extract_vector_element(SVN), Y)).
49313 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49314 !SVN->getOperand(1).isUndef()) {
49315 return SDValue();
49316 }
49317 SDValue IVEN = SVN->getOperand(0);
49318 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49319 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49320 return SDValue();
49321 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49322 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49323 return SDValue();
49324 SDValue Src = IVEN.getOperand(1);
49325 if (SDValue Not = IsNOT(Src, DAG)) {
49326 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49327 SDValue NotIVEN =
49328 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49329 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49330 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49331 SVN->getOperand(1), SVN->getMask());
49332 }
49333 return SDValue();
49334 };
49335
49336 SDValue X, Y;
49337 SDValue N0 = N->getOperand(0);
49338 SDValue N1 = N->getOperand(1);
49339
49340 if (SDValue Not = GetNot(N0)) {
49341 X = Not;
49342 Y = N1;
49343 } else if (SDValue Not = GetNot(N1)) {
49344 X = Not;
49345 Y = N0;
49346 } else
49347 return SDValue();
49348
49349 X = DAG.getBitcast(VT, X);
49350 Y = DAG.getBitcast(VT, Y);
49351 SDLoc DL(N);
49352 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49353 // AVX2.
49354 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49355 SDValue LoX, HiX;
49356 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49357 SDValue LoY, HiY;
49358 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49359 EVT SplitVT = LoX.getValueType();
49360 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49361 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49362 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49363 }
49364 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49365}
49366
49367// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49368// logical operations, like in the example below.
49369 // or (and (truncate x), (truncate y)),
49370 //    (xor (truncate z), build_vector (constants))
49371 // Given a target type \p VT, we generate
49372 // or (and x, y), (xor z, zext(build_vector (constants)))
49373 // given that x, y and z are of type \p VT. We can do so if each operand is
49374 // either a truncate from a VT type, a vector of constants, or can itself be
49375 // recursively promoted.
49376static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49377 unsigned Depth) {
49378 // Limit recursion to avoid excessive compile times.
49379 if (Depth >= SelectionDAG::MaxRecursionDepth)
49380 return SDValue();
49381
49382 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49383 N->getOpcode() != ISD::OR)
49384 return SDValue();
49385
49386 SDValue N0 = N->getOperand(0);
49387 SDValue N1 = N->getOperand(1);
49388 SDLoc DL(N);
49389
49390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49391 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49392 return SDValue();
49393
49394 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49395 N0 = NN0;
49396 else {
49397 // The Left side has to be a trunc.
49398 if (N0.getOpcode() != ISD::TRUNCATE)
49399 return SDValue();
49400
49401 // The type of the truncated inputs.
49402 if (N0.getOperand(0).getValueType() != VT)
49403 return SDValue();
49404
49405 N0 = N0.getOperand(0);
49406 }
49407
49408 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49409 N1 = NN1;
49410 else {
49411 // The right side has to be a 'trunc' or a constant vector.
49412 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49413 N1.getOperand(0).getValueType() == VT;
49414 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49415 return SDValue();
49416
49417 if (RHSTrunc)
49418 N1 = N1.getOperand(0);
49419 else
49420 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49421 }
49422
49423 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49424}
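// An illustrative example of the promotion above, with \p VT == v8i32:
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector(Cs))
// becomes
//   or (and x, y), (xor z, zext(build_vector(Cs)))
// so the logic stays in the wide type and the truncates around it disappear.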
49425
49426 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
49427 // register. In most cases we actually compare or select YMM-sized registers,
49428 // and mixing the two types creates horrible code. This method optimizes
49429// some of the transition sequences.
49430// Even with AVX-512 this is still useful for removing casts around logical
49431// operations on vXi1 mask types.
49432static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49433 const X86Subtarget &Subtarget) {
49434 EVT VT = N->getValueType(0);
49435 assert(VT.isVector() && "Expected vector type");
49436
49437 SDLoc DL(N);
49438 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49439 N->getOpcode() == ISD::ZERO_EXTEND ||
49440 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49441
49442 SDValue Narrow = N->getOperand(0);
49443 EVT NarrowVT = Narrow.getValueType();
49444
49445 // Generate the wide operation.
49446 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49447 if (!Op)
49448 return SDValue();
49449 switch (N->getOpcode()) {
49450 default: llvm_unreachable("Unexpected opcode");
49451 case ISD::ANY_EXTEND:
49452 return Op;
49453 case ISD::ZERO_EXTEND:
49454 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49455 case ISD::SIGN_EXTEND:
49456 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49457 Op, DAG.getValueType(NarrowVT));
49458 }
49459}
49460
49461static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49462 unsigned FPOpcode;
49463 switch (Opcode) {
49464 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49465 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49466 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49467 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49468 }
49469 return FPOpcode;
49470}
49471
49472/// If both input operands of a logic op are being cast from floating-point
49473/// types or FP compares, try to convert this into a floating-point logic node
49474/// to avoid unnecessary moves from SSE to integer registers.
49475static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49476 TargetLowering::DAGCombinerInfo &DCI,
49477 const X86Subtarget &Subtarget) {
49478 EVT VT = N->getValueType(0);
49479 SDValue N0 = N->getOperand(0);
49480 SDValue N1 = N->getOperand(1);
49481 SDLoc DL(N);
49482
49483 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49484 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49485 return SDValue();
49486
49487 SDValue N00 = N0.getOperand(0);
49488 SDValue N10 = N1.getOperand(0);
49489 EVT N00Type = N00.getValueType();
49490 EVT N10Type = N10.getValueType();
49491
49492 // Ensure that both types are the same and are legal scalar fp types.
49493 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49494 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49495 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49496 return SDValue();
49497
49498 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49499 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49500 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49501 return DAG.getBitcast(VT, FPLogic);
49502 }
49503
49504 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49505 !N1.hasOneUse())
49506 return SDValue();
49507
49508 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49509 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49510
49511 // The vector ISA for FP predicates is incomplete before AVX, so converting
49512 // COMIS* to CMPS* may not be a win before AVX.
49513 if (!Subtarget.hasAVX() &&
49514 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49515 return SDValue();
49516
49517 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49518 // and vector logic:
49519 // logic (setcc N00, N01), (setcc N10, N11) -->
49520 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
49521 unsigned NumElts = 128 / N00Type.getSizeInBits();
49522 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49523 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49524 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49525 SDValue N01 = N0.getOperand(1);
49526 SDValue N11 = N1.getOperand(1);
49527 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49528 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49529 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49530 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49531 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49532 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49533 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49534 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49535}
49536
49537// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49538// to reduce XMM->GPR traffic.
49539static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49540 unsigned Opc = N->getOpcode();
49541 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49542 "Unexpected bit opcode");
49543
49544 SDValue N0 = N->getOperand(0);
49545 SDValue N1 = N->getOperand(1);
49546
49547 // Both operands must be single use MOVMSK.
49548 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49549 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49550 return SDValue();
49551
49552 SDValue Vec0 = N0.getOperand(0);
49553 SDValue Vec1 = N1.getOperand(0);
49554 EVT VecVT0 = Vec0.getValueType();
49555 EVT VecVT1 = Vec1.getValueType();
49556
49557 // Both MOVMSK operands must be from vectors of the same size and same element
49558 // size, but an fp/int type difference is OK.
49559 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49560 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49561 return SDValue();
49562
49563 SDLoc DL(N);
49564 unsigned VecOpc =
49565 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49566 SDValue Result =
49567 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49568 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49569}
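// An illustrative example of the fold above:
//   or (movmsk V0), (movmsk V1) --> movmsk (or V0, V1)
// trading two MOVMSK transfers plus a GPR OR for one vector OR and a single
// MOVMSK, which is the XMM->GPR traffic reduction mentioned above.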
49570
49571// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49572// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49573// handles in InstCombine.
49574static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49575 unsigned Opc = N->getOpcode();
49576 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49577 "Unexpected bit opcode");
49578
49579 SDValue N0 = N->getOperand(0);
49580 SDValue N1 = N->getOperand(1);
49581 EVT VT = N->getValueType(0);
49582
49583 // Both operands must be single use.
49584 if (!N0.hasOneUse() || !N1.hasOneUse())
49585 return SDValue();
49586
49587 // Search for matching shifts.
49588 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49589 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49590
49591 unsigned BCOpc = BC0.getOpcode();
49592 EVT BCVT = BC0.getValueType();
49593 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49594 return SDValue();
49595
49596 switch (BCOpc) {
49597 case X86ISD::VSHLI:
49598 case X86ISD::VSRLI:
49599 case X86ISD::VSRAI: {
49600 if (BC0.getOperand(1) != BC1.getOperand(1))
49601 return SDValue();
49602
49603 SDLoc DL(N);
49604 SDValue BitOp =
49605 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49606 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49607 return DAG.getBitcast(VT, Shift);
49608 }
49609 }
49610
49611 return SDValue();
49612}
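// Illustrative example of the fold above (a sketch, assuming equal immediate
// shift amounts):
//   (xor (vsrli X, 3), (vsrli Y, 3)) --> (vsrli (xor X, Y), 3)
// which removes one of the two shifts.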
49613
49614/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49615/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49616/// with a shift-right to eliminate loading the vector constant mask value.
49617static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49618 const X86Subtarget &Subtarget) {
49619 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49620 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49621 EVT VT = Op0.getValueType();
49622 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49623 return SDValue();
49624
49625 // Try to convert an "is positive" signbit masking operation into arithmetic
49626 // shift and "andn". This saves a materialization of a -1 vector constant.
49627 // The "is negative" variant should be handled more generally because it only
49628 // requires "and" rather than "andn":
49629 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49630 //
49631 // This is limited to the original type to avoid producing even more bitcasts.
49632 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49633 // will be profitable.
49634 if (N->getValueType(0) == VT &&
49635 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
49636 SDValue X, Y;
49637 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49638 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49639 X = Op1.getOperand(0);
49640 Y = Op0;
49641 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49642 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49643 X = Op0.getOperand(0);
49644 Y = Op1;
49645 }
49646 if (X && Y) {
49647 SDLoc DL(N);
49648 SDValue Sra =
49649 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49650 VT.getScalarSizeInBits() - 1, DAG);
49651 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49652 }
49653 }
49654
49655 APInt SplatVal;
49656 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49657 !SplatVal.isMask())
49658 return SDValue();
49659
49660 // Don't prevent creation of ANDN.
49661 if (isBitwiseNot(Op0))
49662 return SDValue();
49663
49664 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
49665 return SDValue();
49666
49667 unsigned EltBitWidth = VT.getScalarSizeInBits();
49668 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49669 return SDValue();
49670
49671 SDLoc DL(N);
49672 unsigned ShiftVal = SplatVal.countr_one();
49673 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49674 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49675 return DAG.getBitcast(N->getValueType(0), Shift);
49676}
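// Illustrative example of the mask-to-shift fold above (a sketch, assuming a
// v4i32 Op0 whose elements are known all-zeros/all-ones, e.g. a pcmpeq result):
//   (and Op0, (splat 0x1)) --> (vsrli Op0, 31)
// which avoids loading the splat mask constant from the constant pool.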
49677
49678// Get the index node from the lowered DAG of a GEP IR instruction with one
49679// indexing dimension.
49680static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49681 if (Ld->isIndexed())
49682 return SDValue();
49683
49684 SDValue Base = Ld->getBasePtr();
49685
49686 if (Base.getOpcode() != ISD::ADD)
49687 return SDValue();
49688
49689 SDValue ShiftedIndex = Base.getOperand(0);
49690
49691 if (ShiftedIndex.getOpcode() != ISD::SHL)
49692 return SDValue();
49693
49694 return ShiftedIndex.getOperand(0);
49695
49696}
49697
49698static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49699 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
49700 switch (VT.getSizeInBits()) {
49701 default: return false;
49702 case 64: return Subtarget.is64Bit();
49703 case 32: return true;
49704 }
49705 }
49706 return false;
49707}
49708
49709// This function recognizes cases where the X86 bzhi instruction can replace an
49710// 'and-load' sequence.
49711// In the case of loading an integer value from an array of constants defined
49712// as follows:
49713//
49714// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49715//
49716// and then applying a bitwise and on the result with another input,
49717// this is equivalent to performing bzhi (zero high bits) on that input, with
49718// the same index as the load.
49719static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49720 const X86Subtarget &Subtarget) {
49721 MVT VT = Node->getSimpleValueType(0);
49722 SDLoc dl(Node);
49723
49724 // Check if subtarget has BZHI instruction for the node's type
49725 if (!hasBZHI(Subtarget, VT))
49726 return SDValue();
49727
49728 // Try matching the pattern for both operands.
49729 for (unsigned i = 0; i < 2; i++) {
49730 SDValue N = Node->getOperand(i);
49731 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49732
49733 // Bail out if the operand is not a load instruction.
49734 if (!Ld)
49735 return SDValue();
49736
49737 const Value *MemOp = Ld->getMemOperand()->getValue();
49738
49739 if (!MemOp)
49740 return SDValue();
49741
49742 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49743 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49744 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49745
49746 Constant *Init = GV->getInitializer();
49747 Type *Ty = Init->getType();
49748 if (!isa<ConstantDataArray>(Init) ||
49749 !Ty->getArrayElementType()->isIntegerTy() ||
49750 Ty->getArrayElementType()->getScalarSizeInBits() !=
49751 VT.getSizeInBits() ||
49752 Ty->getArrayNumElements() >
49753 Ty->getArrayElementType()->getScalarSizeInBits())
49754 continue;
49755
49756 // Check if the array's constant elements are suitable for our case.
49757 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49758 bool ConstantsMatch = true;
49759 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49760 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49761 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49762 ConstantsMatch = false;
49763 break;
49764 }
49765 }
49766 if (!ConstantsMatch)
49767 continue;
49768
49769 // Do the transformation (for a 32-bit type):
49770 // -> (and (load arr[idx]), inp)
49771 // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
49772 // which will later be selected as a single bzhi instruction.
49773 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49774 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49775
49776 // Get the Node which indexes into the array.
49777 SDValue Index = getIndexFromUnindexedLoad(Ld);
49778 if (!Index)
49779 return SDValue();
49780 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49781
49782 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49783 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49784
49785 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49786 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49787
49788 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49789 }
49790 }
49791 }
49792 }
49793 return SDValue();
49794}
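// Illustrative C source for the pattern recognized above (hypothetical names,
// not taken from a real test case); with BMI2 the load+and is replaced by a
// single BZHI of Val by Idx:
//   static const unsigned Masks[32] = {0x0, 0x1, 0x3, 0x7 /* ..., (1u<<i)-1 */};
//   unsigned zeroHighBits(unsigned Val, unsigned Idx) { return Val & Masks[Idx]; }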
49795
49796// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
49797// where C is a mask containing the same number of bits as the setcc and
49798// where the setcc will freely zero the upper bits of the k-register. We can
49799// replace the undef in the concat with 0s and remove the AND. This mainly
49800// helps with v2i1/v4i1 setcc being cast to scalar.
49801static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49802 const X86Subtarget &Subtarget) {
49803 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49804
49805 EVT VT = N->getValueType(0);
49806
49807 // Make sure this is an AND with a constant. We will check the value of the
49808 // constant later.
49809 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49810 if (!C1)
49811 return SDValue();
49812
49813 // This is implied by the ConstantSDNode.
49814 assert(!VT.isVector() && "Expected scalar VT!");
49815
49816 SDValue Src = N->getOperand(0);
49817 if (!Src.hasOneUse())
49818 return SDValue();
49819
49820 // (Optionally) peek through any_extend().
49821 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49822 if (!Src.getOperand(0).hasOneUse())
49823 return SDValue();
49824 Src = Src.getOperand(0);
49825 }
49826
49827 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49828 return SDValue();
49829
49830 Src = Src.getOperand(0);
49831 EVT SrcVT = Src.getValueType();
49832
49833 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49834 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49835 !TLI.isTypeLegal(SrcVT))
49836 return SDValue();
49837
49838 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49839 return SDValue();
49840
49841 // We only care about the first subvector of the concat; we expect the
49842 // other subvectors to be ignored due to the AND if we make the change.
49843 SDValue SubVec = Src.getOperand(0);
49844 EVT SubVecVT = SubVec.getValueType();
49845
49846 // The RHS of the AND should be a mask with as many bits as SubVec.
49847 if (!TLI.isTypeLegal(SubVecVT) ||
49848 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49849 return SDValue();
49850
49851 // The first subvector should be a setcc with a legal result type or an
49852 // AND containing at least one setcc with a legal result type.
49853 auto IsLegalSetCC = [&](SDValue V) {
49854 if (V.getOpcode() != ISD::SETCC)
49855 return false;
49856 EVT SetccVT = V.getOperand(0).getValueType();
49857 if (!TLI.isTypeLegal(SetccVT) ||
49858 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49859 return false;
49860 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49861 return false;
49862 return true;
49863 };
49864 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49865 (IsLegalSetCC(SubVec.getOperand(0)) ||
49866 IsLegalSetCC(SubVec.getOperand(1))))))
49867 return SDValue();
49868
49869 // We passed all the checks. Rebuild the concat_vectors with zeroes
49870 // and cast it back to VT.
49871 SDLoc dl(N);
49872 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49873 DAG.getConstant(0, dl, SubVecVT));
49874 Ops[0] = SubVec;
49875 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49876 Ops);
49877 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49878 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49879}
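// Illustrative example of the combine above (a sketch, assuming AVX-512VL and
// a v2i1 setcc widened by a concat to v8i1):
//   (and (bitcast i8 (concat (v2i1 setcc), undef, undef, undef)), 0x3)
// is rebuilt with the undef subvectors replaced by zero vectors, so the
// masking AND with 0x3 is no longer needed.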
49880
49881static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49882 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49883 // We don't want to go crazy with the recursion here. This isn't a super
49884 // important optimization.
49885 static constexpr unsigned kMaxDepth = 2;
49886
49887 // Only do this re-ordering if op has one use.
49888 if (!Op.hasOneUse())
49889 return SDValue();
49890
49891 SDLoc DL(Op);
49892 // If we hit another associative op, recurse further.
49893 if (Op.getOpcode() == Opc) {
49894 // Done recursing.
49895 if (Depth++ >= kMaxDepth)
49896 return SDValue();
49897
49898 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49899 if (SDValue R =
49900 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49901 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49902 Op.getOperand(1 - OpIdx));
49903
49904 } else if (Op.getOpcode() == ISD::SUB) {
49905 if (Opc == ISD::AND) {
49906 // BLSI: (and x, (sub 0, x))
49907 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49908 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49909 }
49910 // Opc must be ISD::AND or ISD::XOR
49911 // BLSR: (and x, (sub x, 1))
49912 // BLSMSK: (xor x, (sub x, 1))
49913 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49914 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49915
49916 } else if (Op.getOpcode() == ISD::ADD) {
49917 // Opc must be ISD::AND or ISD::XOR
49918 // BLSR: (and x, (add x, -1))
49919 // BLSMSK: (xor x, (add x, -1))
49920 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49921 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49922 }
49923 return SDValue();
49924}
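// Illustrative example of the reassociation above (a sketch):
//   (and (and z, (sub 0, x)), x) --> (and (and x, (sub 0, x)), z)
// which exposes the inner (and x, (sub 0, x)) so it can be selected as BLSI.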
49925
49926static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49927 const X86Subtarget &Subtarget) {
49928 EVT VT = N->getValueType(0);
49929 // Make sure this node is a candidate for BMI instructions.
49930 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49931 (VT != MVT::i32 && VT != MVT::i64))
49932 return SDValue();
49933
49934 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49935
49936 // Try and match LHS and RHS.
49937 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49938 if (SDValue OpMatch =
49939 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49940 N->getOperand(1 - OpIdx), 0))
49941 return OpMatch;
49942 return SDValue();
49943}
49944
49945static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49946 TargetLowering::DAGCombinerInfo &DCI,
49947 const X86Subtarget &Subtarget) {
49948 SDValue N0 = N->getOperand(0);
49949 SDValue N1 = N->getOperand(1);
49950 EVT VT = N->getValueType(0);
49951 SDLoc dl(N);
49952 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49953
49954 // If this is SSE1 only convert to FAND to avoid scalarization.
49955 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49956 return DAG.getBitcast(MVT::v4i32,
49957 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49958 DAG.getBitcast(MVT::v4f32, N0),
49959 DAG.getBitcast(MVT::v4f32, N1)));
49960 }
49961
49962 // Use a 32-bit and+zext if upper bits known zero.
49963 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49964 APInt HiMask = APInt::getHighBitsSet(64, 32);
49965 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49966 DAG.MaskedValueIsZero(N0, HiMask)) {
49967 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49968 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49969 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49970 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49971 }
49972 }
49973
49974 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49975 // TODO: Support multiple SrcOps.
49976 if (VT == MVT::i1) {
49977 SmallVector<SDValue, 2> SrcOps;
49978 SmallVector<APInt, 2> SrcPartials;
49979 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49980 SrcOps.size() == 1) {
49981 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49982 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49983 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49984 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49985 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49986 if (Mask) {
49987 assert(SrcPartials[0].getBitWidth() == NumElts &&
49988 "Unexpected partial reduction mask");
49989 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49990 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49991 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49992 }
49993 }
49994 }
49995
49996 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49997 return V;
49998
49999 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50000 return R;
50001
50002 if (SDValue R = combineBitOpWithShift(N, DAG))
50003 return R;
50004
50005 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50006 return FPLogic;
50007
50008 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50009 return R;
50010
50011 if (DCI.isBeforeLegalizeOps())
50012 return SDValue();
50013
50014 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50015 return R;
50016
50017 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50018 return R;
50019
50020 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50021 return ShiftRight;
50022
50023 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50024 return R;
50025
50026 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50027 // iff c2 is an all-bits/no-bits per-element mask, i.e. a select-with-zero mask.
50028 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50029 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50030 unsigned Opc0 = N0.getOpcode();
50031 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50032 getTargetConstantFromNode(N0.getOperand(1)) &&
50033 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50034 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50035 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50036 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50037 }
50038 }
50039
50040 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
50041 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
50042 if (isOneConstant(N1) && N0->hasOneUse()) {
50043 SDValue Src = N0;
50044 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50045 Src.getOpcode() == ISD::TRUNCATE) &&
50046 Src.getOperand(0)->hasOneUse())
50047 Src = Src.getOperand(0);
50048 bool ContainsNOT = false;
50049 X86::CondCode X86CC = X86::COND_B;
50050 // Peek through AND(NOT(SRL(X,Y)),1).
50051 if (isBitwiseNot(Src)) {
50052 Src = Src.getOperand(0);
50053 X86CC = X86::COND_AE;
50054 ContainsNOT = true;
50055 }
50056 if (Src.getOpcode() == ISD::SRL &&
50057 !isa<ConstantSDNode>(Src.getOperand(1))) {
50058 SDValue BitNo = Src.getOperand(1);
50059 Src = Src.getOperand(0);
50060 // Peek through AND(SRL(NOT(X),Y),1).
50061 if (isBitwiseNot(Src)) {
50062 Src = Src.getOperand(0);
50063 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50064 ContainsNOT = true;
50065 }
50066 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50067 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50068 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50069 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50070 }
50071 }
50072
50073 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50074 // Attempt to recursively combine a bitmask AND with shuffles.
50075 SDValue Op(N, 0);
50076 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50077 return Res;
50078
50079 // If either operand is a constant mask, then only the elements that aren't
50080 // zero are actually demanded by the other operand.
50081 auto GetDemandedMasks = [&](SDValue Op) {
50082 APInt UndefElts;
50083 SmallVector<APInt> EltBits;
50084 int NumElts = VT.getVectorNumElements();
50085 int EltSizeInBits = VT.getScalarSizeInBits();
50086 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50087 APInt DemandedElts = APInt::getAllOnes(NumElts);
50088 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50089 EltBits)) {
50090 DemandedBits.clearAllBits();
50091 DemandedElts.clearAllBits();
50092 for (int I = 0; I != NumElts; ++I) {
50093 if (UndefElts[I]) {
50094 // We can't assume an undef src element gives an undef dst - the
50095 // other src might be zero.
50096 DemandedBits.setAllBits();
50097 DemandedElts.setBit(I);
50098 } else if (!EltBits[I].isZero()) {
50099 DemandedBits |= EltBits[I];
50100 DemandedElts.setBit(I);
50101 }
50102 }
50103 }
50104 return std::make_pair(DemandedBits, DemandedElts);
50105 };
50106 APInt Bits0, Elts0;
50107 APInt Bits1, Elts1;
50108 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50109 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50110
50111 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50112 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50113 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50114 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50115 if (N->getOpcode() != ISD::DELETED_NODE)
50116 DCI.AddToWorklist(N);
50117 return SDValue(N, 0);
50118 }
50119
50120 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50121 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50122 if (NewN0 || NewN1)
50123 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50124 NewN1 ? NewN1 : N1);
50125 }
50126
50127 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50128 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50129 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50130 isa<ConstantSDNode>(N0.getOperand(1))) {
50131 SDValue BitMask = N1;
50132 SDValue SrcVec = N0.getOperand(0);
50133 EVT SrcVecVT = SrcVec.getValueType();
50134
50135 // Check that the constant bitmask masks whole bytes.
50136 APInt UndefElts;
50137 SmallVector<APInt, 64> EltBits;
50138 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50139 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50140 llvm::all_of(EltBits, [](const APInt &M) {
50141 return M.isZero() || M.isAllOnes();
50142 })) {
50143 unsigned NumElts = SrcVecVT.getVectorNumElements();
50144 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50145 unsigned Idx = N0.getConstantOperandVal(1);
50146
50147 // Create a root shuffle mask from the byte mask and the extracted index.
50148 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50149 for (unsigned i = 0; i != Scale; ++i) {
50150 if (UndefElts[i])
50151 continue;
50152 int VecIdx = Scale * Idx + i;
50153 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50154 }
50155
50156 if (SDValue Shuffle = combineX86ShufflesRecursively(
50157 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50158 X86::MaxShuffleCombineDepth,
50159 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50160 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50162 N0.getOperand(1));
50163 }
50164 }
50165
50166 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50167 return R;
50168
50169 return SDValue();
50170}
50171
50172// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50173static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50174 const X86Subtarget &Subtarget) {
50175 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50176
50177 MVT VT = N->getSimpleValueType(0);
50178 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50179 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50180 return SDValue();
50181
50182 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50183 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50184 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50185 return SDValue();
50186
50187 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50188 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50189 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50190 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50191 return SDValue();
50192
50193 // Attempt to extract constant byte masks.
50194 APInt UndefElts0, UndefElts1;
50195 SmallVector<APInt, 32> EltBits0, EltBits1;
50196 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50197 false, false))
50198 return SDValue();
50199 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50200 false, false))
50201 return SDValue();
50202
50203 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50204 // TODO - add UNDEF elts support.
50205 if (UndefElts0[i] || UndefElts1[i])
50206 return SDValue();
50207 if (EltBits0[i] != ~EltBits1[i])
50208 return SDValue();
50209 }
50210
50211 SDLoc DL(N);
50212
50213 if (useVPTERNLOG(Subtarget, VT)) {
50214 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50215 // VPTERNLOG is only available as vXi32/64-bit types.
50216 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50217 MVT OpVT =
50218 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50219 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50220 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50221 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50222 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50223 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50224 DAG, Subtarget);
50225 return DAG.getBitcast(VT, Res);
50226 }
50227
50228 SDValue X = N->getOperand(0);
50229 SDValue Y =
50230 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50231 DAG.getBitcast(VT, N1.getOperand(0)));
50232 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50233}
50234
50235// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50236static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50237 if (N->getOpcode() != ISD::OR)
50238 return false;
50239
50240 SDValue N0 = N->getOperand(0);
50241 SDValue N1 = N->getOperand(1);
50242
50243 // Canonicalize AND to LHS.
50244 if (N1.getOpcode() == ISD::AND)
50245 std::swap(N0, N1);
50246
50247 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50248 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50249 return false;
50250
50251 Mask = N1.getOperand(0);
50252 X = N1.getOperand(1);
50253
50254 // Check to see if the mask appeared in both the AND and ANDNP.
50255 if (N0.getOperand(0) == Mask)
50256 Y = N0.getOperand(1);
50257 else if (N0.getOperand(1) == Mask)
50258 Y = N0.getOperand(0);
50259 else
50260 return false;
50261
50262 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
50263 // ANDNP combine allows other combines to happen that prevent matching.
50264 return true;
50265}
50266
50267// Try to fold:
50268// (or (and (m, y), (pandn m, x)))
50269// into:
50270// (vselect m, x, y)
50271// As a special case, try to fold:
50272// (or (and (m, (sub 0, x)), (pandn m, x)))
50273// into:
50274// (sub (xor X, M), M)
50275static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50276 const X86Subtarget &Subtarget) {
50277 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50278
50279 EVT VT = N->getValueType(0);
50280 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50281 (VT.is256BitVector() && Subtarget.hasInt256())))
50282 return SDValue();
50283
50284 SDValue X, Y, Mask;
50285 if (!matchLogicBlend(N, X, Y, Mask))
50286 return SDValue();
50287
50288 // Validate that X, Y, and Mask are bitcasts, and see through them.
50289 Mask = peekThroughBitcasts(Mask);
50290 X = peekThroughBitcasts(X);
50291 Y = peekThroughBitcasts(Y);
50292
50293 EVT MaskVT = Mask.getValueType();
50294 unsigned EltBits = MaskVT.getScalarSizeInBits();
50295
50296 // TODO: Attempt to handle floating point cases as well?
50297 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50298 return SDValue();
50299
50300 SDLoc DL(N);
50301
50302 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50303 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50304 DAG, Subtarget))
50305 return Res;
50306
50307 // PBLENDVB is only available on SSE 4.1.
50308 if (!Subtarget.hasSSE41())
50309 return SDValue();
50310
50311 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50312 if (Subtarget.hasVLX())
50313 return SDValue();
50314
50315 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50316
50317 X = DAG.getBitcast(BlendVT, X);
50318 Y = DAG.getBitcast(BlendVT, Y);
50319 Mask = DAG.getBitcast(BlendVT, Mask);
50320 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50321 return DAG.getBitcast(VT, Mask);
50322}
50323
50324// Helper function for combineOrCmpEqZeroToCtlzSrl
50325// Transforms:
50326// seteq(cmp x, 0)
50327// into:
50328// srl(ctlz x), log2(bitsize(x))
50329// Input pattern is checked by caller.
50330static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50331 SDValue Cmp = Op.getOperand(1);
50332 EVT VT = Cmp.getOperand(0).getValueType();
50333 unsigned Log2b = Log2_32(VT.getSizeInBits());
50334 SDLoc dl(Op);
50335 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50336 // The result of the shift is true or false, and on X86, the 32-bit
50337 // encoding of shr and lzcnt is more desirable.
50338 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50339 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50340 DAG.getConstant(Log2b, dl, MVT::i8));
50341 return Scc;
50342}
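// Illustrative example of the helper above (a sketch, assuming a 32-bit
// operand):
//   seteq(cmp x, 0) --> srl(ctlz x), 5
// since with LZCNT, ctlz(x) is 32 exactly when x == 0, and bit 5 of the count
// is set only in that case.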
50343
50344// Try to transform:
50345// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50346// into:
50347// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50348// Will also attempt to match more generic cases, e.g.:
50349// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50350// Only applies if the target supports the FastLZCNT feature.
50351static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50352 TargetLowering::DAGCombinerInfo &DCI,
50353 const X86Subtarget &Subtarget) {
50354 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50355 return SDValue();
50356
50357 auto isORCandidate = [](SDValue N) {
50358 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50359 };
50360
50361 // Check that the zero extend is extending to 32 bits or more. The code generated by
50362 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50363 // instructions to clear the upper bits.
50364 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50365 !isORCandidate(N->getOperand(0)))
50366 return SDValue();
50367
50368 // Check the node matches: setcc(eq, cmp 0)
50369 auto isSetCCCandidate = [](SDValue N) {
50370 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50371 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50372 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50373 isNullConstant(N->getOperand(1).getOperand(1)) &&
50374 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50375 };
50376
50377 SDNode *OR = N->getOperand(0).getNode();
50378 SDValue LHS = OR->getOperand(0);
50379 SDValue RHS = OR->getOperand(1);
50380
50381 // Save nodes matching or(or, setcc(eq, cmp 0)).
50382 SmallVector<SDNode *, 2> ORNodes;
50383 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50384 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50385 ORNodes.push_back(OR);
50386 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50387 LHS = OR->getOperand(0);
50388 RHS = OR->getOperand(1);
50389 }
50390
50391 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50392 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50393 !isORCandidate(SDValue(OR, 0)))
50394 return SDValue();
50395
50396 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
50397 // to
50398 // or(srl(ctlz),srl(ctlz)).
50399 // The dag combiner can then fold it into:
50400 // srl(or(ctlz, ctlz)).
50401 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50402 SDValue Ret, NewRHS;
50403 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50404 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50405
50406 if (!Ret)
50407 return SDValue();
50408
50409 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50410 while (!ORNodes.empty()) {
50411 OR = ORNodes.pop_back_val();
50412 LHS = OR->getOperand(0);
50413 RHS = OR->getOperand(1);
50414 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50415 if (RHS->getOpcode() == ISD::OR)
50416 std::swap(LHS, RHS);
50417 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50418 if (!NewRHS)
50419 return SDValue();
50420 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50421 }
50422
50423 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50424}
50425
50426static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50427 SDValue And1_L, SDValue And1_R,
50428 const SDLoc &DL, SelectionDAG &DAG) {
50429 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50430 return SDValue();
50431 SDValue NotOp = And0_L->getOperand(0);
50432 if (NotOp == And1_R)
50433 std::swap(And1_R, And1_L);
50434 if (NotOp != And1_L)
50435 return SDValue();
50436
50437 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50438 // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R
50439 EVT VT = And1_L->getValueType(0);
50440 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50441 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50442 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50443 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50444 return Xor1;
50445}
50446
50447/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50448/// equivalent `((x ^ y) & m) ^ y` pattern.
50449/// This is typically a better representation for targets without a fused
50450/// "and-not" operation. This function is intended to be called from a
50451/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
50452static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50453 // Note that masked-merge variants using XOR or ADD expressions are
50454 // normalized to OR by InstCombine so we only check for OR.
50455 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50456 SDValue N0 = Node->getOperand(0);
50457 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50458 return SDValue();
50459 SDValue N1 = Node->getOperand(1);
50460 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50461 return SDValue();
50462
50463 SDLoc DL(Node);
50464 SDValue N00 = N0->getOperand(0);
50465 SDValue N01 = N0->getOperand(1);
50466 SDValue N10 = N1->getOperand(0);
50467 SDValue N11 = N1->getOperand(1);
50468 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50469 return Result;
50470 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50471 return Result;
50472 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50473 return Result;
50474 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50475 return Result;
50476 return SDValue();
50477}
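// Illustrative C equivalent of the masked-merge fold above (hypothetical
// names): without ANDN,
//   (M & X) | (~M & Y)        // and + not + and + or
// is rewritten as
//   ((X ^ Y) & M) ^ Y         // xor + and + xor
// which needs one fewer instruction.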
50478
50479/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50480/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50481/// with CMP+{ADC, SBB}.
50482/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
50483static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50484 SDValue X, SDValue Y,
50485 SelectionDAG &DAG,
50486 bool ZeroSecondOpOnly = false) {
50487 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50488 return SDValue();
50489
50490 // Look through a one-use zext.
50491 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50492 Y = Y.getOperand(0);
50493
50494 X86::CondCode CC;
50495 SDValue EFLAGS;
50496 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50497 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50498 EFLAGS = Y.getOperand(1);
50499 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50500 Y.hasOneUse()) {
50501 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50502 }
50503
50504 if (!EFLAGS)
50505 return SDValue();
50506
50507 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50508 // the general case below.
50509 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50510 if (ConstantX && !ZeroSecondOpOnly) {
50511 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50512 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50513 // This is a complicated way to get -1 or 0 from the carry flag:
50514 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50515 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50516 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50517 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50518 EFLAGS);
50519 }
50520
50521 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50522 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50523 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50524 EFLAGS.getValueType().isInteger() &&
50525 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50526 // Swap the operands of a SUB, and we have the same pattern as above.
50527 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50528 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50529 SDValue NewSub = DAG.getNode(
50530 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50531 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50532 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50533 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50534 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50535 NewEFLAGS);
50536 }
50537 }
50538 }
50539
50540 if (CC == X86::COND_B) {
50541 // X + SETB Z --> adc X, 0
50542 // X - SETB Z --> sbb X, 0
50543 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50544 DAG.getVTList(VT, MVT::i32), X,
50545 DAG.getConstant(0, DL, VT), EFLAGS);
50546 }
50547
50548 if (ZeroSecondOpOnly)
50549 return SDValue();
50550
50551 if (CC == X86::COND_A) {
50552 // Try to convert COND_A into COND_B in an attempt to facilitate
50553 // materializing "setb reg".
50554 //
50555 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
50556 // cannot take an immediate as its first operand.
50557 //
50558 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50559 EFLAGS.getValueType().isInteger() &&
50560 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50561 SDValue NewSub =
50562 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50563 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50564 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50565 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50566 DAG.getVTList(VT, MVT::i32), X,
50567 DAG.getConstant(0, DL, VT), NewEFLAGS);
50568 }
50569 }
50570
50571 if (CC == X86::COND_AE) {
50572 // X + SETAE --> sbb X, -1
50573 // X - SETAE --> adc X, -1
50574 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50575 DAG.getVTList(VT, MVT::i32), X,
50576 DAG.getConstant(-1, DL, VT), EFLAGS);
50577 }
50578
50579 if (CC == X86::COND_BE) {
50580 // X + SETBE --> sbb X, -1
50581 // X - SETBE --> adc X, -1
50582 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50583 // materializing "setae reg".
50584 //
50585 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
50586 // cannot take an immediate as its first operand.
50587 //
50588 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50589 EFLAGS.getValueType().isInteger() &&
50590 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50591 SDValue NewSub =
50592 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50593 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50594 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50595 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50596 DAG.getVTList(VT, MVT::i32), X,
50597 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50598 }
50599 }
50600
50601 if (CC != X86::COND_E && CC != X86::COND_NE)
50602 return SDValue();
50603
50604 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50605 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50606 !EFLAGS.getOperand(0).getValueType().isInteger())
50607 return SDValue();
50608
50609 SDValue Z = EFLAGS.getOperand(0);
50610 EVT ZVT = Z.getValueType();
50611
50612 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50613 // the general case below.
50614 if (ConstantX) {
50615 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50616 // fake operands:
50617 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50618 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50619 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50620 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50621 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50622 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50623 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50624 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50625 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50626 SDValue(Neg.getNode(), 1));
50627 }
50628
50629 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50630 // with fake operands:
50631 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50632 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50633 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50634 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50635 SDValue One = DAG.getConstant(1, DL, ZVT);
50636 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50637 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50638 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50639 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50640 Cmp1.getValue(1));
50641 }
50642 }
50643
50644 // (cmp Z, 1) sets the carry flag if Z is 0.
50645 SDValue One = DAG.getConstant(1, DL, ZVT);
50646 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50647 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50648
50649 // Add the flags type for ADC/SBB nodes.
50650 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50651
50652 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50653 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50654 if (CC == X86::COND_NE)
50655 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50656 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50657
50658 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50659 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50660 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50661 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50662}
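// Illustrative example of the COND_NE case above (a sketch, i32):
//   X + (Z != 0) --> sbb X, -1, (cmp Z, 1)
// (cmp Z, 1) sets the carry flag only when Z == 0, so the SBB computes
// X - (-1) - CF, i.e. X + 1 when Z != 0 and X when Z == 0.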
50663
50664/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50665/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50666/// with CMP+{ADC, SBB}.
50667static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50668 bool IsSub = N->getOpcode() == ISD::SUB;
50669 SDValue X = N->getOperand(0);
50670 SDValue Y = N->getOperand(1);
50671 EVT VT = N->getValueType(0);
50672 SDLoc DL(N);
50673
50674 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50675 return ADCOrSBB;
50676
50677 // Commute and try again (negate the result for subtracts).
50678 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50679 if (IsSub)
50680 ADCOrSBB =
50681 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50682 return ADCOrSBB;
50683 }
50684
50685 return SDValue();
50686}
50687
50688static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50689 SelectionDAG &DAG) {
50690 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50691 "Unexpected opcode");
50692
50693 // Delegate to combineAddOrSubToADCOrSBB if we have:
50694 //
50695 // (xor/or (zero_extend (setcc)) imm)
50696 //
50697 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50698 // equivalent to a SUB/ADD, respectively.
50699 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50700 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50701 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50702 bool IsSub = N->getOpcode() == ISD::XOR;
50703 bool N1COdd = N1C->getZExtValue() & 1;
50704 if (IsSub ? N1COdd : !N1COdd) {
50705 SDLoc DL(N);
50706 EVT VT = N->getValueType(0);
50707 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50708 return R;
50709 }
50710 }
50711 }
50712
50713 return SDValue();
50714}
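// Illustrative example of the delegation above (a sketch): with an odd
// immediate, (xor (zext (setcc)) 1) equals 1 - (zext (setcc)), so it is passed
// to combineAddOrSubToADCOrSBB as a subtraction with X = 1.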
50715
50716static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50717 TargetLowering::DAGCombinerInfo &DCI,
50718 const X86Subtarget &Subtarget) {
50719 SDValue N0 = N->getOperand(0);
50720 SDValue N1 = N->getOperand(1);
50721 EVT VT = N->getValueType(0);
50722 SDLoc dl(N);
50723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50724
50725 // If this is SSE1 only convert to FOR to avoid scalarization.
50726 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50727 return DAG.getBitcast(MVT::v4i32,
50728 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50729 DAG.getBitcast(MVT::v4f32, N0),
50730 DAG.getBitcast(MVT::v4f32, N1)));
50731 }
50732
50733 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50734 // TODO: Support multiple SrcOps.
50735 if (VT == MVT::i1) {
50736 SmallVector<SDValue, 2> SrcOps;
50737 SmallVector<APInt, 2> SrcPartials;
50738 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50739 SrcOps.size() == 1) {
50740 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50741 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50742 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50743 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50744 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50745 if (Mask) {
50746 assert(SrcPartials[0].getBitWidth() == NumElts &&
50747 "Unexpected partial reduction mask");
50748 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50749 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50750 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50751 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50752 }
50753 }
50754 }
50755
50756 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50757 return R;
50758
50759 if (SDValue R = combineBitOpWithShift(N, DAG))
50760 return R;
50761
50762 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50763 return FPLogic;
50764
50765 if (DCI.isBeforeLegalizeOps())
50766 return SDValue();
50767
50768 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50769 return R;
50770
50771 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50772 return R;
50773
50774 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50775 return R;
50776
50777 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50778 if ((VT == MVT::i32 || VT == MVT::i64) &&
50779 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50780 isNullConstant(N0.getOperand(0))) {
50781 SDValue Cond = N0.getOperand(1);
50782 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50783 Cond = Cond.getOperand(0);
50784
50785 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50786 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50787 uint64_t Val = CN->getZExtValue();
50788 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50789 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50790 CCode = X86::GetOppositeBranchCondition(CCode);
50791 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50792
50793 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50794 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50795 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50796 return R;
50797 }
50798 }
50799 }
50800 }
50801
50802 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50803 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50804 // iff the upper elements of the non-shifted arg are zero.
50805 // KUNPCK requires 16+ bool vector elements.
50806 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50807 unsigned NumElts = VT.getVectorNumElements();
50808 unsigned HalfElts = NumElts / 2;
50809 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50810 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50811 N1.getConstantOperandAPInt(1) == HalfElts &&
50812 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50813 return DAG.getNode(
50814 ISD::CONCAT_VECTORS, dl, VT,
50815 extractSubVector(N0, 0, DAG, dl, HalfElts),
50816 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50817 }
50818 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50819 N0.getConstantOperandAPInt(1) == HalfElts &&
50820 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50821 return DAG.getNode(
50822 ISD::CONCAT_VECTORS, dl, VT,
50823 extractSubVector(N1, 0, DAG, dl, HalfElts),
50824 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50825 }
50826 }
50827
50828 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50829 // Attempt to recursively combine an OR of shuffles.
50830 SDValue Op(N, 0);
50831 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50832 return Res;
50833
50834 // If either operand is a constant mask, then only the elements that aren't
50835 // allones are actually demanded by the other operand.
50836 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50837 APInt UndefElts;
50838 SmallVector<APInt> EltBits;
50839 int NumElts = VT.getVectorNumElements();
50840 int EltSizeInBits = VT.getScalarSizeInBits();
50841 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50842 return false;
50843
50844 APInt DemandedElts = APInt::getZero(NumElts);
50845 for (int I = 0; I != NumElts; ++I)
50846 if (!EltBits[I].isAllOnes())
50847 DemandedElts.setBit(I);
50848
50849 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50850 };
50851 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50852 if (N->getOpcode() != ISD::DELETED_NODE)
50853 DCI.AddToWorklist(N);
50854 return SDValue(N, 0);
50855 }
50856 }
50857
50858 // We should fold "masked merge" patterns when `andn` is not available.
50859 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50860 if (SDValue R = foldMaskedMerge(N, DAG))
50861 return R;
50862
50863 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50864 return R;
50865
50866 return SDValue();
50867}
50868
50869/// Try to turn tests against the signbit in the form of:
50870/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50871/// into:
50872/// SETGT(X, -1)
50873static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50874 // This is only worth doing if the output type is i8 or i1.
50875 EVT ResultType = N->getValueType(0);
50876 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50877 return SDValue();
50878
50879 SDValue N0 = N->getOperand(0);
50880 SDValue N1 = N->getOperand(1);
50881
50882 // We should be performing an xor against a truncated shift.
50883 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50884 return SDValue();
50885
50886 // Make sure we are performing an xor against one.
50887 if (!isOneConstant(N1))
50888 return SDValue();
50889
50890 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50891 SDValue Shift = N0.getOperand(0);
50892 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50893 return SDValue();
50894
50895 // Make sure we are truncating from one of i16, i32 or i64.
50896 EVT ShiftTy = Shift.getValueType();
50897 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50898 return SDValue();
50899
50900 // Make sure the shift amount extracts the sign bit.
50901 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50902 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50903 return SDValue();
50904
50905 // Create a greater-than comparison against -1.
50906 // N.B. Using SETGE against 0 works but we want a canonical-looking
50907 // comparison; using SETGT matches up with what TranslateX86CC expects.
50908 SDLoc DL(N);
50909 SDValue ShiftOp = Shift.getOperand(0);
50910 EVT ShiftOpTy = ShiftOp.getValueType();
50911 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50912 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50913 *DAG.getContext(), ResultType);
50914 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50915 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50916 if (SetCCResultType != ResultType)
50917 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50918 return Cond;
50919}
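// Illustrative example of the fold above (a sketch, i32 source, i8 result):
//   (xor (truncate (srl X, 31)), 1) --> (setgt X, -1)
// i.e. "the sign bit of X is clear", removing the shift/truncate/xor chain.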
50920
50921/// Turn vector tests of the signbit in the form of:
50922/// xor (sra X, elt_size(X)-1), -1
50923/// into:
50924/// pcmpgt X, -1
50925///
50926/// This should be called before type legalization because the pattern may not
50927/// persist after that.
50928static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50929 const X86Subtarget &Subtarget) {
50930 EVT VT = N->getValueType(0);
50931 if (!VT.isSimple())
50932 return SDValue();
50933
50934 switch (VT.getSimpleVT().SimpleTy) {
50935 default: return SDValue();
50936 case MVT::v16i8:
50937 case MVT::v8i16:
50938 case MVT::v4i32:
50939 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50940 case MVT::v32i8:
50941 case MVT::v16i16:
50942 case MVT::v8i32:
50943 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50944 }
50945
50946 // There must be an arithmetic shift right (sra) before the xor, and the xor must be a
50947 // 'not' operation.
50948 SDValue Shift = N->getOperand(0);
50949 SDValue Ones = N->getOperand(1);
50950 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50951 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50952 return SDValue();
50953
50954 // The shift should be smearing the sign bit across each vector element.
50955 auto *ShiftAmt =
50956 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50957 if (!ShiftAmt ||
50958 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50959 return SDValue();
50960
50961 // Create a greater-than comparison against -1. We don't use the more obvious
50962 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50963 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50964}
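// Editor's note: a hedged sketch, not in the original file, of the vector form
// of the same fold using SSE2 intrinsics; assumes <emmintrin.h> is available.
static __m128i signBitTestViaXor(__m128i X) {
  // xor (sra X, 31), -1  -- the pattern matched above
  return _mm_xor_si128(_mm_srai_epi32(X, 31), _mm_set1_epi32(-1));
}
static __m128i signBitTestViaPcmpgt(__m128i X) {
  // pcmpgt X, -1  -- the single instruction produced instead
  return _mm_cmpgt_epi32(X, _mm_set1_epi32(-1));
}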
50965
50966/// Detect patterns of truncation with unsigned saturation:
50967///
50968/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50969/// Return the source value x to be truncated or SDValue() if the pattern was
50970/// not matched.
50971///
50972/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50973/// where C1 >= 0 and C2 is unsigned max of destination type.
50974///
50975/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50976/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50977///
50978/// These two patterns are equivalent to:
50979/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50980/// So return the smax(x, C1) value to be truncated or SDValue() if the
50981/// pattern was not matched.
50982static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50983 const SDLoc &DL) {
50984 EVT InVT = In.getValueType();
50985
50986 // Saturation with truncation. We truncate from InVT to VT.
50987  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
50988         "Unexpected types for truncate operation");
50989
50990 // Match min/max and return limit value as a parameter.
50991 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50992 if (V.getOpcode() == Opcode &&
50993 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50994 return V.getOperand(0);
50995 return SDValue();
50996 };
50997
50998 APInt C1, C2;
50999 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51000    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
51001    // the element size of the destination type.
51002 if (C2.isMask(VT.getScalarSizeInBits()))
51003 return UMin;
51004
51005 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51006 if (MatchMinMax(SMin, ISD::SMAX, C1))
51007 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51008 return SMin;
51009
51010 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51011 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51012 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51013 C2.uge(C1)) {
51014 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51015 }
51016
51017 return SDValue();
51018}
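// Editor's note: a small sketch, not part of the original file, of the C2 check
// above for an i32 -> i8 truncate; the helper and its values are illustrative.
static bool usatLimitLooksRight(const APInt &C2) {
  // For an i32 -> i8 truncate the accepted umin splat is 0xFF, i.e. an 8-bit
  // low mask; e.g. APInt(32, 0xFF) passes while APInt(32, 0x7F) does not.
  return C2.isMask(8);
}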
51019
51020/// Detect patterns of truncation with signed saturation:
51021/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51022/// signed_max_of_dest_type)) to dest_type)
51023/// or:
51024/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51025/// signed_min_of_dest_type)) to dest_type).
51026/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51027/// Return the source value to be truncated or SDValue() if the pattern was not
51028/// matched.
51029static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51030 unsigned NumDstBits = VT.getScalarSizeInBits();
51031 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51032  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51033
51034 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51035 const APInt &Limit) -> SDValue {
51036 APInt C;
51037 if (V.getOpcode() == Opcode &&
51038 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51039 return V.getOperand(0);
51040 return SDValue();
51041 };
51042
51043 APInt SignedMax, SignedMin;
51044 if (MatchPackUS) {
51045 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51046 SignedMin = APInt(NumSrcBits, 0);
51047 } else {
51048 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51049 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51050 }
51051
51052 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51053 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51054 return SMax;
51055
51056 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51057 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51058 return SMin;
51059
51060 return SDValue();
51061}
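// Editor's note: an illustrative sketch, not in the original file, spelling out
// the limits computed above for an i32 -> i16 signed-saturating truncate.
static void ssatLimitsExampleI32ToI16(APInt &SignedMax, APInt &SignedMin) {
  SignedMax = APInt::getSignedMaxValue(16).sext(32); // 0x00007FFF
  SignedMin = APInt::getSignedMinValue(16).sext(32); // 0xFFFF8000
}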
51062
51063static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51064 SelectionDAG &DAG,
51065 const X86Subtarget &Subtarget) {
51066 if (!Subtarget.hasSSE2() || !VT.isVector())
51067 return SDValue();
51068
51069 EVT SVT = VT.getVectorElementType();
51070 EVT InVT = In.getValueType();
51071 EVT InSVT = InVT.getVectorElementType();
51072
51073  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51074  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51075  // and concatenate at the same time. Then we can use a final vpmovuswb to
51076  // clip to 0-255.
51077 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51078 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51079 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51080 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51081 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51082 DL, DAG, Subtarget);
51083      assert(Mid && "Failed to pack!");
51084 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51085 }
51086 }
51087
51088 // vXi32 truncate instructions are available with AVX512F.
51089 // vXi16 truncate instructions are only available with AVX512BW.
51090 // For 256-bit or smaller vectors, we require VLX.
51091 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51092  // If the result type is 256 bits or larger and we have disabled 512-bit
51093  // registers, we should go ahead and use the pack instructions if possible.
51094 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51095 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51096 (InVT.getSizeInBits() > 128) &&
51097 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51098 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51099
51100 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51101 VT.getSizeInBits() >= 64 &&
51102 (SVT == MVT::i8 || SVT == MVT::i16) &&
51103 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51104 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51105 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51106      // Only do this when the result is at least 64 bits or we'll be leaving
51107      // dangling PACKSSDW nodes.
51108 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51109 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51110 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51111 DAG, Subtarget);
51112        assert(Mid && "Failed to pack!");
51113 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51114 Subtarget);
51115        assert(V && "Failed to pack!");
51116 return V;
51117 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51118 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51119 Subtarget);
51120 }
51121 if (SDValue SSatVal = detectSSatPattern(In, VT))
51122 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51123 Subtarget);
51124 }
51125
51126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51127 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51128 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51129 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51130 unsigned TruncOpc = 0;
51131 SDValue SatVal;
51132 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51133 SatVal = SSatVal;
51134 TruncOpc = X86ISD::VTRUNCS;
51135 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51136 SatVal = USatVal;
51137 TruncOpc = X86ISD::VTRUNCUS;
51138 }
51139 if (SatVal) {
51140 unsigned ResElts = VT.getVectorNumElements();
51141 // If the input type is less than 512 bits and we don't have VLX, we need
51142 // to widen to 512 bits.
51143 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51144 unsigned NumConcats = 512 / InVT.getSizeInBits();
51145 ResElts *= NumConcats;
51146 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51147 ConcatOps[0] = SatVal;
51148 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51149 NumConcats * InVT.getVectorNumElements());
51150 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51151 }
51152      // Widen the result if it's narrower than 128 bits.
51153 if (ResElts * SVT.getSizeInBits() < 128)
51154 ResElts = 128 / SVT.getSizeInBits();
51155 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51156 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51157 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51158 DAG.getIntPtrConstant(0, DL));
51159 }
51160 }
51161
51162 return SDValue();
51163}
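// Editor's note: a numbers-only sketch, not in the original file, of the
// widening arithmetic above for a v4i32 -> v4i8 saturating truncate when VLX
// is unavailable; no DAG nodes are built here.
static void truncateWithSatWideningExample() {
  unsigned NumConcats = 512 / 128;   // v4i32 is 128 bits, so 4 concats to 512 bits
  unsigned ResElts = 4 * NumConcats; // VTRUNCS/VTRUNCUS then yields v16i8
  // 16 elements * 8 bits = 128 bits, so no extra result widening is needed and
  // the final EXTRACT_SUBVECTOR keeps the low v4i8.
  (void)NumConcats;
  (void)ResElts;
}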
51164
51165/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51166/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51167/// ISD::AVGCEILU (AVG) instruction.
51168static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51169 const X86Subtarget &Subtarget,
51170 const SDLoc &DL) {
51171 if (!VT.isVector())
51172 return SDValue();
51173 EVT InVT = In.getValueType();
51174 unsigned NumElems = VT.getVectorNumElements();
51175
51176 EVT ScalarVT = VT.getVectorElementType();
51177 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51178 return SDValue();
51179
51180  // InScalarVT is the intermediate type in the AVG pattern and it should be
51181  // wider than the original input type (i8/i16).
51182 EVT InScalarVT = InVT.getVectorElementType();
51183 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51184 return SDValue();
51185
51186 if (!Subtarget.hasSSE2())
51187 return SDValue();
51188
51189 // Detect the following pattern:
51190 //
51191 // %1 = zext <N x i8> %a to <N x i32>
51192 // %2 = zext <N x i8> %b to <N x i32>
51193 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51194 // %4 = add nuw nsw <N x i32> %3, %2
51195  // %5 = lshr <N x i32> %4, <i32 1 x N>
51196 // %6 = trunc <N x i32> %5 to <N x i8>
51197 //
51198 // In AVX512, the last instruction can also be a trunc store.
51199 if (In.getOpcode() != ISD::SRL)
51200 return SDValue();
51201
51202  // A lambda checking that the given SDValue is a constant vector and that
51203  // each element is in the range [Min, Max].
51204 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51205 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51206 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51207 });
51208 };
51209
51210 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51211 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51212 return MaxActiveBits <= ScalarVT.getSizeInBits();
51213 };
51214
51215 // Check if each element of the vector is right-shifted by one.
51216 SDValue LHS = In.getOperand(0);
51217 SDValue RHS = In.getOperand(1);
51218 if (!IsConstVectorInRange(RHS, 1, 1))
51219 return SDValue();
51220 if (LHS.getOpcode() != ISD::ADD)
51221 return SDValue();
51222
51223 // Detect a pattern of a + b + 1 where the order doesn't matter.
51224 SDValue Operands[3];
51225 Operands[0] = LHS.getOperand(0);
51226 Operands[1] = LHS.getOperand(1);
51227
51228 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51229 ArrayRef<SDValue> Ops) {
51230 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51231 };
51232
51233 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51234 for (SDValue &Op : Ops)
51235 if (Op.getValueType() != VT)
51236 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51237 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51238 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51239 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51240 if (NumElemsPow2 != NumElems) {
51241 for (SDValue &Op : Ops) {
51242 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51243 for (unsigned i = 0; i != NumElems; ++i) {
51244 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51245 EltsOfOp[i] =
51246 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51247 }
51248 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51249 }
51250 }
51251 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51252 if (NumElemsPow2 == NumElems)
51253 return Res;
51254 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51255 DAG.getIntPtrConstant(0, DL));
51256 };
51257
51258  // Take care of the case when one of the operands is a constant vector whose
51259  // elements are in the range [1, 256] (or [1, 65536] for i16 elements).
51260 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51261 IsZExtLike(Operands[0])) {
51262 // The pattern is detected. Subtract one from the constant vector, then
51263 // demote it and emit X86ISD::AVG instruction.
51264 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51265 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51266 return AVGSplitter({Operands[0], Operands[1]});
51267 }
51268
51269  // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
51270  // Match the 'or' case only if it's 'add-like', i.e. it can be replaced by an add.
51271 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51272 if (ISD::ADD == V.getOpcode()) {
51273 Op0 = V.getOperand(0);
51274 Op1 = V.getOperand(1);
51275 return true;
51276 }
51277 if (ISD::ZERO_EXTEND != V.getOpcode())
51278 return false;
51279 V = V.getOperand(0);
51280 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51281 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51282 return false;
51283 Op0 = V.getOperand(0);
51284 Op1 = V.getOperand(1);
51285 return true;
51286 };
51287
51288 SDValue Op0, Op1;
51289 if (FindAddLike(Operands[0], Op0, Op1))
51290 std::swap(Operands[0], Operands[1]);
51291 else if (!FindAddLike(Operands[1], Op0, Op1))
51292 return SDValue();
51293 Operands[2] = Op0;
51294 Operands[1] = Op1;
51295
51296 // Now we have three operands of two additions. Check that one of them is a
51297 // constant vector with ones, and the other two can be promoted from i8/i16.
51298 for (SDValue &Op : Operands) {
51299 if (!IsConstVectorInRange(Op, 1, 1))
51300 continue;
51301 std::swap(Op, Operands[2]);
51302
51303 // Check if Operands[0] and Operands[1] are results of type promotion.
51304 for (int j = 0; j < 2; ++j)
51305 if (Operands[j].getValueType() != VT)
51306 if (!IsZExtLike(Operands[j]))
51307 return SDValue();
51308
51309 // The pattern is detected, emit X86ISD::AVG instruction(s).
51310 return AVGSplitter({Operands[0], Operands[1]});
51311 }
51312
51313 return SDValue();
51314}
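// Editor's note: a scalar sketch, not in the original file, of the identity the
// AVG combine exploits; the widened addition keeps the +1 from wrapping, which
// is exactly what PAVGB/PAVGW (ISD::AVGCEILU) compute per lane.
static uint8_t avgCeilU8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((unsigned(A) + unsigned(B) + 1) >> 1);
}
// e.g. avgCeilU8(1, 2) == 2 and avgCeilU8(255, 255) == 255.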
51315
51316static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51317 TargetLowering::DAGCombinerInfo &DCI,
51318 const X86Subtarget &Subtarget) {
51319 LoadSDNode *Ld = cast<LoadSDNode>(N);
51320 EVT RegVT = Ld->getValueType(0);
51321 EVT MemVT = Ld->getMemoryVT();
51322 SDLoc dl(Ld);
51323 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51324
51325 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51326 // into two 16-byte operations. Also split non-temporal aligned loads on
51327 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51328 ISD::LoadExtType Ext = Ld->getExtensionType();
51329 unsigned Fast;
51330 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51331 Ext == ISD::NON_EXTLOAD &&
51332 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51333 Ld->getAlign() >= Align(16)) ||
51334 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51335 *Ld->getMemOperand(), &Fast) &&
51336 !Fast))) {
51337 unsigned NumElems = RegVT.getVectorNumElements();
51338 if (NumElems < 2)
51339 return SDValue();
51340
51341 unsigned HalfOffset = 16;
51342 SDValue Ptr1 = Ld->getBasePtr();
51343 SDValue Ptr2 =
51344 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51345 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51346 NumElems / 2);
51347 SDValue Load1 =
51348 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51349 Ld->getOriginalAlign(),
51350 Ld->getMemOperand()->getFlags());
51351 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51352 Ld->getPointerInfo().getWithOffset(HalfOffset),
51353 Ld->getOriginalAlign(),
51354 Ld->getMemOperand()->getFlags());
51355 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51356 Load1.getValue(1), Load2.getValue(1));
51357
51358 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51359 return DCI.CombineTo(N, NewVec, TF, true);
51360 }
51361
51362 // Bool vector load - attempt to cast to an integer, as we have good
51363 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51364 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51365 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51366 unsigned NumElts = RegVT.getVectorNumElements();
51367 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51368 if (TLI.isTypeLegal(IntVT)) {
51369 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51370 Ld->getPointerInfo(),
51371 Ld->getOriginalAlign(),
51372 Ld->getMemOperand()->getFlags());
51373 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51374 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51375 }
51376 }
51377
51378 // If we also broadcast this as a subvector to a wider type, then just extract
51379 // the lowest subvector.
51380 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51381 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51382 SDValue Ptr = Ld->getBasePtr();
51383 SDValue Chain = Ld->getChain();
51384 for (SDNode *User : Ptr->uses()) {
51385 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51386 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51387 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51388 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51389 MemVT.getSizeInBits() &&
51390 !User->hasAnyUseOfValue(1) &&
51391 User->getValueSizeInBits(0).getFixedValue() >
51392 RegVT.getFixedSizeInBits()) {
51393 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51394 RegVT.getSizeInBits());
51395 Extract = DAG.getBitcast(RegVT, Extract);
51396 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51397 }
51398 }
51399 }
51400
51401 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51402 unsigned AddrSpace = Ld->getAddressSpace();
51403 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51404 AddrSpace == X86AS::PTR32_UPTR) {
51405 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51406 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51407 SDValue Cast =
51408 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51409 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51410 Ld->getOriginalAlign(),
51411 Ld->getMemOperand()->getFlags());
51412 }
51413 }
51414
51415 return SDValue();
51416}
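// Editor's note: a hedged sketch, not in the original file, of the "split a
// slow 32-byte load into two 16-byte loads" shape above, written with AVX
// intrinsics; assumes <immintrin.h> is available.
static __m256 load256AsTwoHalves(const float *P) {
  __m128 Lo = _mm_loadu_ps(P);     // bytes 0..15
  __m128 Hi = _mm_loadu_ps(P + 4); // bytes 16..31
  return _mm256_insertf128_ps(_mm256_castps128_ps256(Lo), Hi, 1);
}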
51417
51418/// If V is a build vector of boolean constants and exactly one of those
51419/// constants is true, return the operand index of that true element.
51420/// Otherwise, return -1.
51421static int getOneTrueElt(SDValue V) {
51422 // This needs to be a build vector of booleans.
51423 // TODO: Checking for the i1 type matches the IR definition for the mask,
51424 // but the mask check could be loosened to i8 or other types. That might
51425  // also require checking more than 'allOnesValue'; e.g., the x86 HW
51426 // instructions only require that the MSB is set for each mask element.
51427 // The ISD::MSTORE comments/definition do not specify how the mask operand
51428 // is formatted.
51429 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51430 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51431 return -1;
51432
51433 int TrueIndex = -1;
51434 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51435 for (unsigned i = 0; i < NumElts; ++i) {
51436 const SDValue &Op = BV->getOperand(i);
51437 if (Op.isUndef())
51438 continue;
51439 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51440 if (!ConstNode)
51441 return -1;
51442 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51443 // If we already found a one, this is too many.
51444 if (TrueIndex >= 0)
51445 return -1;
51446 TrueIndex = i;
51447 }
51448 }
51449 return TrueIndex;
51450}
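// Editor's note: the same "exactly one true element" contract, restated on a
// plain bool array purely as an editor-added illustration (not in the file).
static int getOneTrueEltPlain(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Mask[i]) {
      if (TrueIndex >= 0)
        return -1; // a second true element disqualifies the mask
      TrueIndex = (int)i;
    }
  return TrueIndex;
}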
51451
51452/// Given a masked memory load/store operation, return true if it has one mask
51453/// bit set. If it has one mask bit set, then also return the memory address of
51454/// the scalar element to load/store, the vector index to insert/extract that
51455/// scalar element, and the alignment for the scalar memory access.
51456static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51457 SelectionDAG &DAG, SDValue &Addr,
51458 SDValue &Index, Align &Alignment,
51459 unsigned &Offset) {
51460 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51461 if (TrueMaskElt < 0)
51462 return false;
51463
51464 // Get the address of the one scalar element that is specified by the mask
51465 // using the appropriate offset from the base pointer.
51466 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51467 Offset = 0;
51468 Addr = MaskedOp->getBasePtr();
51469 if (TrueMaskElt != 0) {
51470 Offset = TrueMaskElt * EltVT.getStoreSize();
51471 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51472 SDLoc(MaskedOp));
51473 }
51474
51475 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51476 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51477 EltVT.getStoreSize());
51478 return true;
51479}
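// Editor's note: a worked example, editor-added and not in the file, of the
// address/alignment arithmetic above for a v8i32 masked op whose only set mask
// bit is element 5 and whose original alignment is 32 bytes.
static void oneTrueMaskedEltExample() {
  unsigned Offset = 5 * 4;                           // TrueMaskElt * store size of i32
  Align ScalarAlign = commonAlignment(Align(32), 4); // becomes Align(4)
  (void)Offset;
  (void)ScalarAlign;
}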
51480
51481/// If exactly one element of the mask is set for a non-extending masked load,
51482/// it is a scalar load and vector insert.
51483/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51484/// mask have already been optimized in IR, so we don't bother with those here.
51485static SDValue
51486reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51487 TargetLowering::DAGCombinerInfo &DCI,
51488 const X86Subtarget &Subtarget) {
51489  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51490 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51491 // However, some target hooks may need to be added to know when the transform
51492 // is profitable. Endianness would also have to be considered.
51493
51494 SDValue Addr, VecIndex;
51495 Align Alignment;
51496 unsigned Offset;
51497 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51498 return SDValue();
51499
51500 // Load the one scalar element that is specified by the mask using the
51501 // appropriate offset from the base pointer.
51502 SDLoc DL(ML);
51503 EVT VT = ML->getValueType(0);
51504 EVT EltVT = VT.getVectorElementType();
51505
51506 EVT CastVT = VT;
51507 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51508 EltVT = MVT::f64;
51509 CastVT = VT.changeVectorElementType(EltVT);
51510 }
51511
51512 SDValue Load =
51513 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51514 ML->getPointerInfo().getWithOffset(Offset),
51515 Alignment, ML->getMemOperand()->getFlags());
51516
51517 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51518
51519 // Insert the loaded element into the appropriate place in the vector.
51520 SDValue Insert =
51521 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51522 Insert = DAG.getBitcast(VT, Insert);
51523 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51524}
51525
51526static SDValue
51527combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51528 TargetLowering::DAGCombinerInfo &DCI) {
51529  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51530 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51531 return SDValue();
51532
51533 SDLoc DL(ML);
51534 EVT VT = ML->getValueType(0);
51535
51536 // If we are loading the first and last elements of a vector, it is safe and
51537 // always faster to load the whole vector. Replace the masked load with a
51538 // vector load and select.
51539 unsigned NumElts = VT.getVectorNumElements();
51540 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51541 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51542 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51543 if (LoadFirstElt && LoadLastElt) {
51544 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51545 ML->getMemOperand());
51546 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51547 ML->getPassThru());
51548 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51549 }
51550
51551 // Convert a masked load with a constant mask into a masked load and a select.
51552 // This allows the select operation to use a faster kind of select instruction
51553 // (for example, vblendvps -> vblendps).
51554
51555 // Don't try this if the pass-through operand is already undefined. That would
51556 // cause an infinite loop because that's what we're about to create.
51557 if (ML->getPassThru().isUndef())
51558 return SDValue();
51559
51560 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51561 return SDValue();
51562
51563 // The new masked load has an undef pass-through operand. The select uses the
51564 // original pass-through operand.
51565 SDValue NewML = DAG.getMaskedLoad(
51566 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51567 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51568 ML->getAddressingMode(), ML->getExtensionType());
51569 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51570 ML->getPassThru());
51571
51572 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51573}
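// Editor's note: a hedged sketch, not in the original file, of the "whole load
// plus blend" shape produced above, using AVX intrinsics; assumes
// <immintrin.h>. Loading the full vector is only safe because the first and
// last mask lanes are known to be set, so the whole range is dereferenceable.
static __m256 maskedLoadAsBlend(const float *P, __m256 PassThru, __m256 Mask) {
  __m256 Full = _mm256_loadu_ps(P);
  return _mm256_blendv_ps(PassThru, Full, Mask); // mask MSB set -> loaded lane
}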
51574
51575static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51576 TargetLowering::DAGCombinerInfo &DCI,
51577 const X86Subtarget &Subtarget) {
51578 auto *Mld = cast<MaskedLoadSDNode>(N);
51579
51580 // TODO: Expanding load with constant mask may be optimized as well.
51581 if (Mld->isExpandingLoad())
51582 return SDValue();
51583
51584 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51585 if (SDValue ScalarLoad =
51586 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51587 return ScalarLoad;
51588
51589 // TODO: Do some AVX512 subsets benefit from this transform?
51590 if (!Subtarget.hasAVX512())
51591 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51592 return Blend;
51593 }
51594
51595 // If the mask value has been legalized to a non-boolean vector, try to
51596 // simplify ops leading up to it. We only demand the MSB of each lane.
51597 SDValue Mask = Mld->getMask();
51598 if (Mask.getScalarValueSizeInBits() != 1) {
51599 EVT VT = Mld->getValueType(0);
51600 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51601 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51602 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51603 if (N->getOpcode() != ISD::DELETED_NODE)
51604 DCI.AddToWorklist(N);
51605 return SDValue(N, 0);
51606 }
51607 if (SDValue NewMask =
51608 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51609 return DAG.getMaskedLoad(
51610 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51611 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51612 Mld->getAddressingMode(), Mld->getExtensionType());
51613 }
51614
51615 return SDValue();
51616}
51617
51618/// If exactly one element of the mask is set for a non-truncating masked store,
51619/// it is a vector extract and scalar store.
51620/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51621/// mask have already been optimized in IR, so we don't bother with those here.
51622static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51623 SelectionDAG &DAG,
51624 const X86Subtarget &Subtarget) {
51625 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51626 // However, some target hooks may need to be added to know when the transform
51627 // is profitable. Endianness would also have to be considered.
51628
51629 SDValue Addr, VecIndex;
51630 Align Alignment;
51631 unsigned Offset;
51632 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51633 return SDValue();
51634
51635 // Extract the one scalar element that is actually being stored.
51636 SDLoc DL(MS);
51637 SDValue Value = MS->getValue();
51638 EVT VT = Value.getValueType();
51639 EVT EltVT = VT.getVectorElementType();
51640 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51641 EltVT = MVT::f64;
51642 EVT CastVT = VT.changeVectorElementType(EltVT);
51643 Value = DAG.getBitcast(CastVT, Value);
51644 }
51645 SDValue Extract =
51646 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51647
51648 // Store that element at the appropriate offset from the base pointer.
51649 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51650 MS->getPointerInfo().getWithOffset(Offset),
51651 Alignment, MS->getMemOperand()->getFlags());
51652}
51653
51654static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51655 TargetLowering::DAGCombinerInfo &DCI,
51656 const X86Subtarget &Subtarget) {
51657 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51658 if (Mst->isCompressingStore())
51659 return SDValue();
51660
51661 EVT VT = Mst->getValue().getValueType();
51662 SDLoc dl(Mst);
51663 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51664
51665 if (Mst->isTruncatingStore())
51666 return SDValue();
51667
51668 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51669 return ScalarStore;
51670
51671 // If the mask value has been legalized to a non-boolean vector, try to
51672 // simplify ops leading up to it. We only demand the MSB of each lane.
51673 SDValue Mask = Mst->getMask();
51674 if (Mask.getScalarValueSizeInBits() != 1) {
51675 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51676 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51677 if (N->getOpcode() != ISD::DELETED_NODE)
51678 DCI.AddToWorklist(N);
51679 return SDValue(N, 0);
51680 }
51681 if (SDValue NewMask =
51682 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51683 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51684 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51685 Mst->getMemoryVT(), Mst->getMemOperand(),
51686 Mst->getAddressingMode());
51687 }
51688
51689 SDValue Value = Mst->getValue();
51690 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51691 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51692 Mst->getMemoryVT())) {
51693 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51694 Mst->getBasePtr(), Mst->getOffset(), Mask,
51695 Mst->getMemoryVT(), Mst->getMemOperand(),
51696 Mst->getAddressingMode(), true);
51697 }
51698
51699 return SDValue();
51700}
51701
51702static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51703 TargetLowering::DAGCombinerInfo &DCI,
51704 const X86Subtarget &Subtarget) {
51705 StoreSDNode *St = cast<StoreSDNode>(N);
51706 EVT StVT = St->getMemoryVT();
51707 SDLoc dl(St);
51708 SDValue StoredVal = St->getValue();
51709 EVT VT = StoredVal.getValueType();
51710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51711
51712 // Convert a store of vXi1 into a store of iX and a bitcast.
51713 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51714 VT.getVectorElementType() == MVT::i1) {
51715
51716 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51717 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51718
51719 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51720 St->getPointerInfo(), St->getOriginalAlign(),
51721 St->getMemOperand()->getFlags());
51722 }
51723
51724 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51725 // This will avoid a copy to k-register.
51726 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51727 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51728 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51729 SDValue Val = StoredVal.getOperand(0);
51730 // We must store zeros to the unused bits.
51731 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51732 return DAG.getStore(St->getChain(), dl, Val,
51733 St->getBasePtr(), St->getPointerInfo(),
51734 St->getOriginalAlign(),
51735 St->getMemOperand()->getFlags());
51736 }
51737
51738 // Widen v2i1/v4i1 stores to v8i1.
51739 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51740 Subtarget.hasAVX512()) {
51741 unsigned NumConcats = 8 / VT.getVectorNumElements();
51742 // We must store zeros to the unused bits.
51743 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51744 Ops[0] = StoredVal;
51745 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51746 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51747 St->getPointerInfo(), St->getOriginalAlign(),
51748 St->getMemOperand()->getFlags());
51749 }
51750
51751 // Turn vXi1 stores of constants into a scalar store.
51752 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51753 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51754 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51755    // If it's a v64i1 store without 64-bit support, we need two stores.
51756 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51757 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51758 StoredVal->ops().slice(0, 32));
51759 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51760 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51761 StoredVal->ops().slice(32, 32));
51762 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51763
51764 SDValue Ptr0 = St->getBasePtr();
51765 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
51766
51767 SDValue Ch0 =
51768 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51769 St->getOriginalAlign(),
51770 St->getMemOperand()->getFlags());
51771 SDValue Ch1 =
51772 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51773 St->getPointerInfo().getWithOffset(4),
51774 St->getOriginalAlign(),
51775 St->getMemOperand()->getFlags());
51776 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51777 }
51778
51779 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51780 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51781 St->getPointerInfo(), St->getOriginalAlign(),
51782 St->getMemOperand()->getFlags());
51783 }
51784
51785 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51786 // Sandy Bridge, perform two 16-byte stores.
51787 unsigned Fast;
51788 if (VT.is256BitVector() && StVT == VT &&
51789 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51790 *St->getMemOperand(), &Fast) &&
51791 !Fast) {
51792 unsigned NumElems = VT.getVectorNumElements();
51793 if (NumElems < 2)
51794 return SDValue();
51795
51796 return splitVectorStore(St, DAG);
51797 }
51798
51799 // Split under-aligned vector non-temporal stores.
51800 if (St->isNonTemporal() && StVT == VT &&
51801 St->getAlign().value() < VT.getStoreSize()) {
51802 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51803 // vectors or the legalizer can scalarize it to use MOVNTI.
51804 if (VT.is256BitVector() || VT.is512BitVector()) {
51805 unsigned NumElems = VT.getVectorNumElements();
51806 if (NumElems < 2)
51807 return SDValue();
51808 return splitVectorStore(St, DAG);
51809 }
51810
51811 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51812 // to use MOVNTI.
51813 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51814 MVT NTVT = Subtarget.hasSSE4A()
51815 ? MVT::v2f64
51816 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51817 return scalarizeVectorStore(St, NTVT, DAG);
51818 }
51819 }
51820
51821  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51822  // supported, but AVX512F is, by extending to v16i32 and truncating.
51823 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51824 St->getValue().getOpcode() == ISD::TRUNCATE &&
51825 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51826 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51827 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51828 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51829 St->getValue().getOperand(0));
51830 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51831 MVT::v16i8, St->getMemOperand());
51832 }
51833
51834 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51835 if (!St->isTruncatingStore() &&
51836 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51837 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51838 StoredVal.hasOneUse() &&
51839 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51840 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51841 return EmitTruncSStore(IsSigned, St->getChain(),
51842 dl, StoredVal.getOperand(0), St->getBasePtr(),
51843 VT, St->getMemOperand(), DAG);
51844 }
51845
51846  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51847 if (!St->isTruncatingStore()) {
51848 auto IsExtractedElement = [](SDValue V) {
51849 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51850 V = V.getOperand(0);
51851 unsigned Opc = V.getOpcode();
51852 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51853 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51854 V.getOperand(0).hasOneUse())
51855 return V.getOperand(0);
51856 return SDValue();
51857 };
51858 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51859 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51860 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51861 SDValue Src = Trunc.getOperand(0);
51862 MVT DstVT = Trunc.getSimpleValueType();
51863 MVT SrcVT = Src.getSimpleValueType();
51864 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51865 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51866 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51867 if (NumTruncBits == VT.getSizeInBits() &&
51868 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51869 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51870 TruncVT, St->getMemOperand());
51871 }
51872 }
51873 }
51874 }
51875
51876 // Optimize trunc store (of multiple scalars) to shuffle and store.
51877 // First, pack all of the elements in one place. Next, store to memory
51878 // in fewer chunks.
51879 if (St->isTruncatingStore() && VT.isVector()) {
51880    // Check if we can detect an AVG pattern from the truncation. If yes,
51881    // replace the trunc store with a normal store of the result of the
51882    // X86ISD::AVG instruction.
51883 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51884 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51885 Subtarget, dl))
51886 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51887 St->getPointerInfo(), St->getOriginalAlign(),
51888 St->getMemOperand()->getFlags());
51889
51890 if (TLI.isTruncStoreLegal(VT, StVT)) {
51891 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51892 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51893 dl, Val, St->getBasePtr(),
51894 St->getMemoryVT(), St->getMemOperand(), DAG);
51895 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51896 DAG, dl))
51897 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51898 dl, Val, St->getBasePtr(),
51899 St->getMemoryVT(), St->getMemOperand(), DAG);
51900 }
51901
51902 return SDValue();
51903 }
51904
51905 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51906 unsigned AddrSpace = St->getAddressSpace();
51907 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51908 AddrSpace == X86AS::PTR32_UPTR) {
51909 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51910 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51911 SDValue Cast =
51912 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51913 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
51914 St->getPointerInfo(), St->getOriginalAlign(),
51915 St->getMemOperand()->getFlags(), St->getAAInfo());
51916 }
51917 }
51918
51919 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51920 // the FP state in cases where an emms may be missing.
51921 // A preferable solution to the general problem is to figure out the right
51922 // places to insert EMMS. This qualifies as a quick hack.
51923
51924 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51925 if (VT.getSizeInBits() != 64)
51926 return SDValue();
51927
51928 const Function &F = DAG.getMachineFunction().getFunction();
51929 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51930 bool F64IsLegal =
51931 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51932 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
51933 isa<LoadSDNode>(St->getValue()) &&
51934 cast<LoadSDNode>(St->getValue())->isSimple() &&
51935 St->getChain().hasOneUse() && St->isSimple()) {
51936 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
51937
51938 if (!ISD::isNormalLoad(Ld))
51939 return SDValue();
51940
51941 // Avoid the transformation if there are multiple uses of the loaded value.
51942 if (!Ld->hasNUsesOfValue(1, 0))
51943 return SDValue();
51944
51945 SDLoc LdDL(Ld);
51946 SDLoc StDL(N);
51947 // Lower to a single movq load/store pair.
51948 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51949 Ld->getBasePtr(), Ld->getMemOperand());
51950
51951 // Make sure new load is placed in same chain order.
51952 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51953 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51954 St->getMemOperand());
51955 }
51956
51957 // This is similar to the above case, but here we handle a scalar 64-bit
51958 // integer store that is extracted from a vector on a 32-bit target.
51959 // If we have SSE2, then we can treat it like a floating-point double
51960 // to get past legalization. The execution dependencies fixup pass will
51961 // choose the optimal machine instruction for the store if this really is
51962 // an integer or v2f32 rather than an f64.
51963 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
51964 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51965 SDValue OldExtract = St->getOperand(1);
51966 SDValue ExtOp0 = OldExtract.getOperand(0);
51967 unsigned VecSize = ExtOp0.getValueSizeInBits();
51968 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51969 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51970 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51971 BitCast, OldExtract.getOperand(1));
51972 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51973 St->getPointerInfo(), St->getOriginalAlign(),
51974 St->getMemOperand()->getFlags());
51975 }
51976
51977 return SDValue();
51978}
51979
51980static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51981 TargetLowering::DAGCombinerInfo &DCI,
51982 const X86Subtarget &Subtarget) {
51983 auto *St = cast<MemIntrinsicSDNode>(N);
51984
51985 SDValue StoredVal = N->getOperand(1);
51986 MVT VT = StoredVal.getSimpleValueType();
51987 EVT MemVT = St->getMemoryVT();
51988
51989 // Figure out which elements we demand.
51990 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51991 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51992
51993 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51994 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51995 if (N->getOpcode() != ISD::DELETED_NODE)
51996 DCI.AddToWorklist(N);
51997 return SDValue(N, 0);
51998 }
51999
52000 return SDValue();
52001}
52002
52003/// Return 'true' if this vector operation is "horizontal"
52004/// and return the operands for the horizontal operation in LHS and RHS. A
52005/// horizontal operation performs the binary operation on successive elements
52006/// of its first operand, then on successive elements of its second operand,
52007/// returning the resulting values in a vector. For example, if
52008/// A = < float a0, float a1, float a2, float a3 >
52009/// and
52010/// B = < float b0, float b1, float b2, float b3 >
52011/// then the result of doing a horizontal operation on A and B is
52012/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52013/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52014/// A horizontal-op B, for some already available A and B, and if so then LHS is
52015/// set to A, RHS to B, and the routine returns 'true'.
52016static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52017 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52018 bool IsCommutative,
52019 SmallVectorImpl<int> &PostShuffleMask) {
52020 // If either operand is undef, bail out. The binop should be simplified.
52021 if (LHS.isUndef() || RHS.isUndef())
52022 return false;
52023
52024 // Look for the following pattern:
52025 // A = < float a0, float a1, float a2, float a3 >
52026 // B = < float b0, float b1, float b2, float b3 >
52027 // and
52028 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52029 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52030 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52031 // which is A horizontal-op B.
52032
52033 MVT VT = LHS.getSimpleValueType();
52034  assert((VT.is128BitVector() || VT.is256BitVector()) &&
52035         "Unsupported vector type for horizontal add/sub");
52036 unsigned NumElts = VT.getVectorNumElements();
52037
52038 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52039 SmallVectorImpl<int> &ShuffleMask) {
52040 bool UseSubVector = false;
52041 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52042 Op.getOperand(0).getValueType().is256BitVector() &&
52043 llvm::isNullConstant(Op.getOperand(1))) {
52044 Op = Op.getOperand(0);
52045 UseSubVector = true;
52046 }
52047 SmallVector<SDValue, 2> SrcOps;
52048 SmallVector<int, 16> SrcMask, ScaledMask;
52049 SDValue BC = peekThroughBitcasts(Op);
52050 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52051 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52052 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52053 })) {
52054 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52055 if (!UseSubVector && SrcOps.size() <= 2 &&
52056 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52057 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52058 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52059 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52060 }
52061 if (UseSubVector && SrcOps.size() == 1 &&
52062 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52063 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52064 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52065 ShuffleMask.assign(Mask.begin(), Mask.end());
52066 }
52067 }
52068 };
52069
52070 // View LHS in the form
52071 // LHS = VECTOR_SHUFFLE A, B, LMask
52072 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52073 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52074 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52075 SDValue A, B;
52076 SmallVector<int, 16> LMask;
52077 GetShuffle(LHS, A, B, LMask);
52078
52079 // Likewise, view RHS in the form
52080 // RHS = VECTOR_SHUFFLE C, D, RMask
52081 SDValue C, D;
52082 SmallVector<int, 16> RMask;
52083 GetShuffle(RHS, C, D, RMask);
52084
52085 // At least one of the operands should be a vector shuffle.
52086 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52087 if (NumShuffles == 0)
52088 return false;
52089
52090 if (LMask.empty()) {
52091 A = LHS;
52092 for (unsigned i = 0; i != NumElts; ++i)
52093 LMask.push_back(i);
52094 }
52095
52096 if (RMask.empty()) {
52097 C = RHS;
52098 for (unsigned i = 0; i != NumElts; ++i)
52099 RMask.push_back(i);
52100 }
52101
52102  // If we have a unary mask, ensure the other op is set to null.
52103 if (isUndefOrInRange(LMask, 0, NumElts))
52104 B = SDValue();
52105 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52106 A = SDValue();
52107
52108 if (isUndefOrInRange(RMask, 0, NumElts))
52109 D = SDValue();
52110 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52111 C = SDValue();
52112
52113 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52114 // RHS operands and shuffle mask.
52115 if (A != C) {
52116 std::swap(C, D);
52117 ShuffleVectorSDNode::commuteMask(RMask);
52118 }
52119 // Check that the shuffles are both shuffling the same vectors.
52120 if (!(A == C && B == D))
52121 return false;
52122
52123 PostShuffleMask.clear();
52124 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52125
52126 // LHS and RHS are now:
52127 // LHS = shuffle A, B, LMask
52128 // RHS = shuffle A, B, RMask
52129 // Check that the masks correspond to performing a horizontal operation.
52130 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52131 // so we just repeat the inner loop if this is a 256-bit op.
52132 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52133 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52134 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52135  assert((NumEltsPer128BitChunk % 2 == 0) &&
52136         "Vector type should have an even number of elements in each lane");
52137 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52138 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52139 // Ignore undefined components.
52140 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52141 if (LIdx < 0 || RIdx < 0 ||
52142 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52143 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52144 continue;
52145
52146 // Check that successive odd/even elements are being operated on. If not,
52147 // this is not a horizontal operation.
52148 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52149 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52150 return false;
52151
52152 // Compute the post-shuffle mask index based on where the element
52153 // is stored in the HOP result, and where it needs to be moved to.
52154 int Base = LIdx & ~1u;
52155 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52156 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52157
52158 // The low half of the 128-bit result must choose from A.
52159 // The high half of the 128-bit result must choose from B,
52160 // unless B is undef. In that case, we are always choosing from A.
52161 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52162 Index += NumEltsPer64BitChunk;
52163 PostShuffleMask[i + j] = Index;
52164 }
52165 }
52166
52167 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52168 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52169
52170 bool IsIdentityPostShuffle =
52171 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52172 if (IsIdentityPostShuffle)
52173 PostShuffleMask.clear();
52174
52175 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52176 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52177 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52178 return false;
52179
52180 // If the source nodes are already used in HorizOps then always accept this.
52181 // Shuffle folding should merge these back together.
52182 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52183 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52184 });
52185 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52186 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52187 });
52188 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52189
52190 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52191 // shuffle the result.
52192 if (!ForceHorizOp &&
52193 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52194 (NumShuffles < 2 || !IsIdentityPostShuffle),
52195 DAG, Subtarget))
52196 return false;
52197
52198 LHS = DAG.getBitcast(VT, NewLHS);
52199 RHS = DAG.getBitcast(VT, NewRHS);
52200 return true;
52201}
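// Editor's note: an editor-added sketch, not in the original file, of the v4f32
// shuffle pattern described above, using SSE intrinsics; assumes <immintrin.h>.
// Adding the even-element and odd-element shuffles is exactly HADDPS(A, B).
static __m128 horizontalAddViaShuffles(__m128 A, __m128 B) {
  __m128 Evens = _mm_shuffle_ps(A, B, _MM_SHUFFLE(2, 0, 2, 0)); // a0,a2,b0,b2
  __m128 Odds  = _mm_shuffle_ps(A, B, _MM_SHUFFLE(3, 1, 3, 1)); // a1,a3,b1,b3
  return _mm_add_ps(Evens, Odds); // == _mm_hadd_ps(A, B)
}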
52202
52203// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52204static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52205 const X86Subtarget &Subtarget) {
52206 EVT VT = N->getValueType(0);
52207 unsigned Opcode = N->getOpcode();
52208 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52209 SmallVector<int, 8> PostShuffleMask;
52210
52211 switch (Opcode) {
52212 case ISD::FADD:
52213 case ISD::FSUB:
52214 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52215 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52216 SDValue LHS = N->getOperand(0);
52217 SDValue RHS = N->getOperand(1);
52218 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52219 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52220 PostShuffleMask)) {
52221 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52222 if (!PostShuffleMask.empty())
52223 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52224 DAG.getUNDEF(VT), PostShuffleMask);
52225 return HorizBinOp;
52226 }
52227 }
52228 break;
52229 case ISD::ADD:
52230 case ISD::SUB:
52231 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52232 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52233 SDValue LHS = N->getOperand(0);
52234 SDValue RHS = N->getOperand(1);
52235 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52236 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52237 PostShuffleMask)) {
52238 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52239 ArrayRef<SDValue> Ops) {
52240 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52241 };
52242 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52243 {LHS, RHS}, HOpBuilder);
52244 if (!PostShuffleMask.empty())
52245 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52246 DAG.getUNDEF(VT), PostShuffleMask);
52247 return HorizBinOp;
52248 }
52249 }
52250 break;
52251 }
52252
52253 return SDValue();
52254}
52255
52256// Try to combine the following nodes
52257// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52258// <i32 -2147483648[float -0.000000e+00]> 0
52259// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52260// <(load 4 from constant-pool)> t0, t29
52261// [t30: v16i32 = bitcast t27]
52262// t6: v16i32 = xor t7, t27[t30]
52263// t11: v16f32 = bitcast t6
52264// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52265// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52266// t22: v16f32 = bitcast t7
52267// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52268// t24: v32f16 = bitcast t23
52269static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52270 const X86Subtarget &Subtarget) {
52271 EVT VT = N->getValueType(0);
52272 SDValue LHS = N->getOperand(0);
52273 SDValue RHS = N->getOperand(1);
52274 int CombineOpcode =
52275 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52276 auto isConjugationConstant = [](const Constant *c) {
52277 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52278 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52279 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52280 switch (CI->getBitWidth()) {
52281 case 16:
52282 return false;
52283 case 32:
52284 return CI->getValue() == ConjugationInt32;
52285 case 64:
52286 return CI->getValue() == ConjugationInt64;
52287 default:
52288 llvm_unreachable("Unexpected bit width");
52289 }
52290 }
52291 if (const auto *CF = dyn_cast<ConstantFP>(c))
52292 return CF->isNegativeZeroValue();
52293 return false;
52294 };
52295 auto combineConjugation = [&](SDValue &r) {
52296 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52297 SDValue XOR = LHS.getOperand(0);
52298 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52299 SDValue XORRHS = XOR.getOperand(1);
52300 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52301 XORRHS = XORRHS.getOperand(0);
52302 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52303 XORRHS.getOperand(1).getNumOperands()) {
52304 ConstantPoolSDNode *CP =
52305 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52306 if (CP && isConjugationConstant(CP->getConstVal())) {
52307 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52308 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52309 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52310 r = DAG.getBitcast(VT, FCMulC);
52311 return true;
52312 }
52313 }
52314 }
52315 }
52316 return false;
52317 };
52318 SDValue Res;
52319 if (combineConjugation(Res))
52320 return Res;
52321 std::swap(LHS, RHS);
52322 if (combineConjugation(Res))
52323 return Res;
52324 return Res;
52325}
52326
52327// Try to combine the following nodes:
52328// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52329static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52330 const X86Subtarget &Subtarget) {
52331 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52332 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52333 Flags.hasAllowContract();
52334 };
52335
52336 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52337 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52338 Flags.hasNoSignedZeros();
52339 };
52340 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52341 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52342 return false;
52343 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52344 "Unexpected vector type!");
52345 if (ConstantPoolSDNode *CP =
52346 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52347 APInt AI = APInt(32, 0x80008000, true);
52348 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52349 return CI->getValue() == AI;
52350 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52351 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52352 }
52353 return false;
52354 };
52355
52356 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52357 !AllowContract(N->getFlags()))
52358 return SDValue();
52359
52360 EVT VT = N->getValueType(0);
52361 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52362 return SDValue();
52363
52364 SDValue LHS = N->getOperand(0);
52365 SDValue RHS = N->getOperand(1);
52366 bool IsConj;
52367 SDValue FAddOp1, MulOp0, MulOp1;
52368 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52369 &IsVectorAllNegativeZero,
52370 &HasNoSignedZero](SDValue N) -> bool {
52371 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52372 return false;
52373 SDValue Op0 = N.getOperand(0);
52374 unsigned Opcode = Op0.getOpcode();
52375 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52376 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52377 MulOp0 = Op0.getOperand(0);
52378 MulOp1 = Op0.getOperand(1);
52379 IsConj = Opcode == X86ISD::VFCMULC;
52380 return true;
52381 }
52382 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52383 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52384 HasNoSignedZero(Op0->getFlags())) ||
52385 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52386 MulOp0 = Op0.getOperand(0);
52387 MulOp1 = Op0.getOperand(1);
52388 IsConj = Opcode == X86ISD::VFCMADDC;
52389 return true;
52390 }
52391 }
52392 return false;
52393 };
52394
52395 if (GetCFmulFrom(LHS))
52396 FAddOp1 = RHS;
52397 else if (GetCFmulFrom(RHS))
52398 FAddOp1 = LHS;
52399 else
52400 return SDValue();
52401
52402 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52403 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52404 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52405 // FIXME: How do we handle when fast math flags of FADD are different from
52406 // CFMUL's?
52407 SDValue CFmul =
52408 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52409 return DAG.getBitcast(VT, CFmul);
52410}
52411
52412/// Do target-specific dag combines on floating-point adds/subs.
52413static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52414 const X86Subtarget &Subtarget) {
52415 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52416 return HOp;
52417
52418 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52419 return COp;
52420
52421 return SDValue();
52422}
52423
52424/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52425/// the codegen.
52426/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52427/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52428/// anything that is guaranteed to be transformed by DAGCombiner.
52429static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52430 const X86Subtarget &Subtarget,
52431 const SDLoc &DL) {
52432 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52433 SDValue Src = N->getOperand(0);
52434 unsigned SrcOpcode = Src.getOpcode();
52435 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52436
52437 EVT VT = N->getValueType(0);
52438 EVT SrcVT = Src.getValueType();
52439
52440 auto IsFreeTruncation = [VT](SDValue Op) {
52441 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52442
52443 // See if this has been extended from a smaller/equal size to
52444 // the truncation size, allowing a truncation to combine with the extend.
52445 unsigned Opcode = Op.getOpcode();
52446 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52447 Opcode == ISD::ZERO_EXTEND) &&
52448 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52449 return true;
52450
52451 // See if this is a single use constant which can be constant folded.
52452 // NOTE: We don't peek through bitcasts here because there is currently
52453 // no support for constant folding truncate+bitcast+vector_of_constants. So
52454 // we'll just end up with a truncate on both operands which will
52455 // get turned back into (truncate (binop)) causing an infinite loop.
52456 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52457 };
52458
52459 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52460 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52461 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52462 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52463 };
52464
52465 // Don't combine if the operation has other uses.
52466 if (!Src.hasOneUse())
52467 return SDValue();
52468
52469 // Only support vector truncation for now.
52470 // TODO: i64 scalar math would benefit as well.
52471 if (!VT.isVector())
52472 return SDValue();
52473
52474 // In most cases it's only worth pre-truncating if we're only facing the cost
52475 // of one truncation.
52476 // i.e. if one of the inputs will constant fold or the input is repeated.
52477 switch (SrcOpcode) {
52478 case ISD::MUL:
52479 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52480 // better to truncate if we have the chance.
52481 if (SrcVT.getScalarType() == MVT::i64 &&
52482 TLI.isOperationLegal(SrcOpcode, VT) &&
52483 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52484 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52485 [[fallthrough]];
52486 case ISD::AND:
52487 case ISD::XOR:
52488 case ISD::OR:
52489 case ISD::ADD:
52490 case ISD::SUB: {
52491 SDValue Op0 = Src.getOperand(0);
52492 SDValue Op1 = Src.getOperand(1);
52493 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52494 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52495 return TruncateArithmetic(Op0, Op1);
52496 break;
52497 }
52498 }
52499
52500 return SDValue();
52501}
52502
52503/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52504/// e.g. trunc <8 x i32> X to <8 x i16> -->
52505/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52506/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52507static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52508 const X86Subtarget &Subtarget,
52509 SelectionDAG &DAG) {
52510 SDValue In = N->getOperand(0);
52511 EVT InVT = In.getValueType();
52512 EVT OutVT = N->getValueType(0);
52513
52514 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52515 OutVT.getScalarSizeInBits());
52516 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52517 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52518}
52519
52520/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
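/// (Illustrative note) The SIGN_EXTEND_INREG below sign-extends each element
/// from its low bits, so every value lies in the signed range of the output
/// type and PACKSS packs it unchanged - equivalent to a plain truncation.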
52521static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52522 const X86Subtarget &Subtarget,
52523 SelectionDAG &DAG) {
52524 SDValue In = N->getOperand(0);
52525 EVT InVT = In.getValueType();
52526 EVT OutVT = N->getValueType(0);
52527 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52528 DAG.getValueType(OutVT));
52529 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52530}
52531
52532/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52533/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52534/// legalization the truncation will be translated into a BUILD_VECTOR with each
52535/// element that is extracted from a vector and then truncated, and it is
52536/// difficult to do this optimization based on them.
52537static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52538 const X86Subtarget &Subtarget) {
52539 EVT OutVT = N->getValueType(0);
52540 if (!OutVT.isVector())
52541 return SDValue();
52542
52543 SDValue In = N->getOperand(0);
52544 if (!In.getValueType().isSimple())
52545 return SDValue();
52546
52547 EVT InVT = In.getValueType();
52548 unsigned NumElems = OutVT.getVectorNumElements();
52549
52550 // AVX512 provides fast truncate ops.
52551 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52552 return SDValue();
52553
52554 EVT OutSVT = OutVT.getVectorElementType();
52555 EVT InSVT = InVT.getVectorElementType();
52556 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52557 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52558 NumElems >= 8))
52559 return SDValue();
52560
52561 // SSSE3's pshufb results in fewer instructions in the cases below.
52562 if (Subtarget.hasSSSE3() && NumElems == 8) {
52563 if (InSVT == MVT::i16)
52564 return SDValue();
52565 if (InSVT == MVT::i32 &&
52566 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52567 return SDValue();
52568 }
52569
52570 SDLoc DL(N);
52571 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52572 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52573 // truncate 2 x v4i32 to v8i16.
52574 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52575 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52576 if (InSVT == MVT::i32)
52577 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52578
52579 return SDValue();
52580}
52581
52582 /// This function transforms vector truncation of 'extended sign-bits' or
52583 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
52584 /// X86ISD::PACKSS/PACKUS operations.
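/// For example (illustrative): if a v8i32 input is the result of a vector
/// compare, every element is 0 or -1 (all sign bits), so a v8i32 -> v8i16
/// truncation can be performed losslessly with PACKSS.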
52585static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52586 SelectionDAG &DAG,
52587 const X86Subtarget &Subtarget) {
52588 // Requires SSE2.
52589 if (!Subtarget.hasSSE2())
52590 return SDValue();
52591
52592 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52593 return SDValue();
52594
52595 SDValue In = N->getOperand(0);
52596 if (!In.getValueType().isSimple())
52597 return SDValue();
52598
52599 MVT VT = N->getValueType(0).getSimpleVT();
52600 MVT SVT = VT.getScalarType();
52601
52602 MVT InVT = In.getValueType().getSimpleVT();
52603 MVT InSVT = InVT.getScalarType();
52604
52605 // Check we have a truncation suited for PACKSS/PACKUS.
52606 if (!isPowerOf2_32(VT.getVectorNumElements()))
52607 return SDValue();
52608 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52609 return SDValue();
52610 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52611 return SDValue();
52612
52613 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52614 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52615 return SDValue();
52616
52617 // AVX512 has fast truncate, but if the input is already going to be split,
52618 // there's no harm in trying pack.
52619 if (Subtarget.hasAVX512() &&
52620 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52621 InVT.is512BitVector())) {
52622 // PACK should still be worth it for 128-bit vectors if the sources were
52623 // originally concatenated from subvectors.
52624 SmallVector<SDValue> ConcatOps;
52625 if (VT.getSizeInBits() > 128 ||
52626 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52627 return SDValue();
52628 }
52629
52630 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52631 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52632
52633 // Use PACKUS if the input has zero-bits that extend all the way to the
52634 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52635 KnownBits Known = DAG.computeKnownBits(In);
52636 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52637 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52638 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52639
52640 // Use PACKSS if the input has sign-bits that extend all the way to the
52641 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52642 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52643
52644 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52645 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52646 // on and combines/simplifications can't then use it.
52647 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52648 return SDValue();
52649
52650 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52651 if (NumSignBits > MinSignBits)
52652 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52653
52654 // If we have a srl that only generates signbits that we will discard in
52655 // the truncation then we can use PACKSS by converting the srl to a sra.
52656 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52657 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52658 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52659 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52660 if (*ShAmt == MinSignBits) {
52661 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52662 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52663 Subtarget);
52664 }
52665 }
52666
52667 return SDValue();
52668}
52669
52670// Try to form a MULHU or MULHS node by looking for
52671// (trunc (srl (mul ext, ext), 16))
52672// TODO: This is X86 specific because we want to be able to handle wide types
52673// before type legalization. But we can only do it if the vector will be
52674// legalized via widening/splitting. Type legalization can't handle promotion
52675// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52676// combiner.
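// For example (illustrative, unsigned case):
//   (v8i16 trunc (v8i32 srl (mul (zext v8i16 X), (zext v8i16 Y)), 16))
//     --> (v8i16 mulhu X, Y)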
52677static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52678 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52679 // First instruction should be a right shift of a multiply.
52680 if (Src.getOpcode() != ISD::SRL ||
52681 Src.getOperand(0).getOpcode() != ISD::MUL)
52682 return SDValue();
52683
52684 if (!Subtarget.hasSSE2())
52685 return SDValue();
52686
52687 // Only handle vXi16 types that are at least 128-bits unless they will be
52688 // widened.
52689 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52690 return SDValue();
52691
52692 // Input type should be at least vXi32.
52693 EVT InVT = Src.getValueType();
52694 if (InVT.getVectorElementType().getSizeInBits() < 32)
52695 return SDValue();
52696
52697 // Need a shift by 16.
52698 APInt ShiftAmt;
52699 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52700 ShiftAmt != 16)
52701 return SDValue();
52702
52703 SDValue LHS = Src.getOperand(0).getOperand(0);
52704 SDValue RHS = Src.getOperand(0).getOperand(1);
52705
52706 // Count leading sign/zero bits on both inputs - if there are enough then
52707 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52708 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52709 // truncations may actually be free by peeking through to the ext source.
52710 auto IsSext = [&DAG](SDValue V) {
52711 return DAG.ComputeMaxSignificantBits(V) <= 16;
52712 };
52713 auto IsZext = [&DAG](SDValue V) {
52714 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52715 };
52716
52717 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52718 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52719 if (!IsSigned && !IsUnsigned)
52720 return SDValue();
52721
52722 // Check if both inputs are extensions, which will be removed by truncation.
52723 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52724 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52725 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52726 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52727 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52728 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52729
52730 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52731 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52732 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52733 // will have to split anyway.
52734 unsigned InSizeInBits = InVT.getSizeInBits();
52735 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52736 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52737 (InSizeInBits % 16) == 0) {
52738 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52739 InVT.getSizeInBits() / 16);
52740 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52741 DAG.getBitcast(BCVT, RHS));
52742 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52743 }
52744
52745 // Truncate back to source type.
52746 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52747 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52748
52749 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52750 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52751}
52752
52753// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52754// from one vector with signed bytes from another vector, adds together
52755// adjacent pairs of 16-bit products, and saturates the result before
52756// truncating to 16-bits.
52757//
52758// Which looks something like this:
52759// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52760// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52761static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52762 const X86Subtarget &Subtarget,
52763 const SDLoc &DL) {
52764 if (!VT.isVector() || !Subtarget.hasSSSE3())
52765 return SDValue();
52766
52767 unsigned NumElems = VT.getVectorNumElements();
52768 EVT ScalarVT = VT.getVectorElementType();
52769 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52770 return SDValue();
52771
52772 SDValue SSatVal = detectSSatPattern(In, VT);
52773 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52774 return SDValue();
52775
52776 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52777 // of multiplies from even/odd elements.
52778 SDValue N0 = SSatVal.getOperand(0);
52779 SDValue N1 = SSatVal.getOperand(1);
52780
52781 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52782 return SDValue();
52783
52784 SDValue N00 = N0.getOperand(0);
52785 SDValue N01 = N0.getOperand(1);
52786 SDValue N10 = N1.getOperand(0);
52787 SDValue N11 = N1.getOperand(1);
52788
52789 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52790 // Canonicalize zero_extend to LHS.
52791 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52792 std::swap(N00, N01);
52793 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52794 std::swap(N10, N11);
52795
52796 // Ensure we have a zero_extend and a sign_extend.
52797 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52798 N01.getOpcode() != ISD::SIGN_EXTEND ||
52799 N10.getOpcode() != ISD::ZERO_EXTEND ||
52800 N11.getOpcode() != ISD::SIGN_EXTEND)
52801 return SDValue();
52802
52803 // Peek through the extends.
52804 N00 = N00.getOperand(0);
52805 N01 = N01.getOperand(0);
52806 N10 = N10.getOperand(0);
52807 N11 = N11.getOperand(0);
52808
52809 // Ensure the extend is from vXi8.
52810 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52811 N01.getValueType().getVectorElementType() != MVT::i8 ||
52812 N10.getValueType().getVectorElementType() != MVT::i8 ||
52813 N11.getValueType().getVectorElementType() != MVT::i8)
52814 return SDValue();
52815
52816 // All inputs should be build_vectors.
52817 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52818 N01.getOpcode() != ISD::BUILD_VECTOR ||
52819 N10.getOpcode() != ISD::BUILD_VECTOR ||
52820 N11.getOpcode() != ISD::BUILD_VECTOR)
52821 return SDValue();
52822
52823 // N00/N10 are zero extended. N01/N11 are sign extended.
52824
52825 // For each element, we need to ensure we have an odd element from one vector
52826 // multiplied by the odd element of another vector and the even element from
52827 // one of the same vectors being multiplied by the even element from the
52828 // other vector. So we need to make sure for each element i, this operator
52829 // is being performed:
52830 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52831 SDValue ZExtIn, SExtIn;
52832 for (unsigned i = 0; i != NumElems; ++i) {
52833 SDValue N00Elt = N00.getOperand(i);
52834 SDValue N01Elt = N01.getOperand(i);
52835 SDValue N10Elt = N10.getOperand(i);
52836 SDValue N11Elt = N11.getOperand(i);
52837 // TODO: Be more tolerant to undefs.
52838 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52839 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52840 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52841 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52842 return SDValue();
52843 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52844 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52845 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52846 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52847 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52848 return SDValue();
52849 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52850 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52851 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52852 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52853 // Add is commutative so indices can be reordered.
52854 if (IdxN00 > IdxN10) {
52855 std::swap(IdxN00, IdxN10);
52856 std::swap(IdxN01, IdxN11);
52857 }
52858 // N0 indices must be the even element. N1 indices must be the next odd element.
52859 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52860 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52861 return SDValue();
52862 SDValue N00In = N00Elt.getOperand(0);
52863 SDValue N01In = N01Elt.getOperand(0);
52864 SDValue N10In = N10Elt.getOperand(0);
52865 SDValue N11In = N11Elt.getOperand(0);
52866 // The first time we find an input, capture it.
52867 if (!ZExtIn) {
52868 ZExtIn = N00In;
52869 SExtIn = N01In;
52870 }
52871 if (ZExtIn != N00In || SExtIn != N01In ||
52872 ZExtIn != N10In || SExtIn != N11In)
52873 return SDValue();
52874 }
52875
52876 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52877 ArrayRef<SDValue> Ops) {
52878 // Shrink by adding truncate nodes and let DAGCombine fold with the
52879 // sources.
52880 EVT InVT = Ops[0].getValueType();
52881 assert(InVT.getScalarType() == MVT::i8 &&
52882 "Unexpected scalar element type");
52883 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52884 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52885 InVT.getVectorNumElements() / 2);
52886 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52887 };
52888 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52889 PMADDBuilder);
52890}
52891
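// Target TRUNCATE combine (summary of the dispatch order below): try
// pre-truncating arithmetic, then pattern-match AVG, PMADDUBSW, saturating
// truncation, PMULHU/PMULHW and MMX movd, and finally fall back to the
// PACKSS/PACKUS based truncations above.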
52892static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52893 const X86Subtarget &Subtarget) {
52894 EVT VT = N->getValueType(0);
52895 SDValue Src = N->getOperand(0);
52896 SDLoc DL(N);
52897
52898 // Attempt to pre-truncate inputs to arithmetic ops instead.
52899 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52900 return V;
52901
52902 // Try to detect AVG pattern first.
52903 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
52904 return Avg;
52905
52906 // Try to detect PMADD
52907 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52908 return PMAdd;
52909
52910 // Try to combine truncation with signed/unsigned saturation.
52911 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52912 return Val;
52913
52914 // Try to combine PMULHUW/PMULHW for vXi16.
52915 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52916 return V;
52917
52918 // The bitcast source is a direct mmx result.
52919 // Detect a truncation of a bitcast from x86mmx to i32.
52920 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52921 SDValue BCSrc = Src.getOperand(0);
52922 if (BCSrc.getValueType() == MVT::x86mmx)
52923 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52924 }
52925
52926 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
52927 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
52928 return V;
52929
52930 return combineVectorTruncation(N, DAG, Subtarget);
52931}
52932
52933static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52934 TargetLowering::DAGCombinerInfo &DCI) {
52935 EVT VT = N->getValueType(0);
52936 SDValue In = N->getOperand(0);
52937 SDLoc DL(N);
52938
52939 if (SDValue SSatVal = detectSSatPattern(In, VT))
52940 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52941 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52942 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52943
52944 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52945 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52946 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52947 return SDValue(N, 0);
52948
52949 return SDValue();
52950}
52951
52952/// Returns the negated value if the node \p N flips sign of FP value.
52953///
52954/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52955/// or FSUB(0, x)
52956/// AVX512F does not have FXOR, so FNEG is lowered as
52957/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
52958 /// In this case we go through all bitcasts.
52959/// This also recognizes splat of a negated value and returns the splat of that
52960/// value.
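/// e.g. (illustrative, AVX512F-style lowering):
///   (v4f32 bitcast (xor (bitcast X), splat 0x80000000)) is recognized as the
///   negation of X.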
52961static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
52962 if (N->getOpcode() == ISD::FNEG)
52963 return N->getOperand(0);
52964
52965 // Don't recurse exponentially.
52966 if (Depth > SelectionDAG::MaxRecursionDepth)
52967 return SDValue();
52968
52969 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52970
52971 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
52972 EVT VT = Op->getValueType(0);
52973
52974 // Make sure the element size doesn't change.
52975 if (VT.getScalarSizeInBits() != ScalarSize)
52976 return SDValue();
52977
52978 unsigned Opc = Op.getOpcode();
52979 switch (Opc) {
52980 case ISD::VECTOR_SHUFFLE: {
52981 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
52982 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52983 if (!Op.getOperand(1).isUndef())
52984 return SDValue();
52985 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
52986 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
52987 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
52988 cast<ShuffleVectorSDNode>(Op)->getMask());
52989 break;
52990 }
52991 case ISD::INSERT_VECTOR_ELT: {
52992 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52993 // -V, INDEX).
52994 SDValue InsVector = Op.getOperand(0);
52995 SDValue InsVal = Op.getOperand(1);
52996 if (!InsVector.isUndef())
52997 return SDValue();
52998 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52999 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53000 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53001 NegInsVal, Op.getOperand(2));
53002 break;
53003 }
53004 case ISD::FSUB:
53005 case ISD::XOR:
53006 case X86ISD::FXOR: {
53007 SDValue Op1 = Op.getOperand(1);
53008 SDValue Op0 = Op.getOperand(0);
53009
53010 // For XOR and FXOR, we want to check if constant
53011 // bits of Op1 are sign bit masks. For FSUB, we
53012 // have to check if constant bits of Op0 are sign
53013 // bit masks and hence we swap the operands.
53014 if (Opc == ISD::FSUB)
53015 std::swap(Op0, Op1);
53016
53017 APInt UndefElts;
53018 SmallVector<APInt, 16> EltBits;
53019 // Extract constant bits and see if they are all
53020 // sign bit masks. Ignore the undef elements.
53021 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53022 /* AllowWholeUndefs */ true,
53023 /* AllowPartialUndefs */ false)) {
53024 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53025 if (!UndefElts[I] && !EltBits[I].isSignMask())
53026 return SDValue();
53027
53028 // Only allow bitcast from correctly-sized constant.
53029 Op0 = peekThroughBitcasts(Op0);
53030 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53031 return Op0;
53032 }
53033 break;
53034 } // case
53035 } // switch
53036
53037 return SDValue();
53038}
53039
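// Map an FMA-family opcode to the equivalent opcode when the multiply
// operands, the accumulator and/or the result are negated. For example
// (illustrative): negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false,
// /*NegRes=*/false) yields X86ISD::FNMADD.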
53040static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53041 bool NegRes) {
53042 if (NegMul) {
53043 switch (Opcode) {
53044 default: llvm_unreachable("Unexpected opcode");
53045 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53046 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53047 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53048 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53049 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53050 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53051 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53052 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53053 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53054 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53055 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53056 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53057 }
53058 }
53059
53060 if (NegAcc) {
53061 switch (Opcode) {
53062 default: llvm_unreachable("Unexpected opcode");
53063 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53064 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53065 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53066 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53067 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53068 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53069 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53070 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53071 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53072 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53073 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53074 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53075 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53076 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53077 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53078 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53079 }
53080 }
53081
53082 if (NegRes) {
53083 switch (Opcode) {
53084 // For accuracy reasons, we never combine fneg and fma under strict FP.
53085 default: llvm_unreachable("Unexpected opcode");
53086 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53087 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53088 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53089 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53090 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53091 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53092 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53093 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53094 }
53095 }
53096
53097 return Opcode;
53098}
53099
53100/// Do target-specific dag combines on floating point negations.
53101static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53102 TargetLowering::DAGCombinerInfo &DCI,
53103 const X86Subtarget &Subtarget) {
53104 EVT OrigVT = N->getValueType(0);
53105 SDValue Arg = isFNEG(DAG, N);
53106 if (!Arg)
53107 return SDValue();
53108
53109 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53110 EVT VT = Arg.getValueType();
53111 EVT SVT = VT.getScalarType();
53112 SDLoc DL(N);
53113
53114 // Let legalize expand this if it isn't a legal type yet.
53115 if (!TLI.isTypeLegal(VT))
53116 return SDValue();
53117
53118 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53119 // use of a constant by performing (-0 - A*B) instead.
53120 // FIXME: Check rounding control flags as well once it becomes available.
53121 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53122 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53123 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53124 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53125 Arg.getOperand(1), Zero);
53126 return DAG.getBitcast(OrigVT, NewNode);
53127 }
53128
53129 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53130 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53131 if (SDValue NegArg =
53132 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53133 return DAG.getBitcast(OrigVT, NegArg);
53134
53135 return SDValue();
53136}
53137
53138SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53139 bool LegalOperations,
53140 bool ForCodeSize,
53141 NegatibleCost &Cost,
53142 unsigned Depth) const {
53143 // fneg patterns are removable even if they have multiple uses.
53144 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53145 Cost = NegatibleCost::Cheaper;
53146 return DAG.getBitcast(Op.getValueType(), Arg);
53147 }
53148
53149 EVT VT = Op.getValueType();
53150 EVT SVT = VT.getScalarType();
53151 unsigned Opc = Op.getOpcode();
53152 SDNodeFlags Flags = Op.getNode()->getFlags();
53153 switch (Opc) {
53154 case ISD::FMA:
53155 case X86ISD::FMSUB:
53156 case X86ISD::FNMADD:
53157 case X86ISD::FNMSUB:
53158 case X86ISD::FMADD_RND:
53159 case X86ISD::FMSUB_RND:
53160 case X86ISD::FNMADD_RND:
53161 case X86ISD::FNMSUB_RND: {
53162 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53163 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53164 !isOperationLegal(ISD::FMA, VT))
53165 break;
53166
53167 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53168 // if it may have signed zeros.
53169 if (!Flags.hasNoSignedZeros())
53170 break;
53171
53172 // This is always negatible for free but we might be able to remove some
53173 // extra operand negations as well.
53174 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53175 for (int i = 0; i != 3; ++i)
53176 NewOps[i] = getCheaperNegatedExpression(
53177 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53178
53179 bool NegA = !!NewOps[0];
53180 bool NegB = !!NewOps[1];
53181 bool NegC = !!NewOps[2];
53182 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53183
53184 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53185 : NegatibleCost::Neutral;
53186
53187 // Fill in the non-negated ops with the original values.
53188 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53189 if (!NewOps[i])
53190 NewOps[i] = Op.getOperand(i);
53191 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53192 }
53193 case X86ISD::FRCP:
53194 if (SDValue NegOp0 =
53195 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53196 ForCodeSize, Cost, Depth + 1))
53197 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53198 break;
53199 }
53200
53201 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53202 ForCodeSize, Cost, Depth);
53203}
53204
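// Lower an X86 FP logic node (FOR/FXOR/FAND/FANDN) to the corresponding
// integer vector logic op via bitcasts when integer vector types are
// available, e.g. (illustrative) FXOR v4f32 --> bitcast(XOR v4i32).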
53205static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53206 const X86Subtarget &Subtarget) {
53207 MVT VT = N->getSimpleValueType(0);
53208 // If we have integer vector types available, use the integer opcodes.
53209 if (!VT.isVector() || !Subtarget.hasSSE2())
53210 return SDValue();
53211
53212 SDLoc dl(N);
53213
53214 unsigned IntBits = VT.getScalarSizeInBits();
53215 MVT IntSVT = MVT::getIntegerVT(IntBits);
53216 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53217
53218 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53219 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53220 unsigned IntOpcode;
53221 switch (N->getOpcode()) {
53222 default: llvm_unreachable("Unexpected FP logic op");
53223 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53224 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53225 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53226 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53227 }
53228 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53229 return DAG.getBitcast(VT, IntOp);
53230}
53231
53232
53233/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
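/// e.g. (illustrative): with X86::COND_E as the condition code,
///   xor (X86ISD::SETCC E, EFLAGS), 1 --> X86ISD::SETCC NE, EFLAGS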
53234static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53235 if (N->getOpcode() != ISD::XOR)
53236 return SDValue();
53237
53238 SDValue LHS = N->getOperand(0);
53239 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53240 return SDValue();
53241
53242 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53243 X86::CondCode(LHS->getConstantOperandVal(0)));
53244 SDLoc DL(N);
53245 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53246}
53247
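// Fold (xor (ctlz_zero_undef X), BitWidth-1) and (sub BitWidth-1,
// (ctlz_zero_undef X)) into X86ISD::BSR (illustrative summary of the combine
// below): since ctlz_zero_undef(X) == BitWidth-1 - bsr(X) for non-zero X,
// both forms recover the index of the most significant set bit, which is
// exactly what BSR computes.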
53248static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53249 const X86Subtarget &Subtarget) {
53250 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53251 "Invalid opcode for combining with CTLZ");
53252 if (Subtarget.hasFastLZCNT())
53253 return SDValue();
53254
53255 EVT VT = N->getValueType(0);
53256 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53257 (VT != MVT::i64 || !Subtarget.is64Bit()))
53258 return SDValue();
53259
53260 SDValue N0 = N->getOperand(0);
53261 SDValue N1 = N->getOperand(1);
53262
53263 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53264 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53265 return SDValue();
53266
53267 SDValue OpCTLZ;
53268 SDValue OpSizeTM1;
53269
53270 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53271 OpCTLZ = N1;
53272 OpSizeTM1 = N0;
53273 } else if (N->getOpcode() == ISD::SUB) {
53274 return SDValue();
53275 } else {
53276 OpCTLZ = N0;
53277 OpSizeTM1 = N1;
53278 }
53279
53280 if (!OpCTLZ.hasOneUse())
53281 return SDValue();
53282 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53283 if (!C)
53284 return SDValue();
53285
53286 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53287 return SDValue();
53288 SDLoc DL(N);
53289 EVT OpVT = VT;
53290 SDValue Op = OpCTLZ.getOperand(0);
53291 if (VT == MVT::i8) {
53292 // Zero extend to i32 since there is not an i8 bsr.
53293 OpVT = MVT::i32;
53294 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53295 }
53296
53297 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53298 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53299 if (VT == MVT::i8)
53300 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53301
53302 return Op;
53303}
53304
53305static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53306 TargetLowering::DAGCombinerInfo &DCI,
53307 const X86Subtarget &Subtarget) {
53308 SDValue N0 = N->getOperand(0);
53309 SDValue N1 = N->getOperand(1);
53310 EVT VT = N->getValueType(0);
53311
53312 // If this is SSE1 only convert to FXOR to avoid scalarization.
53313 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53314 return DAG.getBitcast(MVT::v4i32,
53315 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53316 DAG.getBitcast(MVT::v4f32, N0),
53317 DAG.getBitcast(MVT::v4f32, N1)));
53318 }
53319
53320 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53321 return Cmp;
53322
53323 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53324 return R;
53325
53326 if (SDValue R = combineBitOpWithShift(N, DAG))
53327 return R;
53328
53329 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53330 return FPLogic;
53331
53332 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53333 return R;
53334
53335 if (DCI.isBeforeLegalizeOps())
53336 return SDValue();
53337
53338 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53339 return SetCC;
53340
53341 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53342 return R;
53343
53344 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53345 return RV;
53346
53347 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53348 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53349 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53350 N0.getOperand(0).getValueType().isVector() &&
53351 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53352 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53353 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53354 N0.getOperand(0).getValueType()));
53355 }
53356
53357 // Handle AVX512 mask widening.
53358 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53359 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53360 VT.getVectorElementType() == MVT::i1 &&
53361 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53362 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53363 return DAG.getNode(
53364 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53365 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53366 N0.getOperand(2));
53367 }
53368
53369 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53370 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53371 // TODO: Under what circumstances could this be performed in DAGCombine?
53372 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53373 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53374 SDValue TruncExtSrc = N0.getOperand(0);
53375 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53376 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53377 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53378 SDLoc DL(N);
53379 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53380 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53381 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53382 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53383 }
53384 }
53385
53386 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53387 return R;
53388
53389 return combineFneg(N, DAG, DCI, Subtarget);
53390}
53391
53392static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53393 TargetLowering::DAGCombinerInfo &DCI,
53394 const X86Subtarget &Subtarget) {
53395 EVT VT = N->getValueType(0);
53396 unsigned NumBits = VT.getSizeInBits();
53397
53398 // TODO - Constant Folding.
53399
53400 // Simplify the inputs.
53401 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53402 APInt DemandedMask(APInt::getAllOnes(NumBits));
53403 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53404 return SDValue(N, 0);
53405
53406 return SDValue();
53407}
53408
53409static bool isNullFPScalarOrVectorConst(SDValue V) {
53410 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53411}
53412
53413/// If a value is a scalar FP zero or a vector FP zero (potentially including
53414/// undefined elements), return a zero constant that may be used to fold away
53415/// that value. In the case of a vector, the returned constant will not contain
53416/// undefined elements even if the input parameter does. This makes it suitable
53417/// to be used as a replacement operand with operations (eg, bitwise-and) where
53418/// an undef should not propagate.
53419static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53420 const X86Subtarget &Subtarget) {
53421 if (!isNullFPScalarOrVectorConst(V))
53422 return SDValue();
53423
53424 if (V.getValueType().isVector())
53425 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53426
53427 return V;
53428}
53429
53430static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53431 const X86Subtarget &Subtarget) {
53432 SDValue N0 = N->getOperand(0);
53433 SDValue N1 = N->getOperand(1);
53434 EVT VT = N->getValueType(0);
53435 SDLoc DL(N);
53436
53437 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53438 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53439 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53440 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53441 return SDValue();
53442
53443 auto isAllOnesConstantFP = [](SDValue V) {
53444 if (V.getSimpleValueType().isVector())
53445 return ISD::isBuildVectorAllOnes(V.getNode());
53446 auto *C = dyn_cast<ConstantFPSDNode>(V);
53447 return C && C->getConstantFPValue()->isAllOnesValue();
53448 };
53449
53450 // fand (fxor X, -1), Y --> fandn X, Y
53451 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53452 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53453
53454 // fand X, (fxor Y, -1) --> fandn Y, X
53455 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53456 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53457
53458 return SDValue();
53459}
53460
53461/// Do target-specific dag combines on X86ISD::FAND nodes.
53462static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53463 const X86Subtarget &Subtarget) {
53464 // FAND(0.0, x) -> 0.0
53465 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53466 return V;
53467
53468 // FAND(x, 0.0) -> 0.0
53469 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53470 return V;
53471
53472 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53473 return V;
53474
53475 return lowerX86FPLogicOp(N, DAG, Subtarget);
53476}
53477
53478/// Do target-specific dag combines on X86ISD::FANDN nodes.
53479static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53480 const X86Subtarget &Subtarget) {
53481 // FANDN(0.0, x) -> x
53482 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53483 return N->getOperand(1);
53484
53485 // FANDN(x, 0.0) -> 0.0
53486 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53487 return V;
53488
53489 return lowerX86FPLogicOp(N, DAG, Subtarget);
53490}
53491
53492/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53493static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53494 TargetLowering::DAGCombinerInfo &DCI,
53495 const X86Subtarget &Subtarget) {
53496 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53497
53498 // F[X]OR(0.0, x) -> x
53499 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53500 return N->getOperand(1);
53501
53502 // F[X]OR(x, 0.0) -> x
53503 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53504 return N->getOperand(0);
53505
53506 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53507 return NewVal;
53508
53509 return lowerX86FPLogicOp(N, DAG, Subtarget);
53510}
53511
53512/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53513static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53514 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53515
53516 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53517 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53518 !DAG.getTarget().Options.NoSignedZerosFPMath)
53519 return SDValue();
53520
53521 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53522 // into FMINC and FMAXC, which are Commutative operations.
53523 unsigned NewOp = 0;
53524 switch (N->getOpcode()) {
53525 default: llvm_unreachable("unknown opcode");
53526 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53527 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53528 }
53529
53530 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53531 N->getOperand(0), N->getOperand(1));
53532}
53533
53534static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53535 const X86Subtarget &Subtarget) {
53536 EVT VT = N->getValueType(0);
53537 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53538 return SDValue();
53539
53540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53541
53542 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53543 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53544 (Subtarget.hasFP16() && VT == MVT::f16) ||
53545 (VT.isVector() && TLI.isTypeLegal(VT))))
53546 return SDValue();
53547
53548 SDValue Op0 = N->getOperand(0);
53549 SDValue Op1 = N->getOperand(1);
53550 SDLoc DL(N);
53551 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53552
53553 // If we don't have to respect NaN inputs, this is a direct translation to x86
53554 // min/max instructions.
53555 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53556 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53557
53558 // If one of the operands is known non-NaN use the native min/max instructions
53559 // with the non-NaN input as second operand.
53560 if (DAG.isKnownNeverNaN(Op1))
53561 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53562 if (DAG.isKnownNeverNaN(Op0))
53563 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53564
53565 // If we have to respect NaN inputs, this takes at least 3 instructions.
53566 // Favor a library call when operating on a scalar and minimizing code size.
53567 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53568 return SDValue();
53569
53570 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53571 VT);
53572
53573 // There are 4 possibilities involving NaN inputs, and these are the required
53574 // outputs:
53575 // Op1
53576 // Num NaN
53577 // ----------------
53578 // Num | Max | Op0 |
53579 // Op0 ----------------
53580 // NaN | Op1 | NaN |
53581 // ----------------
53582 //
53583 // The SSE FP max/min instructions were not designed for this case, but rather
53584 // to implement:
53585 // Min = Op1 < Op0 ? Op1 : Op0
53586 // Max = Op1 > Op0 ? Op1 : Op0
53587 //
53588 // So they always return Op0 if either input is a NaN. However, we can still
53589 // use those instructions for fmaxnum by selecting away a NaN input.
53590
53591 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53592 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53593 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53594
53595 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53596 // are NaN, the NaN value of Op1 is the result.
53597 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53598}
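// A minimal scalar sketch of the NaN-aware FMAXNUM lowering constructed above,
// assuming IEEE-754 comparison semantics (any compare with NaN is false), so
// MAX(Op1, Op0) == (Op1 > Op0 ? Op1 : Op0) returns Op0 whenever either input
// is NaN. Illustrative only, not part of the analyzed source.
#include <cmath>
static float fmaxnumSketch(float Op0, float Op1) {
  float MaxOrOp0 = (Op1 > Op0) ? Op1 : Op0; // mirrors getNode(FMAX, Op1, Op0)
  bool Op0IsNaN = std::isnan(Op0);          // mirrors getSetCC(Op0, Op0, SETUO)
  return Op0IsNaN ? Op1 : MaxOrOp0;         // mirrors the final getSelect
}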
53599
53600static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53601 TargetLowering::DAGCombinerInfo &DCI) {
53602 EVT VT = N->getValueType(0);
53603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53604
53605 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53606 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53607 return SDValue(N, 0);
53608
53609 // Convert a full vector load into vzload when not all bits are needed.
53610 SDValue In = N->getOperand(0);
53611 MVT InVT = In.getSimpleValueType();
53612 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53613 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53614 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53615 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53616 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53617 MVT MemVT = MVT::getIntegerVT(NumBits);
53618 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53619 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53620 SDLoc dl(N);
53621 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53622 DAG.getBitcast(InVT, VZLoad));
53623 DCI.CombineTo(N, Convert);
53624 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53625 DCI.recursivelyDeleteUnusedNodes(LN);
53626 return SDValue(N, 0);
53627 }
53628 }
53629
53630 return SDValue();
53631}
53632
53633static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53634 TargetLowering::DAGCombinerInfo &DCI) {
53635 bool IsStrict = N->isTargetStrictFPOpcode();
53636 EVT VT = N->getValueType(0);
53637
53638 // Convert a full vector load into vzload when not all bits are needed.
53639 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53640 MVT InVT = In.getSimpleValueType();
53641 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53642 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53643 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53644 LoadSDNode *LN = cast<LoadSDNode>(In);
53645 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53646 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53647 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53648 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53649 SDLoc dl(N);
53650 if (IsStrict) {
53651 SDValue Convert =
53652 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53653 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53654 DCI.CombineTo(N, Convert, Convert.getValue(1));
53655 } else {
53656 SDValue Convert =
53657 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53658 DCI.CombineTo(N, Convert);
53659 }
53660 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53661 DCI.recursivelyDeleteUnusedNodes(LN);
53662 return SDValue(N, 0);
53663 }
53664 }
53665
53666 return SDValue();
53667}
53668
53669/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53670static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53671 TargetLowering::DAGCombinerInfo &DCI,
53672 const X86Subtarget &Subtarget) {
53673 SDValue N0 = N->getOperand(0);
53674 SDValue N1 = N->getOperand(1);
53675 MVT VT = N->getSimpleValueType(0);
53676 int NumElts = VT.getVectorNumElements();
53677 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53678
53679 // ANDNP(undef, x) -> 0
53680 // ANDNP(x, undef) -> 0
53681 if (N0.isUndef() || N1.isUndef())
53682 return DAG.getConstant(0, SDLoc(N), VT);
53683
53684 // ANDNP(0, x) -> x
53685 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53686 return N1;
53687
53688 // ANDNP(x, 0) -> 0
53689 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53690 return DAG.getConstant(0, SDLoc(N), VT);
53691
53692 // Turn ANDNP back to AND if input is inverted.
53693 if (SDValue Not = IsNOT(N0, DAG))
53694 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53695
53696 // Constant Folding
53697 APInt Undefs0, Undefs1;
53698 SmallVector<APInt> EltBits0, EltBits1;
53699 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
53700 SDLoc DL(N);
53701 APInt ResultUndefs = APInt::getZero(NumElts);
53702
53703 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
53704 SmallVector<APInt> ResultBits;
53705 for (int I = 0; I != NumElts; ++I)
53706 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53707 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
53708 }
53709
53710 // Constant fold NOT(N0) to allow us to use AND.
53711 // Ensure this is only performed if we can confirm that the bitcasted source
53712 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
53713 if (N0->hasOneUse()) {
53714 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53715 if (BC0.getOpcode() != ISD::BITCAST) {
53716 for (APInt &Elt : EltBits0)
53717 Elt = ~Elt;
53718 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
53719 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53720 }
53721 }
53722 }
53723
53724 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53725 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53726 SDValue Op(N, 0);
53727 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53728 return Res;
53729
53730 // If either operand is a constant mask, then only the elements that aren't
53731 // zero are actually demanded by the other operand.
53732 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53733 APInt UndefElts;
53734 SmallVector<APInt> EltBits;
53735 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53736 APInt DemandedElts = APInt::getAllOnes(NumElts);
53737 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53738 EltBits)) {
53739 DemandedBits.clearAllBits();
53740 DemandedElts.clearAllBits();
53741 for (int I = 0; I != NumElts; ++I) {
53742 if (UndefElts[I]) {
53743 // We can't assume an undef src element gives an undef dst - the
53744 // other src might be zero.
53745 DemandedBits.setAllBits();
53746 DemandedElts.setBit(I);
53747 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53748 (!Invert && !EltBits[I].isZero())) {
53749 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53750 DemandedElts.setBit(I);
53751 }
53752 }
53753 }
53754 return std::make_pair(DemandedBits, DemandedElts);
53755 };
53756 APInt Bits0, Elts0;
53757 APInt Bits1, Elts1;
53758 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53759 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53760
53761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53762 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53763 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53764 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53765 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53766 if (N->getOpcode() != ISD::DELETED_NODE)
53767 DCI.AddToWorklist(N);
53768 return SDValue(N, 0);
53769 }
53770 }
53771
53772 return SDValue();
53773}
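// A minimal sketch of the constant-folding step above, assuming the documented
// x86 semantics ANDNP(a, b) == ~a & b per element. Illustrative only, not part
// of the analyzed source.
#include <cstdint>
constexpr uint32_t andnpElt(uint32_t A, uint32_t B) { return ~A & B; }
static_assert(andnpElt(0u, 0x1234u) == 0x1234u, "ANDNP(0, x) -> x");
static_assert(andnpElt(0x1234u, 0u) == 0u, "ANDNP(x, 0) -> 0");
static_assert(andnpElt(0xff00ff00u, 0xffff0000u) == 0x00ff0000u,
              "general case: clear the bits set in the first operand");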
53774
53775static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53776 TargetLowering::DAGCombinerInfo &DCI) {
53777 SDValue N1 = N->getOperand(1);
53778
53779 // BT ignores high bits in the bit index operand.
53780 unsigned BitWidth = N1.getValueSizeInBits();
53781 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
53782 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53783 if (N->getOpcode() != ISD::DELETED_NODE)
53784 DCI.AddToWorklist(N);
53785 return SDValue(N, 0);
53786 }
53787
53788 return SDValue();
53789}
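// A minimal sketch of why only the low Log2(BitWidth) bits of the BT index are
// demanded above: for the register form of the bit test the index is reduced
// modulo the operand width, so the high index bits never affect the result.
// Illustrative only, not part of the analyzed source.
#include <cstdint>
constexpr uint32_t btSketch(uint32_t Src, uint32_t Idx) {
  return (Src >> (Idx & 31u)) & 1u; // Idx & 31 == Idx % 32 for a 32-bit test
}
static_assert(btSketch(0b100u, 2u) == 1u, "bit 2 is set");
static_assert(btSketch(0b100u, 2u + 32u) == btSketch(0b100u, 2u),
              "index bits above the low Log2(32) bits are ignored");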
53790
53791static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53792 TargetLowering::DAGCombinerInfo &DCI) {
53793 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53794 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53795
53796 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53797 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53798 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53799 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53800 if (N->getOpcode() != ISD::DELETED_NODE)
53801 DCI.AddToWorklist(N);
53802 return SDValue(N, 0);
53803 }
53804
53805 // Convert a full vector load into vzload when not all bits are needed.
53806 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53807 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53808 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53809 SDLoc dl(N);
53810 if (IsStrict) {
53811 SDValue Convert = DAG.getNode(
53812 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53813 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53814 DCI.CombineTo(N, Convert, Convert.getValue(1));
53815 } else {
53816 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53817 DAG.getBitcast(MVT::v8i16, VZLoad));
53818 DCI.CombineTo(N, Convert);
53819 }
53820
53821 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53822 DCI.recursivelyDeleteUnusedNodes(LN);
53823 return SDValue(N, 0);
53824 }
53825 }
53826 }
53827
53828 return SDValue();
53829}
53830
53831// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53832static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53833 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53834
53835 EVT DstVT = N->getValueType(0);
53836
53837 SDValue N0 = N->getOperand(0);
53838 SDValue N1 = N->getOperand(1);
53839 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53840
53841 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53842 return SDValue();
53843
53844 // Look through single use any_extends / truncs.
53845 SDValue IntermediateBitwidthOp;
53846 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53847 N0.hasOneUse()) {
53848 IntermediateBitwidthOp = N0;
53849 N0 = N0.getOperand(0);
53850 }
53851
53852 // See if we have a single use cmov.
53853 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53854 return SDValue();
53855
53856 SDValue CMovOp0 = N0.getOperand(0);
53857 SDValue CMovOp1 = N0.getOperand(1);
53858
53859 // Make sure both operands are constants.
53860 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53861 !isa<ConstantSDNode>(CMovOp1.getNode()))
53862 return SDValue();
53863
53864 SDLoc DL(N);
53865
53866 // If we looked through an any_extend/trunc above, add one to the constants.
53867 if (IntermediateBitwidthOp) {
53868 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53869 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53870 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53871 }
53872
53873 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53874 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53875
53876 EVT CMovVT = DstVT;
53877 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53878 if (DstVT == MVT::i16) {
53879 CMovVT = MVT::i32;
53880 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53881 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53882 }
53883
53884 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53885 N0.getOperand(2), N0.getOperand(3));
53886
53887 if (CMovVT != DstVT)
53888 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53889
53890 return CMov;
53891}
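// A minimal sketch of the sign_extend_inreg fold applied to the CMOV constants
// above: sign-extending an i8 value held in a wider integer is a pure
// constant rewrite, so doing it on both CMOV arms removes the extension from
// the generated code. Illustrative only, not part of the analyzed source.
#include <cstdint>
constexpr int32_t sextInReg8(uint32_t V) {
  uint32_t Byte = V & 0xffu; // keep only the low i8
  return (Byte & 0x80u) ? static_cast<int32_t>(Byte) - 256
                        : static_cast<int32_t>(Byte);
}
static_assert(sextInReg8(0xf0u) == -16, "i8 0xf0 sign-extends to -16");
static_assert(sextInReg8(0x7fu) == 127, "i8 0x7f stays positive");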
53892
53893static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53894 const X86Subtarget &Subtarget) {
53895 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53896
53897 if (SDValue V = combineSextInRegCmov(N, DAG))
53898 return V;
53899
53900 EVT VT = N->getValueType(0);
53901 SDValue N0 = N->getOperand(0);
53902 SDValue N1 = N->getOperand(1);
53903 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53904 SDLoc dl(N);
53905
53906 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
53907 // AVX2, since there is no sign-extended shift right operation on a
53908 // vector with 64-bit elements.
53909 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
53910 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
53911 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53912 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53913 SDValue N00 = N0.getOperand(0);
53914
53915 // EXTLOAD has a better solution on AVX2,
53916 // it may be replaced with X86ISD::VSEXT node.
53917 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53918 if (!ISD::isNormalLoad(N00.getNode()))
53919 return SDValue();
53920
53921 // Attempt to promote any comparison mask ops before moving the
53922 // SIGN_EXTEND_INREG in the way.
53923 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
53924 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53925
53926 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53927 SDValue Tmp =
53928 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53929 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53930 }
53931 }
53932 return SDValue();
53933}
53934
53935/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53936/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53937/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53938/// opportunities to combine math ops, use an LEA, or use a complex addressing
53939/// mode. This can eliminate extend, add, and shift instructions.
53940static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53941 const X86Subtarget &Subtarget) {
53942 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53943 Ext->getOpcode() != ISD::ZERO_EXTEND)
53944 return SDValue();
53945
53946 // TODO: This should be valid for other integer types.
53947 EVT VT = Ext->getValueType(0);
53948 if (VT != MVT::i64)
53949 return SDValue();
53950
53951 SDValue Add = Ext->getOperand(0);
53952 if (Add.getOpcode() != ISD::ADD)
53953 return SDValue();
53954
53955 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53956 bool NSW = Add->getFlags().hasNoSignedWrap();
53957 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53958
53959 // We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding
53960 // into the 'zext'.
53961 if ((Sext && !NSW) || (!Sext && !NUW))
53962 return SDValue();
53963
53964 // Having a constant operand to the 'add' ensures that we are not increasing
53965 // the instruction count because the constant is extended for free below.
53966 // A constant operand can also become the displacement field of an LEA.
53967 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
53968 if (!AddOp1)
53969 return SDValue();
53970
53971 // Don't make the 'add' bigger if there's no hope of combining it with some
53972 // other 'add' or 'shl' instruction.
53973 // TODO: It may be profitable to generate simpler LEA instructions in place
53974 // of single 'add' instructions, but the cost model for selecting an LEA
53975 // currently has a high threshold.
53976 bool HasLEAPotential = false;
53977 for (auto *User : Ext->uses()) {
53978 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53979 HasLEAPotential = true;
53980 break;
53981 }
53982 }
53983 if (!HasLEAPotential)
53984 return SDValue();
53985
53986 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53987 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
53988 SDValue AddOp0 = Add.getOperand(0);
53989 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53990 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
53991
53992 // The wider add is guaranteed to not wrap because both operands are
53993 // sign-extended.
53994 SDNodeFlags Flags;
53995 Flags.setNoSignedWrap(NSW);
53996 Flags.setNoUnsignedWrap(NUW);
53997 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53998}
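// A minimal sketch of the identity exploited above: when the narrow add cannot
// wrap (nsw for sext, nuw for zext), extending before or after the add gives
// the same wide value, which lets the constant later become an LEA
// displacement. Illustrative only, not part of the analyzed source.
#include <cstdint>
static int64_t extThenAdd(int32_t X, int32_t C) {
  return static_cast<int64_t>(X) + static_cast<int64_t>(C); // sext(x) + sext(C)
}
static int64_t addThenExt(int32_t X, int32_t C) {
  // Equivalent to extThenAdd only when X + C does not overflow in 32 bits,
  // which is exactly what the 'nsw' flag guarantees.
  return static_cast<int64_t>(X + C); // sext(add_nsw(x, C))
}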
53999
54000// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54001// operands and the result of CMOV is not used anywhere else - promote CMOV
54002// itself instead of promoting its result. This could be beneficial, because:
54003// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54004// (or more) pseudo-CMOVs only when they go one-after-another and
54005// getting rid of result extension code after CMOV will help that.
54006// 2) Promotion of constant CMOV arguments is free, hence the
54007// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54008 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
54009 // this promotion is also good in terms of code size.
54010 // (The 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
54011 // promotion.)
54012static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54013 SDValue CMovN = Extend->getOperand(0);
54014 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54015 return SDValue();
54016
54017 EVT TargetVT = Extend->getValueType(0);
54018 unsigned ExtendOpcode = Extend->getOpcode();
54019 SDLoc DL(Extend);
54020
54021 EVT VT = CMovN.getValueType();
54022 SDValue CMovOp0 = CMovN.getOperand(0);
54023 SDValue CMovOp1 = CMovN.getOperand(1);
54024
54025 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54026 !isa<ConstantSDNode>(CMovOp1.getNode()))
54027 return SDValue();
54028
54029 // Only extend to i32 or i64.
54030 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54031 return SDValue();
54032
54033 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54034 // are free.
54035 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54036 return SDValue();
54037
54038 // If this is a zero extend to i64, we should only extend to i32 and use a free
54039 // zero extend to finish.
54040 EVT ExtendVT = TargetVT;
54041 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54042 ExtendVT = MVT::i32;
54043
54044 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54045 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54046
54047 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54048 CMovN.getOperand(2), CMovN.getOperand(3));
54049
54050 // Finish extending if needed.
54051 if (ExtendVT != TargetVT)
54052 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54053
54054 return Res;
54055}
54056
54057// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54058// result type.
54059static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54060 const X86Subtarget &Subtarget) {
54061 SDValue N0 = N->getOperand(0);
54062 EVT VT = N->getValueType(0);
54063 SDLoc dl(N);
54064
54065 // Only do this combine with AVX512 for vector extends.
54066 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54067 return SDValue();
54068
54069 // Only combine legal element types.
54070 EVT SVT = VT.getVectorElementType();
54071 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54072 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54073 return SDValue();
54074
54075 // We don't have a CMPP instruction for vXf16.
54076 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54077 return SDValue();
54078 // We can only do this if the vector size is 256 bits or less.
54079 unsigned Size = VT.getSizeInBits();
54080 if (Size > 256 && Subtarget.useAVX512Regs())
54081 return SDValue();
54082
54083 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT, since
54084 // those are the only integer compares we have.
54085 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54086 if (ISD::isUnsignedIntSetCC(CC))
54087 return SDValue();
54088
54089 // Only do this combine if the extension will be fully consumed by the setcc.
54090 EVT N00VT = N0.getOperand(0).getValueType();
54091 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54092 if (Size != MatchingVecType.getSizeInBits())
54093 return SDValue();
54094
54095 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54096
54097 if (N->getOpcode() == ISD::ZERO_EXTEND)
54098 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54099
54100 return Res;
54101}
54102
54103static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54104 TargetLowering::DAGCombinerInfo &DCI,
54105 const X86Subtarget &Subtarget) {
54106 SDValue N0 = N->getOperand(0);
54107 EVT VT = N->getValueType(0);
54108 SDLoc DL(N);
54109
54110 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54111 if (!DCI.isBeforeLegalizeOps() &&
54112 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54113 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54114 N0->getOperand(1));
54115 bool ReplaceOtherUses = !N0.hasOneUse();
54116 DCI.CombineTo(N, Setcc);
54117 // Replace other uses with a truncate of the widened setcc_carry.
54118 if (ReplaceOtherUses) {
54119 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54120 N0.getValueType(), Setcc);
54121 DCI.CombineTo(N0.getNode(), Trunc);
54122 }
54123
54124 return SDValue(N, 0);
54125 }
54126
54127 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54128 return NewCMov;
54129
54130 if (!DCI.isBeforeLegalizeOps())
54131 return SDValue();
54132
54133 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54134 return V;
54135
54136 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54137 DAG, DCI, Subtarget))
54138 return V;
54139
54140 if (VT.isVector()) {
54141 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54142 return R;
54143
54144 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54145 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54146 }
54147
54148 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54149 return NewAdd;
54150
54151 return SDValue();
54152}
54153
54154static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54155 TargetLowering::DAGCombinerInfo &DCI,
54156 const X86Subtarget &Subtarget) {
54157 SDLoc dl(N);
54158 EVT VT = N->getValueType(0);
54159 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54160
54161 // Let legalize expand this if it isn't a legal type yet.
54162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54163 if (!TLI.isTypeLegal(VT))
54164 return SDValue();
54165
54166 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54167 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54168 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54169
54170 // If the operation allows fast-math and the target does not support FMA,
54171 // split this into mul+add to avoid libcall(s).
54172 SDNodeFlags Flags = N->getFlags();
54173 if (!IsStrict && Flags.hasAllowReassociation() &&
54174 TLI.isOperationExpand(ISD::FMA, VT)) {
54175 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54176 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54177 }
54178
54179 EVT ScalarVT = VT.getScalarType();
54180 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54181 !Subtarget.hasAnyFMA()) &&
54182 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54183 return SDValue();
54184
54185 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54186 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54187 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54188 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54189 CodeSize)) {
54190 V = NegV;
54191 return true;
54192 }
54193 // Look through extract_vector_elts. If it comes from an FNEG, create a
54194 // new extract from the FNEG input.
54195 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54196 isNullConstant(V.getOperand(1))) {
54197 SDValue Vec = V.getOperand(0);
54198 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54199 Vec, DAG, LegalOperations, CodeSize)) {
54200 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54201 NegV, V.getOperand(1));
54202 return true;
54203 }
54204 }
54205
54206 return false;
54207 };
54208
54209 // Do not convert the passthru input of scalar intrinsics.
54210 // FIXME: We could allow negations of the lower element only.
54211 bool NegA = invertIfNegative(A);
54212 bool NegB = invertIfNegative(B);
54213 bool NegC = invertIfNegative(C);
54214
54215 if (!NegA && !NegB && !NegC)
54216 return SDValue();
54217
54218 unsigned NewOpcode =
54219 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54220
54221 // Propagate fast-math-flags to new FMA node.
54222 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54223 if (IsStrict) {
54224 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54225 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54226 {N->getOperand(0), A, B, C});
54227 } else {
54228 if (N->getNumOperands() == 4)
54229 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54230 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54231 }
54232}
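// A minimal sketch of the reassociation split performed above when the target
// has no FMA unit: fma(a, b, c) is replaced by a * b + c, trading the single
// rounding of a fused multiply-add for two ordinary roundings, which is only
// acceptable under the hasAllowReassociation() fast-math flag checked above.
// Illustrative only, not part of the analyzed source.
static double fmaSplitSketch(double A, double B, double C) {
  return A * B + C; // two roundings instead of one, but no fma libcall
}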
54233
54234// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54235// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54236static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54237 TargetLowering::DAGCombinerInfo &DCI) {
54238 SDLoc dl(N);
54239 EVT VT = N->getValueType(0);
54240 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54241 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54242 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54243
54244 SDValue N2 = N->getOperand(2);
54245
54246 SDValue NegN2 =
54247 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54248 if (!NegN2)
54249 return SDValue();
54250 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54251
54252 if (N->getNumOperands() == 4)
54253 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54254 NegN2, N->getOperand(3));
54255 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54256 NegN2);
54257}
54258
54259static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54260 TargetLowering::DAGCombinerInfo &DCI,
54261 const X86Subtarget &Subtarget) {
54262 SDLoc dl(N);
54263 SDValue N0 = N->getOperand(0);
54264 EVT VT = N->getValueType(0);
54265
54266 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54267 // FIXME: Is this needed? We don't seem to have any tests for it.
54268 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54269 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54270 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54271 N0->getOperand(1));
54272 bool ReplaceOtherUses = !N0.hasOneUse();
54273 DCI.CombineTo(N, Setcc);
54274 // Replace other uses with a truncate of the widened setcc_carry.
54275 if (ReplaceOtherUses) {
54276 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54277 N0.getValueType(), Setcc);
54278 DCI.CombineTo(N0.getNode(), Trunc);
54279 }
54280
54281 return SDValue(N, 0);
54282 }
54283
54284 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54285 return NewCMov;
54286
54287 if (DCI.isBeforeLegalizeOps())
54288 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54289 return V;
54290
54291 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54292 DAG, DCI, Subtarget))
54293 return V;
54294
54295 if (VT.isVector())
54296 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54297 return R;
54298
54299 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54300 return NewAdd;
54301
54302 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54303 return R;
54304
54305 // TODO: Combine with any target/faux shuffle.
54306 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54307 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54308 SDValue N00 = N0.getOperand(0);
54309 SDValue N01 = N0.getOperand(1);
54310 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54311 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54312 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54313 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54314 return concatSubVectors(N00, N01, DAG, dl);
54315 }
54316 }
54317
54318 return SDValue();
54319}
54320
54321/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
54322/// pre-promote its result type since vXi1 vectors don't get promoted
54323/// during type legalization.
54324static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54325 SDValue RHS, ISD::CondCode CC,
54326 const SDLoc &DL, SelectionDAG &DAG,
54327 const X86Subtarget &Subtarget) {
54328 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54329 VT.getVectorElementType() == MVT::i1 &&
54330 (OpVT.getVectorElementType() == MVT::i8 ||
54331 OpVT.getVectorElementType() == MVT::i16)) {
54332 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54333 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54334 }
54335 return SDValue();
54336}
54337
54338static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54339 TargetLowering::DAGCombinerInfo &DCI,
54340 const X86Subtarget &Subtarget) {
54341 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54342 const SDValue LHS = N->getOperand(0);
54343 const SDValue RHS = N->getOperand(1);
54344 EVT VT = N->getValueType(0);
54345 EVT OpVT = LHS.getValueType();
54346 SDLoc DL(N);
54347
54348 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54349 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54350 Subtarget))
54351 return V;
54352
54353 if (VT == MVT::i1) {
54354 X86::CondCode X86CC;
54355 if (SDValue V =
54356 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54357 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54358 }
54359
54360 if (OpVT.isScalarInteger()) {
54361 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54362 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54363 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54364 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54365 if (N0.getOperand(0) == N1)
54366 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54367 N0.getOperand(1));
54368 if (N0.getOperand(1) == N1)
54369 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54370 N0.getOperand(0));
54371 }
54372 return SDValue();
54373 };
54374 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54375 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54376 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54377 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54378
54379 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54380 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54381 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54382 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54383 if (N0.getOperand(0) == N1)
54384 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54385 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54386 if (N0.getOperand(1) == N1)
54387 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54388 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54389 }
54390 return SDValue();
54391 };
54392 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54393 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54394 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54395 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54396
54397 // cmpeq(trunc(x),0) --> cmpeq(x,0)
54398 // cmpne(trunc(x),0) --> cmpne(x,0)
54399 // iff x upper bits are zero.
54400 // TODO: Add support for RHS to be truncate as well?
54401 if (LHS.getOpcode() == ISD::TRUNCATE &&
54402 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54403 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
54404 EVT SrcVT = LHS.getOperand(0).getValueType();
54405 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54406 OpVT.getScalarSizeInBits());
54407 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54408 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54409 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54410 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54411 DAG.getConstant(0, DL, SrcVT), CC);
54412 }
54413
54414 // With C as a power of 2 and C != 0 and C != INT_MIN:
54415 // icmp eq Abs(X) C ->
54416 // (icmp eq A, C) | (icmp eq A, -C)
54417 // icmp ne Abs(X) C ->
54418 // (icmp ne A, C) & (icmp ne A, -C)
54419 // Both of these patterns can be better optimized in
54420 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54421 // integers which is checked above.
54422 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54423 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54424 const APInt &CInt = C->getAPIntValue();
54425 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54426 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54427 SDValue BaseOp = LHS.getOperand(0);
54428 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54429 SDValue SETCC1 = DAG.getSetCC(
54430 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54431 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54432 SETCC0, SETCC1);
54433 }
54434 }
54435 }
54436 }
54437 }
54438
54439 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54440 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54441 // Using temporaries to avoid messing up operand ordering for later
54442 // transformations if this doesn't work.
54443 SDValue Op0 = LHS;
54444 SDValue Op1 = RHS;
54445 ISD::CondCode TmpCC = CC;
54446 // Put build_vector on the right.
54447 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54448 std::swap(Op0, Op1);
54449 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54450 }
54451
54452 bool IsSEXT0 =
54453 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54454 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54455 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54456
54457 if (IsSEXT0 && IsVZero1) {
54458 assert(VT == Op0.getOperand(0).getValueType() &&
54459        "Unexpected operand type");
54460 if (TmpCC == ISD::SETGT)
54461 return DAG.getConstant(0, DL, VT);
54462 if (TmpCC == ISD::SETLE)
54463 return DAG.getConstant(1, DL, VT);
54464 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54465 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54466
54467 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54468        "Unexpected condition code!");
54469 return Op0.getOperand(0);
54470 }
54471 }
54472
54473 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets the
54474 // only ordered integer vector comparison is the signed `PCMPGT`, and on AVX512
54475 // it's often still better to use `PCMPGT` if the result is meant to stay in a
54476 // vector (if it's going to a mask, there are signed AVX512 comparisons).
54477 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54478 bool CanMakeSigned = false;
54479 if (ISD::isUnsignedIntSetCC(CC)) {
54480 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54481 DAG.computeKnownBits(RHS));
54482 // If we know LHS/RHS share the same sign bit at each element we can
54483 // make this signed.
54484 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54485 // across all lanes. So a pattern where the sign varies from lane to
54486 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54487 // missed. We could get around this by demanding each lane
54488 // independently, but this isn't the most important optimization and
54489 // that may eat into compile time.
54490 CanMakeSigned =
54491 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54492 }
54493 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54494 SDValue LHSOut = LHS;
54495 SDValue RHSOut = RHS;
54496 ISD::CondCode NewCC = CC;
54497 switch (CC) {
54498 case ISD::SETGE:
54499 case ISD::SETUGE:
54500 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54501 /*NSW*/ true))
54502 LHSOut = NewLHS;
54503 else if (SDValue NewRHS = incDecVectorConstant(
54504 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54505 RHSOut = NewRHS;
54506 else
54507 break;
54508
54509 [[fallthrough]];
54510 case ISD::SETUGT:
54511 NewCC = ISD::SETGT;
54512 break;
54513
54514 case ISD::SETLE:
54515 case ISD::SETULE:
54516 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54517 /*NSW*/ true))
54518 LHSOut = NewLHS;
54519 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54520 /*NSW*/ true))
54521 RHSOut = NewRHS;
54522 else
54523 break;
54524
54525 [[fallthrough]];
54526 case ISD::SETULT:
54527 // Will be swapped to SETGT in LowerVSETCC*.
54528 NewCC = ISD::SETLT;
54529 break;
54530 default:
54531 break;
54532 }
54533 if (NewCC != CC) {
54534 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54535 NewCC, DL, DAG, Subtarget))
54536 return R;
54537 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54538 }
54539 }
54540 }
54541
54542 if (SDValue R =
54543 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54544 return R;
54545
54546 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54547 // to avoid scalarization via legalization because v4i32 is not a legal type.
54548 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54549 LHS.getValueType() == MVT::v4f32)
54550 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54551
54552 // X pred 0.0 --> X pred -X
54553 // If the negation of X already exists, use it in the comparison. This removes
54554 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54555 // instructions in patterns with a 'select' node.
54556 if (isNullFPScalarOrVectorConst(RHS)) {
54557 SDVTList FNegVT = DAG.getVTList(OpVT);
54558 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54559 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54560 }
54561
54562 return SDValue();
54563}
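// A minimal sketch of the scalar-integer equality rewrites used above, assuming
// ordinary two's-complement bit arithmetic:
//   (X | Y) == X  <=>  (~X & Y) == 0     (MatchOrCmpEq)
//   (X & Y) == Y  <=>  (Y & ~X) == 0     (MatchAndCmpEq)
// Illustrative only, not part of the analyzed source.
#include <cstdint>
constexpr bool orCmpEq(uint32_t X, uint32_t Y) { return (X | Y) == X; }
constexpr bool orCmpEqRewritten(uint32_t X, uint32_t Y) { return (~X & Y) == 0; }
static_assert(orCmpEq(0b1010u, 0b0010u) == orCmpEqRewritten(0b1010u, 0b0010u),
              "equal case");
static_assert(orCmpEq(0b1010u, 0b0101u) == orCmpEqRewritten(0b1010u, 0b0101u),
              "not-equal case");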
54564
54565static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54566 TargetLowering::DAGCombinerInfo &DCI,
54567 const X86Subtarget &Subtarget) {
54568 SDValue Src = N->getOperand(0);
54569 MVT SrcVT = Src.getSimpleValueType();
54570 MVT VT = N->getSimpleValueType(0);
54571 unsigned NumBits = VT.getScalarSizeInBits();
54572 unsigned NumElts = SrcVT.getVectorNumElements();
54573 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54574 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54575
54576 // Perform constant folding.
54577 APInt UndefElts;
54578 SmallVector<APInt, 32> EltBits;
54579 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54580 APInt Imm(32, 0);
54581 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54582 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54583 Imm.setBit(Idx);
54584
54585 return DAG.getConstant(Imm, SDLoc(N), VT);
54586 }
54587
54588 // Look through int->fp bitcasts that don't change the element width.
54589 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54590 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54591 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54592 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54593
54594 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54595 // with scalar comparisons.
54596 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54597 SDLoc DL(N);
54598 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54599 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54600 return DAG.getNode(ISD::XOR, DL, VT,
54601 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54602 DAG.getConstant(NotMask, DL, VT));
54603 }
54604
54605 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54606 // results with scalar comparisons.
54607 if (Src.getOpcode() == X86ISD::PCMPGT &&
54608 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54609 SDLoc DL(N);
54610 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54611 return DAG.getNode(ISD::XOR, DL, VT,
54612 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54613 DAG.getConstant(NotMask, DL, VT));
54614 }
54615
54616 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54617 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54618 // iff pow2splat(c1).
54619 // Use KnownBits to determine if only a single bit is non-zero
54620 // in each element (pow2 or zero), and shift that bit to the msb.
54621 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54622 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54623 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54624 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54625 if (KnownLHS.countMaxPopulation() == 1 &&
54626 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54627 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54628 SDLoc DL(N);
54629 MVT ShiftVT = SrcVT;
54630 SDValue ShiftLHS = Src.getOperand(0);
54631 SDValue ShiftRHS = Src.getOperand(1);
54632 if (ShiftVT.getScalarType() == MVT::i8) {
54633 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54634 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54635 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54636 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54637 }
54638 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54639 ShiftLHS, ShiftAmt, DAG);
54640 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54641 ShiftRHS, ShiftAmt, DAG);
54642 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54643 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54644 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54645 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54646 }
54647 }
54648
54649 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54650 if (N->isOnlyUserOf(Src.getNode())) {
54651 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54652 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54653 APInt UndefElts;
54654 SmallVector<APInt, 32> EltBits;
54655 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54656 UndefElts, EltBits)) {
54657 APInt Mask = APInt::getZero(NumBits);
54658 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54659 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54660 Mask.setBit(Idx);
54661 }
54662 SDLoc DL(N);
54663 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54664 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54665 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54666 DAG.getConstant(Mask, DL, VT));
54667 }
54668 }
54669 }
54670
54671 // Simplify the inputs.
54672 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54673 APInt DemandedMask(APInt::getAllOnes(NumBits));
54674 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54675 return SDValue(N, 0);
54676
54677 return SDValue();
54678}
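// A minimal scalar model of MOVMSK (collect one sign bit per element) and of
// the movmsk(not(x)) -> xor(movmsk(x), low-ones) fold above: inverting every
// element flips exactly the collected sign bits. Illustrative only, not part
// of the analyzed source.
#include <cstdint>
static uint32_t movmskSketch(const int32_t *Elts, unsigned NumElts) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Elts[I] < 0)        // element sign bit set
      Mask |= 1u << I;
  return Mask;              // movmsk(~x) == movmsk(x) ^ ((1u << NumElts) - 1)
}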
54679
54680static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54681 TargetLowering::DAGCombinerInfo &DCI,
54682 const X86Subtarget &Subtarget) {
54683 MVT VT = N->getSimpleValueType(0);
54684 unsigned NumBits = VT.getScalarSizeInBits();
54685
54686 // Simplify the inputs.
54687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54688 APInt DemandedMask(APInt::getAllOnes(NumBits));
54689 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54690 return SDValue(N, 0);
54691
54692 return SDValue();
54693}
54694
54695static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54696 TargetLowering::DAGCombinerInfo &DCI,
54697 const X86Subtarget &Subtarget) {
54698 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54699 SDValue BasePtr = MemOp->getBasePtr();
54700 SDValue Index = MemOp->getIndex();
54701 SDValue Scale = MemOp->getScale();
54702 SDValue Mask = MemOp->getMask();
54703
54704 // Attempt to fold an index scale into the scale value directly.
54705 // For smaller indices, implicit sext is performed BEFORE scale, preventing
54706 // this fold under most circumstances.
54707 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
54708 if ((Index.getOpcode() == X86ISD::VSHLI ||
54709 (Index.getOpcode() == ISD::ADD &&
54710 Index.getOperand(0) == Index.getOperand(1))) &&
54711 isa<ConstantSDNode>(Scale) &&
54712 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
54713 unsigned ShiftAmt =
54714 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
54715 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54716 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
54717 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
54718 SDValue NewIndex = Index.getOperand(0);
54719 SDValue NewScale =
54720 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
54721 if (N->getOpcode() == X86ISD::MGATHER)
54722 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
54723 MemOp->getOperand(1), Mask,
54724 MemOp->getBasePtr(), NewIndex, NewScale,
54725 MemOp->getChain(), Subtarget);
54726 if (N->getOpcode() == X86ISD::MSCATTER)
54727 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
54728 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
54729 NewIndex, NewScale, MemOp->getChain(), Subtarget);
54730 }
54731 }
54732
54733 // With vector masks we only demand the upper bit of the mask.
54734 if (Mask.getScalarValueSizeInBits() != 1) {
54735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54736 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54737 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54738 if (N->getOpcode() != ISD::DELETED_NODE)
54739 DCI.AddToWorklist(N);
54740 return SDValue(N, 0);
54741 }
54742 }
54743
54744 return SDValue();
54745}
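// A minimal sketch of the index-scale fold above: a left shift of the index
// commutes with the gather/scatter scale, (Index << Shift) * Scale ==
// Index * (Scale << Shift), so the shift can be absorbed as long as the new
// scale is still one of 1, 2, 4 or 8. Illustrative only, not part of the
// analyzed source.
#include <cstdint>
static_assert((uint64_t(7) << 1) * 4 == uint64_t(7) * (4 << 1),
              "a shift-by-1 (or Index + Index) folds into scale 4 -> 8");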
54746
54747static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54748 SDValue Index, SDValue Base, SDValue Scale,
54749 SelectionDAG &DAG) {
54750 SDLoc DL(GorS);
54751
54752 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54753 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54754 Gather->getMask(), Base, Index, Scale } ;
54755 return DAG.getMaskedGather(Gather->getVTList(),
54756 Gather->getMemoryVT(), DL, Ops,
54757 Gather->getMemOperand(),
54758 Gather->getIndexType(),
54759 Gather->getExtensionType());
54760 }
54761 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54762 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54763 Scatter->getMask(), Base, Index, Scale };
54764 return DAG.getMaskedScatter(Scatter->getVTList(),
54765 Scatter->getMemoryVT(), DL,
54766 Ops, Scatter->getMemOperand(),
54767 Scatter->getIndexType(),
54768 Scatter->isTruncatingStore());
54769}
54770
54771static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54772 TargetLowering::DAGCombinerInfo &DCI) {
54773 SDLoc DL(N);
54774 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54775 SDValue Index = GorS->getIndex();
54776 SDValue Base = GorS->getBasePtr();
54777 SDValue Scale = GorS->getScale();
54778
54779 if (DCI.isBeforeLegalize()) {
54780 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54781
54782 // Shrink constant indices if they are larger than 32-bits.
54783 // Only do this before legalize types since v2i64 could become v2i32.
54784 // FIXME: We could check that the type is legal if we're after legalize
54785 // types, but then we would need to construct test cases where that happens.
54786 // FIXME: We could support more than just constant vectors, but we need to be
54787 // careful with costing. A truncate that can be optimized out would be fine.
54788 // Otherwise we might only want to create a truncate if it avoids a split.
54789 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54790 if (BV->isConstant() && IndexWidth > 32 &&
54791 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54792 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54793 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54794 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54795 }
54796 }
54797
54798 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54799 // there are sufficient sign bits. Only do this before legalize types to
54800 // avoid creating illegal types in truncate.
54801 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54802 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54803 IndexWidth > 32 &&
54804 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54805 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54806 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54807 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54808 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54809 }
54810 }
54811
54812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54813 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54814 // Try to move splat constant adders from the index operand to the base
54815 // pointer operand, taking care to multiply by the scale. We can only do
54816 // this when the index element type is the same as the pointer type;
54817 // otherwise we would need to be sure the math doesn't wrap before the scale.
54818 if (Index.getOpcode() == ISD::ADD &&
54819 Index.getValueType().getVectorElementType() == PtrVT &&
54820 isa<ConstantSDNode>(Scale)) {
54821 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54822 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54823 BitVector UndefElts;
54824 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54825 // FIXME: Allow non-constant?
54826 if (UndefElts.none()) {
54827 // Apply the scale.
54828 APInt Adder = C->getAPIntValue() * ScaleAmt;
54829 // Add it to the existing base.
54830 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54831 DAG.getConstant(Adder, DL, PtrVT));
54832 Index = Index.getOperand(0);
54833 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54834 }
54835 }
54836
54837 // It's also possible that the base is just a constant. In that case, just
54838 // replace it with 0 and move the displacement into the index.
54839 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54840 isOneConstant(Scale)) {
54841 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54842 // Combine the constant build_vector and the constant base.
54843 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54844 Index.getOperand(1), Splat);
54845 // Add to the LHS of the original Index add.
54846 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54847 Index.getOperand(0), Splat);
54848 Base = DAG.getConstant(0, DL, Base.getValueType());
54849 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54850 }
54851 }
54852 }
54853
54854 if (DCI.isBeforeLegalizeOps()) {
54855 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54856
54857 // Make sure the index is either i32 or i64
54858 if (IndexWidth != 32 && IndexWidth != 64) {
54859 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54860 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54861 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54862 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54863 }
54864 }
54865
54866 // With vector masks we only demand the upper bit of the mask.
54867 SDValue Mask = GorS->getMask();
54868 if (Mask.getScalarValueSizeInBits() != 1) {
54869 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54870 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54871 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54872 if (N->getOpcode() != ISD::DELETED_NODE)
54873 DCI.AddToWorklist(N);
54874 return SDValue(N, 0);
54875 }
54876 }
54877
54878 return SDValue();
54879}
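// A minimal sketch of the splat-adder rewrite above, which is plain address
// arithmetic once the index element type matches the pointer width (so nothing
// can wrap before the scale is applied):
//   Base + (Index + C) * Scale == (Base + C * Scale) + Index * Scale
// Illustrative only, not part of the analyzed source.
#include <cstdint>
static_assert(uint64_t(100) + (uint64_t(5) + 3) * 8 ==
                  (uint64_t(100) + 3 * 8) + uint64_t(5) * 8,
              "the splat constant moves into the base pointer, scaled");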
54880
54881// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54882static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54883 const X86Subtarget &Subtarget) {
54884 SDLoc DL(N);
54885 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54886 SDValue EFLAGS = N->getOperand(1);
54887
54888 // Try to simplify the EFLAGS and condition code operands.
54889 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54890 return getSETCC(CC, Flags, DL, DAG);
54891
54892 return SDValue();
54893}
54894
54895/// Optimize branch condition evaluation.
54896static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54897 const X86Subtarget &Subtarget) {
54898 SDLoc DL(N);
54899 SDValue EFLAGS = N->getOperand(3);
54900 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54901
54902 // Try to simplify the EFLAGS and condition code operands.
54903 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54904 // RAUW them under us.
54905 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54906 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54907 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54908 N->getOperand(1), Cond, Flags);
54909 }
54910
54911 return SDValue();
54912}
54913
54914// TODO: Could we move this to DAGCombine?
54915static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54916 SelectionDAG &DAG) {
54917 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54918 // to optimize away the operation when it is applied to a constant.
54919 //
54920 // The general transformation is:
54921 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54922 // AND(VECTOR_CMP(x,y), constant2)
54923 // constant2 = UNARYOP(constant)
54924
54925 // Early exit if this isn't a vector operation, the operand of the
54926 // unary operation isn't a bitwise AND, or if the sizes of the operations
54927 // aren't the same.
54928 EVT VT = N->getValueType(0);
54929 bool IsStrict = N->isStrictFPOpcode();
54930 unsigned NumEltBits = VT.getScalarSizeInBits();
54931 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54932 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54933 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54934 VT.getSizeInBits() != Op0.getValueSizeInBits())
54935 return SDValue();
54936
54937 // Now check that the other operand of the AND is a constant. We could
54938 // make the transformation for non-constant splats as well, but it's unclear
54939 // that would be a benefit as it would not eliminate any operations, just
54940 // perform one more step in scalar code before moving to the vector unit.
54941 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54942 // Bail out if the vector isn't a constant.
54943 if (!BV->isConstant())
54944 return SDValue();
54945
54946 // Everything checks out. Build up the new and improved node.
54947 SDLoc DL(N);
54948 EVT IntVT = BV->getValueType(0);
54949 // Create a new constant of the appropriate type for the transformed
54950 // DAG.
54951 SDValue SourceConst;
54952 if (IsStrict)
54953 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54954 {N->getOperand(0), SDValue(BV, 0)});
54955 else
54956 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54957 // The AND node needs bitcasts to/from an integer vector type around it.
54958 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54959 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54960 MaskConst);
54961 SDValue Res = DAG.getBitcast(VT, NewAnd);
54962 if (IsStrict)
54963 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54964 return Res;
54965 }
54966
54967 return SDValue();
54968}
54969
54970/// If we are converting a value to floating-point, try to replace scalar
54971/// truncate of an extracted vector element with a bitcast. This tries to keep
54972/// the sequence on XMM registers rather than moving between vector and GPRs.
54973static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54974 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54975 // to allow being called by any similar cast opcode.
54976 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54977 SDValue Trunc = N->getOperand(0);
54978 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54979 return SDValue();
54980
54981 SDValue ExtElt = Trunc.getOperand(0);
54982 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54983 !isNullConstant(ExtElt.getOperand(1)))
54984 return SDValue();
54985
54986 EVT TruncVT = Trunc.getValueType();
54987 EVT SrcVT = ExtElt.getValueType();
54988 unsigned DestWidth = TruncVT.getSizeInBits();
54989 unsigned SrcWidth = SrcVT.getSizeInBits();
54990 if (SrcWidth % DestWidth != 0)
54991 return SDValue();
54992
54993 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54994 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54995 unsigned VecWidth = SrcVecVT.getSizeInBits();
54996 unsigned NumElts = VecWidth / DestWidth;
54997 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54998 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54999 SDLoc DL(N);
55000 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55001 BitcastVec, ExtElt.getOperand(1));
55002 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55003}
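// Editorial illustration (not part of the original source): with a v2i64
// source vector X and a truncate to i32, the fold above rewrites
//   sitofp (trunc i32 (extractelement X, 0))
//     --> sitofp (extractelement (bitcast X to v4i32), 0)
// On little-endian x86 the low 32 bits of X's element 0 are exactly element 0
// of the v4i32 bitcast, so the value is unchanged while the data can stay in
// an XMM register instead of round-tripping through a GPR.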
55004
55005static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55006 const X86Subtarget &Subtarget) {
55007 bool IsStrict = N->isStrictFPOpcode();
55008 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55009 EVT VT = N->getValueType(0);
55010 EVT InVT = Op0.getValueType();
55011
55012 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55013 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55014 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55015 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55016 unsigned ScalarSize = InVT.getScalarSizeInBits();
55017 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55018 return SDValue();
55019 SDLoc dl(N);
55020 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55021 ScalarSize < 16 ? MVT::i16
55022 : ScalarSize < 32 ? MVT::i32
55023 : MVT::i64,
55024 InVT.getVectorNumElements());
55025 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55026 if (IsStrict)
55027 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55028 {N->getOperand(0), P});
55029 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55030 }
55031
55032 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55033 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55034 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55035 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55036 VT.getScalarType() != MVT::f16) {
55037 SDLoc dl(N);
55038 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55039 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55040
55041 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55042 if (IsStrict)
55043 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55044 {N->getOperand(0), P});
55045 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55046 }
55047
55048 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
55049 // optimize it to a SINT_TO_FP when the sign bit is known zero, so perform
55050 // the optimization here.
55051 if (DAG.SignBitIsZero(Op0)) {
55052 if (IsStrict)
55053 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55054 {N->getOperand(0), Op0});
55055 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55056 }
55057
55058 return SDValue();
55059}
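// Editorial note (not part of the original source): the zext-then-signed
// conversion above is exact because zero-extending an i8/i16 lane into an i32
// lane leaves the i32 sign bit clear, so SINT_TO_FP and UINT_TO_FP agree on
// the extended value. For example, the u8 value 200 zero-extends to the i32
// value 200, and both conversions produce 200.0.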
55060
55061static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55062 TargetLowering::DAGCombinerInfo &DCI,
55063 const X86Subtarget &Subtarget) {
55064 // First try to optimize away the conversion entirely when it's
55065 // conditionally from a constant. Vectors only.
55066 bool IsStrict = N->isStrictFPOpcode();
55067 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55068 return Res;
55069
55070 // Now move on to more general possibilities.
55071 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55072 EVT VT = N->getValueType(0);
55073 EVT InVT = Op0.getValueType();
55074
55075 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55076 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55077 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55078 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55079 unsigned ScalarSize = InVT.getScalarSizeInBits();
55080 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55081 return SDValue();
55082 SDLoc dl(N);
55083 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55084 ScalarSize < 16 ? MVT::i16
55085 : ScalarSize < 32 ? MVT::i32
55086 : MVT::i64,
55087 InVT.getVectorNumElements());
55088 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55089 if (IsStrict)
55090 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55091 {N->getOperand(0), P});
55092 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55093 }
55094
55095 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55096 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55097 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55098 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55099 VT.getScalarType() != MVT::f16) {
55100 SDLoc dl(N);
55101 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55102 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55103 if (IsStrict)
55104 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55105 {N->getOperand(0), P});
55106 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55107 }
55108
55109 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55110 // vectors and scalars, see if we know that the upper bits are all the sign
55111 // bit, in which case we can truncate the input to i32 and convert from that.
55112 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55113 unsigned BitWidth = InVT.getScalarSizeInBits();
55114 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55115 if (NumSignBits >= (BitWidth - 31)) {
55116 EVT TruncVT = MVT::i32;
55117 if (InVT.isVector())
55118 TruncVT = InVT.changeVectorElementType(TruncVT);
55119 SDLoc dl(N);
55120 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55121 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55122 if (IsStrict)
55123 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55124 {N->getOperand(0), Trunc});
55125 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55126 }
55127 // If we're after legalize and the type is v2i32 we need to shuffle and
55128 // use CVTSI2P.
55129 assert(InVT == MVT::v2i64 && "Unexpected VT!");
55130 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55131 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55132 { 0, 2, -1, -1 });
55133 if (IsStrict)
55134 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55135 {N->getOperand(0), Shuf});
55136 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55137 }
55138 }
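// Editorial note (not part of the original source): the truncation above is
// exact because NumSignBits >= BitWidth - 31 means the wide value is already
// representable as a signed i32. For example, the i64 value -7 has 61
// identical leading sign bits, truncates to the i32 value -7, and converts to
// the same -7.0.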
55139
55140 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55141 // a 32-bit target where SSE doesn't support i64->FP operations.
55142 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55143 Op0.getOpcode() == ISD::LOAD) {
55144 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55145
55146 // This transformation is not supported if the result type is f16 or f128.
55147 if (VT == MVT::f16 || VT == MVT::f128)
55148 return SDValue();
55149
55150 // If we have AVX512DQ we can use packed conversion instructions unless
55151 // the VT is f80.
55152 if (Subtarget.hasDQI() && VT != MVT::f80)
55153 return SDValue();
55154
55155 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55156 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55157 std::pair<SDValue, SDValue> Tmp =
55158 Subtarget.getTargetLowering()->BuildFILD(
55159 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55160 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55161 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55162 return Tmp.first;
55163 }
55164 }
55165
55166 if (IsStrict)
55167 return SDValue();
55168
55169 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55170 return V;
55171
55172 return SDValue();
55173}
55174
55175static bool needCarryOrOverflowFlag(SDValue Flags) {
55176 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55177
55178 for (const SDNode *User : Flags->uses()) {
55179 X86::CondCode CC;
55180 switch (User->getOpcode()) {
55181 default:
55182 // Be conservative.
55183 return true;
55184 case X86ISD::SETCC:
55185 case X86ISD::SETCC_CARRY:
55186 CC = (X86::CondCode)User->getConstantOperandVal(0);
55187 break;
55188 case X86ISD::BRCOND:
55189 case X86ISD::CMOV:
55190 CC = (X86::CondCode)User->getConstantOperandVal(2);
55191 break;
55192 }
55193
55194 switch (CC) {
55195 default: break;
55196 case X86::COND_A: case X86::COND_AE:
55197 case X86::COND_B: case X86::COND_BE:
55198 case X86::COND_O: case X86::COND_NO:
55199 case X86::COND_G: case X86::COND_GE:
55200 case X86::COND_L: case X86::COND_LE:
55201 return true;
55202 }
55203 }
55204
55205 return false;
55206}
55207
55208static bool onlyZeroFlagUsed(SDValue Flags) {
55209 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55210
55211 for (const SDNode *User : Flags->uses()) {
55212 unsigned CCOpNo;
55213 switch (User->getOpcode()) {
55214 default:
55215 // Be conservative.
55216 return false;
55217 case X86ISD::SETCC:
55218 case X86ISD::SETCC_CARRY:
55219 CCOpNo = 0;
55220 break;
55221 case X86ISD::BRCOND:
55222 case X86ISD::CMOV:
55223 CCOpNo = 2;
55224 break;
55225 }
55226
55227 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55228 if (CC != X86::COND_E && CC != X86::COND_NE)
55229 return false;
55230 }
55231
55232 return true;
55233}
55234
55235static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55236 // Only handle test patterns.
55237 if (!isNullConstant(N->getOperand(1)))
55238 return SDValue();
55239
55240 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55241 // and use its flags directly.
55242 // TODO: Maybe we should try promoting compares that only use the zero flag
55243 // first if we can prove the upper bits with computeKnownBits?
55244 SDLoc dl(N);
55245 SDValue Op = N->getOperand(0);
55246 EVT VT = Op.getValueType();
55247
55248 // If we have a constant logical shift that's only used in a comparison
55249 // against zero, turn it into an equivalent AND. This allows it to be turned
55250 // into a TEST instruction later.
55251 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55252 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55253 onlyZeroFlagUsed(SDValue(N, 0))) {
55254 unsigned BitWidth = VT.getSizeInBits();
55255 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55256 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55257 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55258 APInt Mask = Op.getOpcode() == ISD::SRL
55259 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55260 : APInt::getLowBitsSet(BitWidth, MaskBits);
55261 if (Mask.isSignedIntN(32)) {
55262 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55263 DAG.getConstant(Mask, dl, VT));
55264 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55265 DAG.getConstant(0, dl, VT));
55266 }
55267 }
55268 }
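// Editorial illustration (not part of the original source): for an i32 value
// shifted right by 5 and only compared against zero, MaskBits is 27 and the
// mask is the top 27 bits, relying on the scalar identity sketched by this
// hypothetical helper:
//   static bool SrlZeroMatchesMask(uint32_t X) {
//     return ((X >> 5) == 0) == ((X & 0xFFFFFFE0u) == 0); // always true
//   }
// The AND form can later be matched as a TEST instruction; the SHL case is
// symmetric with a low-bits mask (APInt::getLowBitsSet).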
55269
55270 // Peek through any zero-extend if we're only testing for a zero result.
55271 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55272 SDValue Src = Op.getOperand(0);
55273 EVT SrcVT = Src.getValueType();
55274 if (SrcVT.getScalarSizeInBits() >= 8 &&
55275 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55276 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55277 DAG.getConstant(0, dl, SrcVT));
55278 }
55279
55280 // Look for a truncate.
55281 if (Op.getOpcode() != ISD::TRUNCATE)
55282 return SDValue();
55283
55284 SDValue Trunc = Op;
55285 Op = Op.getOperand(0);
55286
55287 // See if we can compare with zero against the truncation source,
55288 // which should help using the Z flag from many ops. Only do this for
55289 // i32 truncated op to prevent partial-reg compares of promoted ops.
55290 EVT OpVT = Op.getValueType();
55291 APInt UpperBits =
55292 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55293 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55294 onlyZeroFlagUsed(SDValue(N, 0))) {
55295 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55296 DAG.getConstant(0, dl, OpVT));
55297 }
55298
55299 // After this the truncate and arithmetic op must have a single use.
55300 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55301 return SDValue();
55302
55303 unsigned NewOpc;
55304 switch (Op.getOpcode()) {
55305 default: return SDValue();
55306 case ISD::AND:
55307 // Skip AND with a constant. We have special handling for AND with an
55308 // immediate during isel to generate TEST instructions.
55309 if (isa<ConstantSDNode>(Op.getOperand(1)))
55310 return SDValue();
55311 NewOpc = X86ISD::AND;
55312 break;
55313 case ISD::OR: NewOpc = X86ISD::OR; break;
55314 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55315 case ISD::ADD:
55316 // If the carry or overflow flag is used, we can't truncate.
55317 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55318 return SDValue();
55319 NewOpc = X86ISD::ADD;
55320 break;
55321 case ISD::SUB:
55322 // If the carry or overflow flag is used, we can't truncate.
55323 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55324 return SDValue();
55325 NewOpc = X86ISD::SUB;
55326 break;
55327 }
55328
55329 // We found an op we can narrow. Truncate its inputs.
55330 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55331 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55332
55333 // Use a X86 specific opcode to avoid DAG combine messing with it.
55334 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55335 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55336
55337 // For AND, keep a CMP so that we can match the test pattern.
55338 if (NewOpc == X86ISD::AND)
55339 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55340 DAG.getConstant(0, dl, VT));
55341
55342 // Return the flags.
55343 return Op.getValue(1);
55344}
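// Editorial illustration (not part of the original source) of the narrowing
// above: for cmp (trunc i8 (add i32 %a, %b)), 0 where only the Z flag is
// consumed, the low 8 bits of the 32-bit sum equal the 8-bit sum of the
// truncated operands, so the flags of an 8-bit X86ISD::ADD of trunc(%a) and
// trunc(%b) can be used directly and the wider add may become dead.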
55345
55346static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55347 TargetLowering::DAGCombinerInfo &DCI) {
55348 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55349        "Expected X86ISD::ADD or X86ISD::SUB");
55350
55351 SDLoc DL(N);
55352 SDValue LHS = N->getOperand(0);
55353 SDValue RHS = N->getOperand(1);
55354 MVT VT = LHS.getSimpleValueType();
55355 bool IsSub = X86ISD::SUB == N->getOpcode();
55356 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55357
55358 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55359 if (!N->hasAnyUseOfValue(1)) {
55360 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55361 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55362 }
55363
55364 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55365 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55366 SDValue Ops[] = {N0, N1};
55367 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55368 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55369 SDValue Op(N, 0);
55370 if (Negate)
55371 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55372 DCI.CombineTo(GenericAddSub, Op);
55373 }
55374 };
55375 MatchGeneric(LHS, RHS, false);
55376 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55377
55378 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55379 // EFLAGS result doesn't change.
55380 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55381 /*ZeroSecondOpOnly*/ true);
55382}
55383
55384static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55385 SDValue LHS = N->getOperand(0);
55386 SDValue RHS = N->getOperand(1);
55387 SDValue BorrowIn = N->getOperand(2);
55388
55389 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55390 MVT VT = N->getSimpleValueType(0);
55391 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55392 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55393 }
55394
55395 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55396 // iff the flag result is dead.
55397 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55398 !N->hasAnyUseOfValue(1))
55399 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55400 LHS.getOperand(1), BorrowIn);
55401
55402 return SDValue();
55403}
55404
55405// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55406static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55407 TargetLowering::DAGCombinerInfo &DCI) {
55408 SDValue LHS = N->getOperand(0);
55409 SDValue RHS = N->getOperand(1);
55410 SDValue CarryIn = N->getOperand(2);
55411 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55412 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55413
55414 // Canonicalize constant to RHS.
55415 if (LHSC && !RHSC)
55416 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55417 CarryIn);
55418
55419 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55420 // the result is either zero or one (depending on the input carry bit).
55421 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55422 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55423 // We don't have a good way to replace an EFLAGS use, so only do this when
55424 // dead right now.
55425 SDValue(N, 1).use_empty()) {
55426 SDLoc DL(N);
55427 EVT VT = N->getValueType(0);
55428 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55429 SDValue Res1 = DAG.getNode(
55430 ISD::AND, DL, VT,
55431 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55432 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55433 DAG.getConstant(1, DL, VT));
55434 return DCI.CombineTo(N, Res1, CarryOut);
55435 }
55436
55437 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55438 // iff the flag result is dead.
55439 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55440 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55441 SDLoc DL(N);
55442 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55443 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55444 DAG.getConstant(0, DL, LHS.getValueType()),
55445 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55446 }
55447
55448 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55449 MVT VT = N->getSimpleValueType(0);
55450 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55451 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55452 }
55453
55454 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55455 // iff the flag result is dead.
55456 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55457 !N->hasAnyUseOfValue(1))
55458 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55459 LHS.getOperand(1), CarryIn);
55460
55461 return SDValue();
55462}
55463
55464static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55465 const SDLoc &DL, EVT VT,
55466 const X86Subtarget &Subtarget) {
55467 // Example of pattern we try to detect:
55468 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55469 //(add (build_vector (extract_elt t, 0),
55470 // (extract_elt t, 2),
55471 // (extract_elt t, 4),
55472 // (extract_elt t, 6)),
55473 // (build_vector (extract_elt t, 1),
55474 // (extract_elt t, 3),
55475 // (extract_elt t, 5),
55476 // (extract_elt t, 7)))
55477
55478 if (!Subtarget.hasSSE2())
55479 return SDValue();
55480
55481 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55482 Op1.getOpcode() != ISD::BUILD_VECTOR)
55483 return SDValue();
55484
55485 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55486 VT.getVectorNumElements() < 4 ||
55487 !isPowerOf2_32(VT.getVectorNumElements()))
55488 return SDValue();
55489
55490 // Check if one of Op0,Op1 is of the form:
55491 // (build_vector (extract_elt Mul, 0),
55492 // (extract_elt Mul, 2),
55493 // (extract_elt Mul, 4),
55494 // ...
55495 // the other is of the form:
55496 // (build_vector (extract_elt Mul, 1),
55497 // (extract_elt Mul, 3),
55498 // (extract_elt Mul, 5),
55499 // ...
55500 // and identify Mul.
55501 SDValue Mul;
55502 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55503 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55504 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55505 // TODO: Be more tolerant to undefs.
55506 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55507 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55508 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55509 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55510 return SDValue();
55511 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55512 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55513 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55514 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55515 if (!Const0L || !Const1L || !Const0H || !Const1H)
55516 return SDValue();
55517 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55518 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55519 // Commutativity of mul allows factors of a product to reorder.
55520 if (Idx0L > Idx1L)
55521 std::swap(Idx0L, Idx1L);
55522 if (Idx0H > Idx1H)
55523 std::swap(Idx0H, Idx1H);
55524 // Commutativity of add allows pairs of factors to reorder.
55525 if (Idx0L > Idx0H) {
55526 std::swap(Idx0L, Idx0H);
55527 std::swap(Idx1L, Idx1H);
55528 }
55529 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55530 Idx1H != 2 * i + 3)
55531 return SDValue();
55532 if (!Mul) {
55533 // First time an extract_elt's source vector is visited. Must be a MUL
55534 // with 2X number of vector elements than the BUILD_VECTOR.
55535 // Both extracts must be from same MUL.
55536 Mul = Op0L->getOperand(0);
55537 if (Mul->getOpcode() != ISD::MUL ||
55538 Mul.getValueType().getVectorNumElements() != 2 * e)
55539 return SDValue();
55540 }
55541 // Check that the extract is from the same MUL previously seen.
55542 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55543 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55544 return SDValue();
55545 }
55546
55547 // Check if the Mul source can be safely shrunk.
55548 ShrinkMode Mode;
55549 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55550 Mode == ShrinkMode::MULU16)
55551 return SDValue();
55552
55553 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55554 VT.getVectorNumElements() * 2);
55555 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55556 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55557
55558 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55559 ArrayRef<SDValue> Ops) {
55560 EVT InVT = Ops[0].getValueType();
55561 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55562 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55563 InVT.getVectorNumElements() / 2);
55564 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55565 };
55566 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55567}
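// Editorial note (not part of the original source): VPMADDWD multiplies
// adjacent pairs of signed i16 elements and sums each pair into an i32 lane,
//   result[i] = a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1],
// so with a = {1, 2, ...} and b = {10, 20, ...} the first i32 lane is
// 1*10 + 2*20 = 50, which is exactly the add-of-even/odd-extracts pattern
// matched above.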
55568
55569// Attempt to turn this pattern into PMADDWD.
55570// (add (mul (sext (build_vector)), (sext (build_vector))),
55571// (mul (sext (build_vector)), (sext (build_vector)))
55572static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55573 const SDLoc &DL, EVT VT,
55574 const X86Subtarget &Subtarget) {
55575 if (!Subtarget.hasSSE2())
55576 return SDValue();
55577
55578 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55579 return SDValue();
55580
55581 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55582 VT.getVectorNumElements() < 4 ||
55583 !isPowerOf2_32(VT.getVectorNumElements()))
55584 return SDValue();
55585
55586 SDValue N00 = N0.getOperand(0);
55587 SDValue N01 = N0.getOperand(1);
55588 SDValue N10 = N1.getOperand(0);
55589 SDValue N11 = N1.getOperand(1);
55590
55591 // All inputs need to be sign extends.
55592 // TODO: Support ZERO_EXTEND from known positive?
55593 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55594 N01.getOpcode() != ISD::SIGN_EXTEND ||
55595 N10.getOpcode() != ISD::SIGN_EXTEND ||
55596 N11.getOpcode() != ISD::SIGN_EXTEND)
55597 return SDValue();
55598
55599 // Peek through the extends.
55600 N00 = N00.getOperand(0);
55601 N01 = N01.getOperand(0);
55602 N10 = N10.getOperand(0);
55603 N11 = N11.getOperand(0);
55604
55605 // Must be extending from vXi16.
55606 EVT InVT = N00.getValueType();
55607 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55608 N10.getValueType() != InVT || N11.getValueType() != InVT)
55609 return SDValue();
55610
55611 // All inputs should be build_vectors.
55612 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55613 N01.getOpcode() != ISD::BUILD_VECTOR ||
55614 N10.getOpcode() != ISD::BUILD_VECTOR ||
55615 N11.getOpcode() != ISD::BUILD_VECTOR)
55616 return SDValue();
55617
55618 // For each element, we need to ensure we have an odd element from one vector
55619 // multiplied by the odd element of another vector and the even element from
55620 // one of the same vectors being multiplied by the even element from the
55621 // other vector. So we need to make sure that, for each element i, this
55622 // operation is being performed:
55623 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55624 SDValue In0, In1;
55625 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55626 SDValue N00Elt = N00.getOperand(i);
55627 SDValue N01Elt = N01.getOperand(i);
55628 SDValue N10Elt = N10.getOperand(i);
55629 SDValue N11Elt = N11.getOperand(i);
55630 // TODO: Be more tolerant to undefs.
55631 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55632 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55633 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55634 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55635 return SDValue();
55636 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55637 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55638 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55639 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55640 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55641 return SDValue();
55642 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55643 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55644 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55645 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55646 // Add is commutative so indices can be reordered.
55647 if (IdxN00 > IdxN10) {
55648 std::swap(IdxN00, IdxN10);
55649 std::swap(IdxN01, IdxN11);
55650 }
55651 // N0 indices must be the even elements; N1 indices must be the next odd elements.
55652 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55653 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55654 return SDValue();
55655 SDValue N00In = N00Elt.getOperand(0);
55656 SDValue N01In = N01Elt.getOperand(0);
55657 SDValue N10In = N10Elt.getOperand(0);
55658 SDValue N11In = N11Elt.getOperand(0);
55659
55660 // First time we find an input capture it.
55661 if (!In0) {
55662 In0 = N00In;
55663 In1 = N01In;
55664
55665 // The input vectors must be at least as wide as the output.
55666 // If they are larger than the output, we extract a subvector below.
55667 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55668 In1.getValueSizeInBits() < VT.getSizeInBits())
55669 return SDValue();
55670 }
55671 // Mul is commutative so the input vectors can be in any order.
55672 // Canonicalize to make the compares easier.
55673 if (In0 != N00In)
55674 std::swap(N00In, N01In);
55675 if (In0 != N10In)
55676 std::swap(N10In, N11In);
55677 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55678 return SDValue();
55679 }
55680
55681 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55682 ArrayRef<SDValue> Ops) {
55683 EVT OpVT = Ops[0].getValueType();
55684 assert(OpVT.getScalarType() == MVT::i16 &&
55685        "Unexpected scalar element type");
55686 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55687 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55688 OpVT.getVectorNumElements() / 2);
55689 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55690 };
55691
55692 // If the output is narrower than an input, extract the low part of the input
55693 // vector.
55694 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55695 VT.getVectorNumElements() * 2);
55696 if (OutVT16.bitsLT(In0.getValueType())) {
55697 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55698 DAG.getIntPtrConstant(0, DL));
55699 }
55700 if (OutVT16.bitsLT(In1.getValueType())) {
55701 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55702 DAG.getIntPtrConstant(0, DL));
55703 }
55704 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55705 PMADDBuilder);
55706}
55707
55708// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55709// If upper element in each pair of both VPMADDWD are zero then we can merge
55710// the operand elements and use the implicit add of VPMADDWD.
55711// TODO: Add support for VPMADDUBSW (which isn't commutable).
55712static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55713 const SDLoc &DL, EVT VT) {
55714 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55715 return SDValue();
55716
55717 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55718 if (VT.getSizeInBits() > 128)
55719 return SDValue();
55720
55721 unsigned NumElts = VT.getVectorNumElements();
55722 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55723 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55724 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55725
55726 bool Op0HiZero =
55727 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55728 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55729 bool Op1HiZero =
55730 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55731 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55732
55733 // TODO: Check for zero lower elements once we have actual codegen that
55734 // creates them.
55735 if (!Op0HiZero || !Op1HiZero)
55736 return SDValue();
55737
55738 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55739 SmallVector<int> Mask;
55740 for (int i = 0; i != (int)NumElts; ++i) {
55741 Mask.push_back(2 * i);
55742 Mask.push_back(2 * (i + NumElts));
55743 }
55744
55745 SDValue LHS =
55746 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55747 SDValue RHS =
55748 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55749 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55750}
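// Editorial note (not part of the original source): when the odd element of
// each pair is known zero in at least one operand of each VPMADDWD, every
// pair contributes only its even product, so
//   PMADDWD(X,Y)[i] + PMADDWD(Z,W)[i] == X[2i]*Y[2i] + Z[2i]*W[2i],
// which is exactly PMADDWD applied to the two interleaving shuffles built
// above.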
55751
55752/// CMOV of constants requires materializing constant operands in registers.
55753/// Try to fold those constants into an 'add' instruction to reduce instruction
55754 /// count. We do this with CMOV rather than the generic 'select' because there are
55755/// earlier folds that may be used to turn select-of-constants into logic hacks.
55756static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
55757 const X86Subtarget &Subtarget) {
55758 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55759 // better because we eliminate 1-2 instructions. This transform is still
55760 // an improvement without zero operands because we trade 2 constant moves and
55761 // 1 add for 2 adds (LEAs), as long as the constants can be represented as
55762 // immediate asm operands (i.e. they fit in 32 bits).
55763 auto isSuitableCmov = [](SDValue V) {
55764 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55765 return false;
55766 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55767 !isa<ConstantSDNode>(V.getOperand(1)))
55768 return false;
55769 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55770 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55771 V.getConstantOperandAPInt(1).isSignedIntN(32));
55772 };
55773
55774 // Match an appropriate CMOV as the first operand of the add.
55775 SDValue Cmov = N->getOperand(0);
55776 SDValue OtherOp = N->getOperand(1);
55777 if (!isSuitableCmov(Cmov))
55778 std::swap(Cmov, OtherOp);
55779 if (!isSuitableCmov(Cmov))
55780 return SDValue();
55781
55782 // Don't remove a load folding opportunity for the add. That would neutralize
55783 // any improvements from removing constant materializations.
55784 if (X86::mayFoldLoad(OtherOp, Subtarget))
55785 return SDValue();
55786
55787 EVT VT = N->getValueType(0);
55788 SDLoc DL(N);
55789 SDValue FalseOp = Cmov.getOperand(0);
55790 SDValue TrueOp = Cmov.getOperand(1);
55791
55792 // We will push the add through the select, but we can potentially do better
55793 // if we know there is another add in the sequence and this is pointer math.
55794 // In that case, we can absorb an add into the trailing memory op and avoid
55795 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55796 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55797 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55798 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55799 all_of(N->uses(), [&](SDNode *Use) {
55800 auto *MemNode = dyn_cast<MemSDNode>(Use);
55801 return MemNode && MemNode->getBasePtr().getNode() == N;
55802 })) {
55803 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55804 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55805 // it is possible that choosing op1 might be better.
55806 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55807 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55808 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55809 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55810 Cmov.getOperand(2), Cmov.getOperand(3));
55811 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55812 }
55813
55814 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55815 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55816 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55817 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55818 Cmov.getOperand(3));
55819}
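// Editorial illustration (not part of the original source): for
// add (cmov 0, 42), %x the generic fold above produces
// cmov %x, (add %x, 42); the zero operand makes one of the pushed adds fold
// away, so both constant materializations disappear and a single add (or LEA)
// feeds the cmov.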
55820
55821static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55822 TargetLowering::DAGCombinerInfo &DCI,
55823 const X86Subtarget &Subtarget) {
55824 EVT VT = N->getValueType(0);
55825 SDValue Op0 = N->getOperand(0);
55826 SDValue Op1 = N->getOperand(1);
55827 SDLoc DL(N);
55828
55829 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55830 return Select;
55831
55832 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55833 return MAdd;
55834 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55835 return MAdd;
55836 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55837 return MAdd;
55838
55839 // Try to synthesize horizontal adds from adds of shuffles.
55840 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55841 return V;
55842
55843 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55844 // (sub Y, (sext (vXi1 X))).
55845 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55846 // generic DAG combine without a legal type check, but adding this there
55847 // caused regressions.
55848 if (VT.isVector()) {
55849 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55850 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55851 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55852 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55853 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55854 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55855 }
55856
55857 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55858 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55859 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55860 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55861 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55862 }
55863 }
55864
55865 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55866 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55867 X86::isZeroNode(Op0.getOperand(1))) {
55868 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55869 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55870 Op0.getOperand(0), Op0.getOperand(2));
55871 }
55872
55873 return combineAddOrSubToADCOrSBB(N, DAG);
55874}
55875
55876// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55877// condition comes from the subtract node that produced -X. This matches the
55878// cmov expansion for absolute value. By swapping the operands we convert abs
55879// to nabs.
55880static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55881 SDValue N0 = N->getOperand(0);
55882 SDValue N1 = N->getOperand(1);
55883
55884 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55885 return SDValue();
55886
55887 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55888 if (CC != X86::COND_S && CC != X86::COND_NS)
55889 return SDValue();
55890
55891 // Condition should come from a negate operation.
55892 SDValue Cond = N1.getOperand(3);
55893 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55894 return SDValue();
55895 assert(Cond.getResNo() == 1 && "Unexpected result number");
55896
55897 // Get the X and -X from the negate.
55898 SDValue NegX = Cond.getValue(0);
55899 SDValue X = Cond.getOperand(1);
55900
55901 SDValue FalseOp = N1.getOperand(0);
55902 SDValue TrueOp = N1.getOperand(1);
55903
55904 // Cmov operands should be X and NegX. Order doesn't matter.
55905 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55906 return SDValue();
55907
55908 // Build a new CMOV with the operands swapped.
55909 SDLoc DL(N);
55910 MVT VT = N->getSimpleValueType(0);
55911 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55912 N1.getOperand(2), Cond);
55913 // Convert sub to add.
55914 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55915}
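// Editorial note (not part of the original source): because the cmov's two
// operands are X and -X selected on the sign produced by the negate, swapping
// them negates the selected value, so
//   sub Y, cmov(X, -X)  ==  add Y, cmov(-X, X)
// for either condition, turning an abs selection into a nabs selection and the
// sub into an add.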
55916
55917static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55918 SDValue Op0 = N->getOperand(0);
55919 SDValue Op1 = N->getOperand(1);
55920
55921 // (sub C (zero_extend (setcc)))
55922 // =>
55923 // (add (zero_extend (setcc inverted)), C-1) if C is a nonzero immediate
55924 // Don't disturb (sub 0 setcc), which is easily done with neg.
55925 EVT VT = N->getValueType(0);
55926 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55927 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55928 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55929 Op1.getOperand(0).hasOneUse()) {
55930 SDValue SetCC = Op1.getOperand(0);
55931 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55932 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55933 uint64_t NewImm = Op0C->getZExtValue() - 1;
55934 SDLoc DL(Op1);
55935 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55936 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55937 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55938 DAG.getConstant(NewImm, DL, VT));
55939 }
55940
55941 return SDValue();
55942}
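// Editorial check (not part of the original source) of the fold above with
// C = 5: the original computes 5 - (cc ? 1 : 0), i.e. 4 when cc holds and 5
// otherwise; the replacement computes (!cc ? 1 : 0) + 4, which yields the same
// two values, so the transform is exact for any nonzero immediate C.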
55943
55944static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55945 TargetLowering::DAGCombinerInfo &DCI,
55946 const X86Subtarget &Subtarget) {
55947 SDValue Op0 = N->getOperand(0);
55948 SDValue Op1 = N->getOperand(1);
55949
55950 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55951 auto IsNonOpaqueConstant = [&](SDValue Op) {
55952 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55953 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55954 return !Cst->isOpaque();
55955 return true;
55956 }
55957 return false;
55958 };
55959
55960 // X86 can't encode an immediate LHS of a sub. See if we can push the
55961 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55962 // one use and a constant, invert the immediate, saving one register.
55963 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
55964 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55965 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
55966 SDLoc DL(N);
55967 EVT VT = Op0.getValueType();
55968 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55969 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55970 SDValue NewAdd =
55971 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55972 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55973 }
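// Editorial derivation (not part of the original source): in two's complement
//   -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1,
// so C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1), which is exactly the xor/add form
// built above with the inverted immediate.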
55974
55975 if (SDValue V = combineSubABS(N, DAG))
55976 return V;
55977
55978 // Try to synthesize horizontal subs from subs of shuffles.
55979 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55980 return V;
55981
55982 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55983 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55984 X86::isZeroNode(Op1.getOperand(1))) {
55985 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55986 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55987 Op1.getOperand(0), Op1.getOperand(2));
55988 }
55989
55990 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55991 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55992 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55993 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55994 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55995 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55996 Op1.getOperand(1), Op1.getOperand(2));
55997 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55998 Op1.getOperand(0));
55999 }
56000
56001 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56002 return V;
56003
56004 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56005 return V;
56006
56007 return combineSubSetcc(N, DAG);
56008}
56009
56010static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56011 const X86Subtarget &Subtarget) {
56012 MVT VT = N->getSimpleValueType(0);
56013 SDLoc DL(N);
56014
56015 if (N->getOperand(0) == N->getOperand(1)) {
56016 if (N->getOpcode() == X86ISD::PCMPEQ)
56017 return DAG.getConstant(-1, DL, VT);
56018 if (N->getOpcode() == X86ISD::PCMPGT)
56019 return DAG.getConstant(0, DL, VT);
56020 }
56021
56022 return SDValue();
56023}
56024
56025/// Helper that combines an array of subvector ops as if they were the operands
56026 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56027/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56028static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56029 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56030 TargetLowering::DAGCombinerInfo &DCI,
56031 const X86Subtarget &Subtarget) {
56032 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56033 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56034
56035 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56036 return DAG.getUNDEF(VT);
56037
56038 if (llvm::all_of(Ops, [](SDValue Op) {
56039 return ISD::isBuildVectorAllZeros(Op.getNode());
56040 }))
56041 return getZeroVector(VT, Subtarget, DAG, DL);
56042
56043 SDValue Op0 = Ops[0];
56044 bool IsSplat = llvm::all_equal(Ops);
56045
56046 // Repeated subvectors.
56047 if (IsSplat &&
56048 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56049 // If this broadcast is inserted into both halves, use a larger broadcast.
56050 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56051 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56052
56053 // If this simple subvector or scalar/subvector broadcast_load is inserted
56054 // into both halves, use a larger broadcast_load. Update other uses to use
56055 // an extracted subvector.
56056 if (ISD::isNormalLoad(Op0.getNode()) ||
56057 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56058 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56059 auto *Mem = cast<MemSDNode>(Op0);
56060 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56061 ? X86ISD::VBROADCAST_LOAD
56062 : X86ISD::SUBV_BROADCAST_LOAD;
56063 if (SDValue BcastLd =
56064 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56065 SDValue BcastSrc =
56066 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56067 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56068 return BcastLd;
56069 }
56070 }
56071
56072 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56073 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56074 (Subtarget.hasAVX2() ||
56075 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56076 VT.getScalarType(), Subtarget)))
56077 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56078 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56079 Op0.getOperand(0),
56080 DAG.getIntPtrConstant(0, DL)));
56081
56082 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56083 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56084 (Subtarget.hasAVX2() ||
56085 (EltSizeInBits >= 32 &&
56086 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56087 Op0.getOperand(0).getValueType() == VT.getScalarType())
56088 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56089
56090 // concat_vectors(extract_subvector(broadcast(x)),
56091 // extract_subvector(broadcast(x))) -> broadcast(x)
56092 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56093 Op0.getOperand(0).getValueType() == VT) {
56094 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56095 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56096 return Op0.getOperand(0);
56097 }
56098 }
56099
56100 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56101 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
56102 // TODO: This should go in combineX86ShufflesRecursively eventually.
56103 if (VT.is256BitVector() && Ops.size() == 2) {
56104 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56105 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56106 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56107 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56108 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56109 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56110 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56111 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56112 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56113 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56114 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56115 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56116 DAG.getBitcast(VT, Src0.getOperand(0)),
56117 DAG.getBitcast(VT, Src1.getOperand(0)),
56118 DAG.getTargetConstant(0x31, DL, MVT::i8));
56119 }
56120 }
56121 }
56122
56123 // Repeated opcode.
56124 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56125 // but it currently struggles with different vector widths.
56126 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56127 return Op.getOpcode() == Op0.getOpcode();
56128 })) {
56129 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56130 SmallVector<SDValue> Subs;
56131 for (SDValue SubOp : SubOps)
56132 Subs.push_back(SubOp.getOperand(I));
56133 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56134 };
56135 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56136 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56137 SDValue Sub = SubOps[I].getOperand(Op);
56138 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56139 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56140 Sub.getOperand(0).getValueType() != VT ||
56141 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56142 return false;
56143 }
56144 return true;
56145 };
56146
56147 unsigned NumOps = Ops.size();
56148 switch (Op0.getOpcode()) {
56149 case X86ISD::VBROADCAST: {
56150 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56151 return Op.getOperand(0).getValueType().is128BitVector();
56152 })) {
56153 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56154 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56155 ConcatSubOperand(VT, Ops, 0),
56156 ConcatSubOperand(VT, Ops, 0));
56157 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56158 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56159 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56160 : X86ISD::PSHUFD,
56161 DL, VT, ConcatSubOperand(VT, Ops, 0),
56162 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56163 }
56164 break;
56165 }
56166 case X86ISD::MOVDDUP:
56167 case X86ISD::MOVSHDUP:
56168 case X86ISD::MOVSLDUP: {
56169 if (!IsSplat)
56170 return DAG.getNode(Op0.getOpcode(), DL, VT,
56171 ConcatSubOperand(VT, Ops, 0));
56172 break;
56173 }
56174 case X86ISD::SHUFP: {
56175 // Add SHUFPD support if/when necessary.
56176 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56177 llvm::all_of(Ops, [Op0](SDValue Op) {
56178 return Op.getOperand(2) == Op0.getOperand(2);
56179 })) {
56180 return DAG.getNode(Op0.getOpcode(), DL, VT,
56181 ConcatSubOperand(VT, Ops, 0),
56182 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56183 }
56184 break;
56185 }
56186 case X86ISD::PSHUFHW:
56187 case X86ISD::PSHUFLW:
56188 case X86ISD::PSHUFD:
56189 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56190 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56191 return DAG.getNode(Op0.getOpcode(), DL, VT,
56192 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56193 }
56194 [[fallthrough]];
56195 case X86ISD::VPERMILPI:
56196 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56197 (VT.is256BitVector() ||
56198 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56199 all_of(Ops, [&Op0](SDValue Op) {
56200 return Op0.getOperand(1) == Op.getOperand(1);
56201 })) {
56202 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56203 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56204 Res =
56205 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56206 return DAG.getBitcast(VT, Res);
56207 }
56208 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56209 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56210 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56211 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56212 return DAG.getNode(Op0.getOpcode(), DL, VT,
56213 ConcatSubOperand(VT, Ops, 0),
56214 DAG.getTargetConstant(Idx, DL, MVT::i8));
56215 }
56216 break;
56217 case X86ISD::PSHUFB:
56218 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56219 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56220 return DAG.getNode(Op0.getOpcode(), DL, VT,
56221 ConcatSubOperand(VT, Ops, 0),
56222 ConcatSubOperand(VT, Ops, 1));
56223 }
56224 break;
56225 case X86ISD::VPERMV:
56226 if (!IsSplat && NumOps == 2 &&
56227 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56228 MVT OpVT = Op0.getSimpleValueType();
56229 int NumSrcElts = OpVT.getVectorNumElements();
56230 SmallVector<int, 64> ConcatMask;
56231 for (unsigned i = 0; i != NumOps; ++i) {
56232 SmallVector<int, 64> SubMask;
56233 SmallVector<SDValue, 2> SubOps;
56234 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56235 SubMask))
56236 break;
56237 for (int M : SubMask) {
56238 if (0 <= M)
56239 M += i * NumSrcElts;
56240 ConcatMask.push_back(M);
56241 }
56242 }
56243 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56244 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56245 Ops[1].getOperand(1), DAG, DL);
56246 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56247 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56248 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56249 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56250 }
56251 }
56252 break;
56253 case X86ISD::VPERMV3:
56254 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56255 MVT OpVT = Op0.getSimpleValueType();
56256 int NumSrcElts = OpVT.getVectorNumElements();
56257 SmallVector<int, 64> ConcatMask;
56258 for (unsigned i = 0; i != NumOps; ++i) {
56259 SmallVector<int, 64> SubMask;
56260 SmallVector<SDValue, 2> SubOps;
56261 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56262 SubMask))
56263 break;
56264 for (int M : SubMask) {
56265 if (0 <= M) {
56266 M += M < NumSrcElts ? 0 : NumSrcElts;
56267 M += i * NumSrcElts;
56268 }
56269 ConcatMask.push_back(M);
56270 }
56271 }
56272 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56273 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56274 Ops[1].getOperand(0), DAG, DL);
56275 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56276 Ops[1].getOperand(2), DAG, DL);
56277 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56278 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56279 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56280 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56281 }
56282 }
56283 break;
56284 case ISD::TRUNCATE:
56285 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56286 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56287 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56288 SrcVT == Ops[1].getOperand(0).getValueType() &&
56289 Subtarget.useAVX512Regs() &&
56290 Subtarget.getPreferVectorWidth() >= 512 &&
56291 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56292 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56293 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56294 ConcatSubOperand(NewSrcVT, Ops, 0));
56295 }
56296 }
56297 break;
56298 case X86ISD::VSHLI:
56299 case X86ISD::VSRLI:
56300 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56301 // TODO: Move this to LowerShiftByScalarImmediate?
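// For example, on AVX1 the fold below turns (v4i64 shl X, 32) into a shuffle
// of bitcast<v8i32>(X) with zero, moving each element's low 32 bits into its
// high half and zero-filling the low half.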
56302 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56303 llvm::all_of(Ops, [](SDValue Op) {
56304 return Op.getConstantOperandAPInt(1) == 32;
56305 })) {
56306 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56307 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56308 if (Op0.getOpcode() == X86ISD::VSHLI) {
56309 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56310 {8, 0, 8, 2, 8, 4, 8, 6});
56311 } else {
56312 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56313 {1, 8, 3, 8, 5, 8, 7, 8});
56314 }
56315 return DAG.getBitcast(VT, Res);
56316 }
56317 [[fallthrough]];
56318 case X86ISD::VSRAI:
56319 case X86ISD::VSHL:
56320 case X86ISD::VSRL:
56321 case X86ISD::VSRA:
56322 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56323 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56324 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56325 llvm::all_of(Ops, [Op0](SDValue Op) {
56326 return Op0.getOperand(1) == Op.getOperand(1);
56327 })) {
56328 return DAG.getNode(Op0.getOpcode(), DL, VT,
56329 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56330 }
56331 break;
56332 case X86ISD::VPERMI:
56333 case X86ISD::VROTLI:
56334 case X86ISD::VROTRI:
56335 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56336 llvm::all_of(Ops, [Op0](SDValue Op) {
56337 return Op0.getOperand(1) == Op.getOperand(1);
56338 })) {
56339 return DAG.getNode(Op0.getOpcode(), DL, VT,
56340 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56341 }
56342 break;
56343 case ISD::AND:
56344 case ISD::OR:
56345 case ISD::XOR:
56346 case X86ISD::ANDNP:
56347 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56348 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56349 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56350 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56351 NumOps * SrcVT.getVectorNumElements());
56352 return DAG.getNode(Op0.getOpcode(), DL, VT,
56353 ConcatSubOperand(SrcVT, Ops, 0),
56354 ConcatSubOperand(SrcVT, Ops, 1));
56355 }
56356 break;
56357 case X86ISD::GF2P8AFFINEQB:
56358 if (!IsSplat &&
56359 (VT.is256BitVector() ||
56360 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56361 llvm::all_of(Ops, [Op0](SDValue Op) {
56362 return Op0.getOperand(2) == Op.getOperand(2);
56363 })) {
56364 return DAG.getNode(Op0.getOpcode(), DL, VT,
56365 ConcatSubOperand(VT, Ops, 0),
56366 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56367 }
56368 break;
56369 case ISD::ADD:
56370 case ISD::SUB:
56371 case ISD::MUL:
56372 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56373 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56374 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56375 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56376 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56377 NumOps * SrcVT.getVectorNumElements());
56378 return DAG.getNode(Op0.getOpcode(), DL, VT,
56379 ConcatSubOperand(SrcVT, Ops, 0),
56380 ConcatSubOperand(SrcVT, Ops, 1));
56381 }
56382 break;
56383 case ISD::FADD:
56384 case ISD::FSUB:
56385 case ISD::FMUL:
56386 case ISD::FDIV:
56387 if (!IsSplat && (VT.is256BitVector() ||
56388 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56389 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56390 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56391 NumOps * SrcVT.getVectorNumElements());
56392 return DAG.getNode(Op0.getOpcode(), DL, VT,
56393 ConcatSubOperand(SrcVT, Ops, 0),
56394 ConcatSubOperand(SrcVT, Ops, 1));
56395 }
56396 break;
56397 case X86ISD::HADD:
56398 case X86ISD::HSUB:
56399 case X86ISD::FHADD:
56400 case X86ISD::FHSUB:
56401 case X86ISD::PACKSS:
56402 case X86ISD::PACKUS:
56403 if (!IsSplat && VT.is256BitVector() &&
56404 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56405 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56406 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56407 NumOps * SrcVT.getVectorNumElements());
56408 return DAG.getNode(Op0.getOpcode(), DL, VT,
56409 ConcatSubOperand(SrcVT, Ops, 0),
56410 ConcatSubOperand(SrcVT, Ops, 1));
56411 }
56412 break;
56413 case X86ISD::PALIGNR:
56414 if (!IsSplat &&
56415 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56416 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56417 llvm::all_of(Ops, [Op0](SDValue Op) {
56418 return Op0.getOperand(2) == Op.getOperand(2);
56419 })) {
56420 return DAG.getNode(Op0.getOpcode(), DL, VT,
56421 ConcatSubOperand(VT, Ops, 0),
56422 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56423 }
56424 break;
56425 case ISD::VSELECT:
56426 if (!IsSplat && Subtarget.hasAVX512() &&
56427 (VT.is256BitVector() ||
56428 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56429 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56430 EVT SelVT = Ops[0].getOperand(0).getValueType();
56431 if (SelVT.getVectorElementType() == MVT::i1) {
56432 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56433 Ops.size() * SelVT.getVectorNumElements());
56434 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56435 return DAG.getNode(Op0.getOpcode(), DL, VT,
56436 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56437 ConcatSubOperand(VT, Ops, 1),
56438 ConcatSubOperand(VT, Ops, 2));
56439 }
56440 }
56441 [[fallthrough]];
56442 case X86ISD::BLENDV:
56443 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56444 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56445 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56446 EVT SelVT = Ops[0].getOperand(0).getValueType();
56447 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56448 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56449 return DAG.getNode(Op0.getOpcode(), DL, VT,
56450 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56451 ConcatSubOperand(VT, Ops, 1),
56452 ConcatSubOperand(VT, Ops, 2));
56453 }
56454 break;
56455 }
56456 }
56457
56458 // Fold subvector loads into one.
56459 // If needed, look through bitcasts to get to the load.
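// For example, concat(load v4f32 [P], load v4f32 [P+16]) can become a single
// v8f32 load [P] when the loads are consecutive and a fast 256-bit access is
// allowed by the target.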
56460 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56461 unsigned Fast;
56462 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56463 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56464 *FirstLd->getMemOperand(), &Fast) &&
56465 Fast) {
56466 if (SDValue Ld =
56467 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56468 return Ld;
56469 }
56470 }
56471
56472 // Attempt to fold target constant loads.
56473 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56474 SmallVector<APInt> EltBits;
56475 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56476 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56477 APInt OpUndefElts;
56478 SmallVector<APInt> OpEltBits;
56479 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56480 OpEltBits, true, false))
56481 break;
56482 EltBits.append(OpEltBits);
56483 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56484 }
56485 if (EltBits.size() == VT.getVectorNumElements())
56486 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56487 }
56488
56489 return SDValue();
56490}
56491
56492static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56493 TargetLowering::DAGCombinerInfo &DCI,
56494 const X86Subtarget &Subtarget) {
56495 EVT VT = N->getValueType(0);
56496 EVT SrcVT = N->getOperand(0).getValueType();
56497 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56498 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56499
56500 if (VT.getVectorElementType() == MVT::i1) {
56501 // Attempt to constant fold.
56502 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56503 APInt Constant = APInt::getZero(VT.getSizeInBits());
56504 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56505 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56506 if (!C) break;
56507 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56508 if (I == (E - 1)) {
56509 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56510 if (TLI.isTypeLegal(IntVT))
56511 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56512 }
56513 }
56514
56515 // Don't do anything else for i1 vectors.
56516 return SDValue();
56517 }
56518
56519 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56520 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56521 DCI, Subtarget))
56522 return R;
56523 }
56524
56525 return SDValue();
56526}
56527
56528static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56529 TargetLowering::DAGCombinerInfo &DCI,
56530 const X86Subtarget &Subtarget) {
56531 if (DCI.isBeforeLegalizeOps())
56532 return SDValue();
56533
56534 MVT OpVT = N->getSimpleValueType(0);
56535
56536 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56537
56538 SDLoc dl(N);
56539 SDValue Vec = N->getOperand(0);
56540 SDValue SubVec = N->getOperand(1);
56541
56542 uint64_t IdxVal = N->getConstantOperandVal(2);
56543 MVT SubVecVT = SubVec.getSimpleValueType();
56544
56545 if (Vec.isUndef() && SubVec.isUndef())
56546 return DAG.getUNDEF(OpVT);
56547
56548 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56549 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56550 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56551 return getZeroVector(OpVT, Subtarget, DAG, dl);
56552
56553 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56554 // If we're inserting into a zero vector and then into a larger zero vector,
56555 // just insert into the larger zero vector directly.
56556 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56557 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56558 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56559 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56560 getZeroVector(OpVT, Subtarget, DAG, dl),
56561 SubVec.getOperand(1),
56562 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56563 }
56564
56565 // If we're inserting into a zero vector and our input was extracted from an
56566 // insert into a zero vector of the same type, and the extraction was at
56567 // least as large as the original insertion, just insert the original
56568 // subvector into a zero vector.
56569 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56570 isNullConstant(SubVec.getOperand(1)) &&
56571 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56572 SDValue Ins = SubVec.getOperand(0);
56573 if (isNullConstant(Ins.getOperand(2)) &&
56574 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56575 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56576 SubVecVT.getFixedSizeInBits())
56577 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56578 getZeroVector(OpVT, Subtarget, DAG, dl),
56579 Ins.getOperand(1), N->getOperand(2));
56580 }
56581 }
56582
56583 // Stop here if this is an i1 vector.
56584 if (IsI1Vector)
56585 return SDValue();
56586
56587 // Eliminate an intermediate vector widening:
56588 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56589 // insert_subvector X, Y, Idx
56590 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56591 // there?
56592 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56593 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56594 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56595 SubVec.getOperand(1), N->getOperand(2));
56596
56597 // If this is an insert of an extract, combine to a shuffle. Don't do this
56598 // if the insert or extract can be represented with a subregister operation.
56599 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56600 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56601 (IdxVal != 0 ||
56602 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56603 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56604 if (ExtIdxVal != 0) {
56605 int VecNumElts = OpVT.getVectorNumElements();
56606 int SubVecNumElts = SubVecVT.getVectorNumElements();
56607 SmallVector<int, 64> Mask(VecNumElts);
56608 // First create an identity shuffle mask.
56609 for (int i = 0; i != VecNumElts; ++i)
56610 Mask[i] = i;
56611 // Now insert the extracted portion.
56612 for (int i = 0; i != SubVecNumElts; ++i)
56613 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56614
56615 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56616 }
56617 }
56618
56619 // Match concat_vector style patterns.
56620 SmallVector<SDValue, 2> SubVectorOps;
56621 if (collectConcatOps(N, SubVectorOps, DAG)) {
56622 if (SDValue Fold =
56623 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56624 return Fold;
56625
56626 // If we're inserting all zeros into the upper half, change this to
56627 // a concat with zero. We will match this to a move
56628 // with implicit upper bit zeroing during isel.
56629 // We do this here because we don't want combineConcatVectorOps to
56630 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56631 if (SubVectorOps.size() == 2 &&
56632 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56633 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56634 getZeroVector(OpVT, Subtarget, DAG, dl),
56635 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56636 }
56637
56638 // If this is a broadcast insert into an upper undef, use a larger broadcast.
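// e.g. insert_subvector(v8f32 undef, (v4f32 vbroadcast X), 4)
//        --> (v8f32 vbroadcast X)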
56639 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56640 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56641
56642 // If this is a broadcast load inserted into an upper undef, use a larger
56643 // broadcast load.
56644 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56645 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56646 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56647 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56648 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56649 SDValue BcastLd =
56650 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56651 MemIntr->getMemoryVT(),
56652 MemIntr->getMemOperand());
56653 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56654 return BcastLd;
56655 }
56656
56657 // If we're splatting the lower half subvector of a full vector load into the
56658 // upper half, attempt to create a subvector broadcast.
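// e.g. insert_subvector((v8f32 load [P]), (v4f32 load [P]), 4)
//        --> (v8f32 subv_broadcast_load [P])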
56659 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56660 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56661 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56662 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56663 if (VecLd && SubLd &&
56664 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56665 SubVec.getValueSizeInBits() / 8, 0))
56666 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56667 SubLd, 0, DAG);
56668 }
56669
56670 return SDValue();
56671}
56672
56673/// If we are extracting a subvector of a vector select and the select condition
56674/// is composed of concatenated vectors, try to narrow the select width. This
56675/// is a common pattern for AVX1 integer code because 256-bit selects may be
56676/// legal, but there is almost no integer math/logic available for 256-bit.
56677/// This function should only be called with legal types (otherwise, the calls
56678/// to get simple value types will assert).
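/// For example, for a 128-bit extraction at index 0:
///   extract_subvector(vselect(concat(C0, C1), T, F), 0)
///     --> vselect(C0, extract_subvector(T, 0), extract_subvector(F, 0))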
56679static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56680 SDValue Sel = Ext->getOperand(0);
56681 SmallVector<SDValue, 4> CatOps;
56682 if (Sel.getOpcode() != ISD::VSELECT ||
56683 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56684 return SDValue();
56685
56686 // Note: We assume simple value types because this should only be called with
56687 // legal operations/types.
56688 // TODO: This can be extended to handle extraction to 256-bits.
56689 MVT VT = Ext->getSimpleValueType(0);
56690 if (!VT.is128BitVector())
56691 return SDValue();
56692
56693 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56694 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56695 return SDValue();
56696
56697 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56698 MVT SelVT = Sel.getSimpleValueType();
56699 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56700 "Unexpected vector type with legal operations");
56701
56702 unsigned SelElts = SelVT.getVectorNumElements();
56703 unsigned CastedElts = WideVT.getVectorNumElements();
56704 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56705 if (SelElts % CastedElts == 0) {
56706 // The select has the same or more (narrower) elements than the extract
56707 // operand. The extraction index gets scaled by that factor.
56708 ExtIdx *= (SelElts / CastedElts);
56709 } else if (CastedElts % SelElts == 0) {
56710 // The select has fewer (wider) elements than the extract operand. Make sure
56711 // that the extraction index can be divided evenly.
56712 unsigned IndexDivisor = CastedElts / SelElts;
56713 if (ExtIdx % IndexDivisor != 0)
56714 return SDValue();
56715 ExtIdx /= IndexDivisor;
56716 } else {
56717 llvm_unreachable("Element count of simple vector types are not divisible?");
56718 }
56719
56720 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56721 unsigned NarrowElts = SelElts / NarrowingFactor;
56722 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56723 SDLoc DL(Ext);
56724 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56725 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56726 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56727 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56728 return DAG.getBitcast(VT, NarrowSel);
56729}
56730
56731static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56732 TargetLowering::DAGCombinerInfo &DCI,
56733 const X86Subtarget &Subtarget) {
56734 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56735 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56736 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56737 // We let generic combining take over from there to simplify the
56738 // insert/extract and 'not'.
56739 // This pattern emerges during AVX1 legalization. We handle it before lowering
56740 // to avoid complications like splitting constant vector loads.
56741
56742 // Capture the original wide type in the likely case that we need to bitcast
56743 // back to this type.
56744 if (!N->getValueType(0).isSimple())
56745 return SDValue();
56746
56747 MVT VT = N->getSimpleValueType(0);
56748 SDValue InVec = N->getOperand(0);
56749 unsigned IdxVal = N->getConstantOperandVal(1);
56750 SDValue InVecBC = peekThroughBitcasts(InVec);
56751 EVT InVecVT = InVec.getValueType();
56752 unsigned SizeInBits = VT.getSizeInBits();
56753 unsigned InSizeInBits = InVecVT.getSizeInBits();
56754 unsigned NumSubElts = VT.getVectorNumElements();
56755 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56756
56757 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56758 TLI.isTypeLegal(InVecVT) &&
56759 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56760 auto isConcatenatedNot = [](SDValue V) {
56761 V = peekThroughBitcasts(V);
56762 if (!isBitwiseNot(V))
56763 return false;
56764 SDValue NotOp = V->getOperand(0);
56765 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56766 };
56767 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56768 isConcatenatedNot(InVecBC.getOperand(1))) {
56769 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56770 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
56771 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
56772 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56773 }
56774 }
56775
56776 if (DCI.isBeforeLegalizeOps())
56777 return SDValue();
56778
56779 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
56780 return V;
56781
56782 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56783 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56784
56785 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56786 if (VT.getScalarType() == MVT::i1)
56787 return DAG.getConstant(1, SDLoc(N), VT);
56788 return getOnesVector(VT, DAG, SDLoc(N));
56789 }
56790
56791 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56792 return DAG.getBuildVector(VT, SDLoc(N),
56793 InVec->ops().slice(IdxVal, NumSubElts));
56794
56795 // If we are extracting from an insert into a larger vector, replace with a
56796 // smaller insert if we don't access less than the original subvector. Don't
56797 // do this for i1 vectors.
56798 // TODO: Relax the matching indices requirement?
56799 if (VT.getVectorElementType() != MVT::i1 &&
56800 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56801 IdxVal == InVec.getConstantOperandVal(2) &&
56802 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56803 SDLoc DL(N);
56804 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56805 InVec.getOperand(0), N->getOperand(1));
56806 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56807 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56808 InVec.getOperand(1),
56809 DAG.getVectorIdxConstant(NewIdxVal, DL));
56810 }
56811
56812 // If we're extracting an upper subvector from a broadcast, we should just
56813 // extract the lowest subvector instead, which should allow
56814 // SimplifyDemandedVectorElts to do more simplifications.
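// e.g. extract_subvector(vbroadcast(X), 4) --> extract_subvector(vbroadcast(X), 0).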
56815 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56816 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56817 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56818 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56819
56820 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56821 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56822 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56823 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56824
56825 // Attempt to extract from the source of a shuffle vector.
56826 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56827 SmallVector<int, 32> ShuffleMask;
56828 SmallVector<int, 32> ScaledMask;
56829 SmallVector<SDValue, 2> ShuffleInputs;
56830 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56831 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56832 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56833 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56834 unsigned SubVecIdx = IdxVal / NumSubElts;
56835 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56836 return DAG.getUNDEF(VT);
56837 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56838 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56839 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56840 if (Src.getValueSizeInBits() == InSizeInBits) {
56841 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56842 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56843 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56844 SDLoc(N), SizeInBits);
56845 }
56846 }
56847 }
56848
56849 // If we're extracting the lowest subvector and we're the only user,
56850 // we may be able to perform this with a smaller vector width.
56851 unsigned InOpcode = InVec.getOpcode();
56852 if (InVec.hasOneUse()) {
56853 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56854 // v2f64 CVTDQ2PD(v4i32).
56855 if (InOpcode == ISD::SINT_TO_FP &&
56856 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56857 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
56858 }
56859 // v2f64 CVTUDQ2PD(v4i32).
56860 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56861 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56862 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
56863 }
56864 // v2f64 CVTPS2PD(v4f32).
56865 if (InOpcode == ISD::FP_EXTEND &&
56866 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56867 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
56868 }
56869 }
56870 if (IdxVal == 0 &&
56871 (InOpcode == ISD::ANY_EXTEND ||
56872 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56873 InOpcode == ISD::ZERO_EXTEND ||
56874 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
56875 InOpcode == ISD::SIGN_EXTEND ||
56876 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56877 (SizeInBits == 128 || SizeInBits == 256) &&
56878 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56879 SDLoc DL(N);
56880 SDValue Ext = InVec.getOperand(0);
56881 if (Ext.getValueSizeInBits() > SizeInBits)
56882 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56883 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56884 return DAG.getNode(ExtOp, DL, VT, Ext);
56885 }
56886 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56887 InVec.getOperand(0).getValueType().is256BitVector() &&
56888 InVec.getOperand(1).getValueType().is256BitVector() &&
56889 InVec.getOperand(2).getValueType().is256BitVector()) {
56890 SDLoc DL(N);
56891 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56892 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56893 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56894 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56895 }
56896 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56897 (VT.is128BitVector() || VT.is256BitVector())) {
56898 SDLoc DL(N);
56899 SDValue InVecSrc = InVec.getOperand(0);
56900 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56901 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56902 return DAG.getNode(InOpcode, DL, VT, Ext);
56903 }
56904 if (InOpcode == X86ISD::MOVDDUP &&
56905 (VT.is128BitVector() || VT.is256BitVector())) {
56906 SDLoc DL(N);
56907 SDValue Ext0 =
56908 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56909 return DAG.getNode(InOpcode, DL, VT, Ext0);
56910 }
56911 }
56912
56913 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
56914 // as this is very likely to fold into a shuffle/truncation.
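// e.g. extract_subvector(vsrli(v4i64 X, 32), Idx)
//        --> vsrli(extract_subvector(X, Idx), 32)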
56915 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56916 InVecVT.getScalarSizeInBits() == 64 &&
56917 InVec.getConstantOperandAPInt(1) == 32) {
56918 SDLoc DL(N);
56919 SDValue Ext =
56920 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56921 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56922 }
56923
56924 return SDValue();
56925}
56926
56927static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56928 EVT VT = N->getValueType(0);
56929 SDValue Src = N->getOperand(0);
56930 SDLoc DL(N);
56931
56932 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56933 // This occurs frequently in our masked scalar intrinsic code and our
56934 // floating point select lowering with AVX512.
56935 // TODO: SimplifyDemandedBits instead?
56936 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56937 isOneConstant(Src.getOperand(1)))
56938 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56939
56940 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56941 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56942 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56943 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
56944 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
56945 if (C->isZero())
56946 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56947 Src.getOperand(1));
56948
56949 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56950 // TODO: Move to DAGCombine/SimplifyDemandedBits?
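// e.g. (v2i64 scalar_to_vector(i64 zext(i32 X)))
//        --> bitcast(vzext_movl(v4i32 scalar_to_vector(X)))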
56951 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56952 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56953 if (Op.getValueType() != MVT::i64)
56954 return SDValue();
56955 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56956 if (Op.getOpcode() == Opc &&
56957 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56958 return Op.getOperand(0);
56959 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56960 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56961 if (Ld->getExtensionType() == Ext &&
56962 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56963 return Op;
56964 if (IsZeroExt) {
56965 KnownBits Known = DAG.computeKnownBits(Op);
56966 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56967 return Op;
56968 }
56969 return SDValue();
56970 };
56971
56972 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56973 return DAG.getBitcast(
56974 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56975 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56976
56977 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56978 return DAG.getBitcast(
56979 VT,
56980 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56981 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56982 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56983 }
56984
56985 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56986 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56987 Src.getOperand(0).getValueType() == MVT::x86mmx)
56988 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56989
56990 // See if we're broadcasting the scalar value, in which case just reuse that.
56991 // Ensure the same SDValue from the SDNode use is being used.
56992 if (VT.getScalarType() == Src.getValueType())
56993 for (SDNode *User : Src->uses())
56994 if (User->getOpcode() == X86ISD::VBROADCAST &&
56995 Src == User->getOperand(0)) {
56996 unsigned SizeInBits = VT.getFixedSizeInBits();
56997 unsigned BroadcastSizeInBits =
56998 User->getValueSizeInBits(0).getFixedValue();
56999 if (BroadcastSizeInBits == SizeInBits)
57000 return SDValue(User, 0);
57001 if (BroadcastSizeInBits > SizeInBits)
57002 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57003 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57004 // coverage.
57005 }
57006
57007 return SDValue();
57008}
57009
57010// Simplify PMULDQ and PMULUDQ operations.
57011static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57012 TargetLowering::DAGCombinerInfo &DCI,
57013 const X86Subtarget &Subtarget) {
57014 SDValue LHS = N->getOperand(0);
57015 SDValue RHS = N->getOperand(1);
57016
57017 // Canonicalize constant to RHS.
57018 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57019 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57020 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57021
57022 // Multiply by zero.
57023 // Don't return RHS as it may contain UNDEFs.
57024 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57025 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57026
57027 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57029 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57030 return SDValue(N, 0);
57031
57032 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57033 // convert it to any_extend_invec, due to the LegalOperations check, do the
57034 // conversion directly to a vector shuffle manually. This exposes combine
57035 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57036 // combineX86ShufflesRecursively on SSE4.1 targets.
57037 // FIXME: This is basically a hack around several other issues related to
57038 // ANY_EXTEND_VECTOR_INREG.
57039 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57040 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57041 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57042 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57043 SDLoc dl(N);
57044 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57045 LHS.getOperand(0), { 0, -1, 1, -1 });
57046 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57047 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57048 }
57049 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57050 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57051 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57052 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57053 SDLoc dl(N);
57054 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57055 RHS.getOperand(0), { 0, -1, 1, -1 });
57056 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57057 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57058 }
57059
57060 return SDValue();
57061}
57062
57063// Simplify VPMADDUBSW/VPMADDWD operations.
57064static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57065 TargetLowering::DAGCombinerInfo &DCI) {
57066 EVT VT = N->getValueType(0);
57067 SDValue LHS = N->getOperand(0);
57068 SDValue RHS = N->getOperand(1);
57069
57070 // Multiply by zero.
57071 // Don't return LHS/RHS as it may contain UNDEFs.
57072 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57073 ISD::isBuildVectorAllZeros(RHS.getNode()))
57074 return DAG.getConstant(0, SDLoc(N), VT);
57075
57076 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57077 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57078 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57079 return SDValue(N, 0);
57080
57081 return SDValue();
57082}
57083
57084static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57085 TargetLowering::DAGCombinerInfo &DCI,
57086 const X86Subtarget &Subtarget) {
57087 EVT VT = N->getValueType(0);
57088 SDValue In = N->getOperand(0);
57089 unsigned Opcode = N->getOpcode();
57090 unsigned InOpcode = In.getOpcode();
57091 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57092 SDLoc DL(N);
57093
57094 // Try to merge vector loads and extend_inreg to an extload.
57095 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57096 In.hasOneUse()) {
57097 auto *Ld = cast<LoadSDNode>(In);
57098 if (Ld->isSimple()) {
57099 MVT SVT = In.getSimpleValueType().getVectorElementType();
57100 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57101 ? ISD::SEXTLOAD
57102 : ISD::ZEXTLOAD;
57103 EVT MemVT = VT.changeVectorElementType(SVT);
57104 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57105 SDValue Load = DAG.getExtLoad(
57106 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57107 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57108 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57109 return Load;
57110 }
57111 }
57112 }
57113
57114 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57115 if (Opcode == InOpcode)
57116 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57117
57118 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57119 // -> EXTEND_VECTOR_INREG(X).
57120 // TODO: Handle non-zero subvector indices.
57121 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57122 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57123 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57124 In.getValueSizeInBits())
57125 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57126
57127 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57128 // TODO: Move to DAGCombine?
57129 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57130 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57131 In.getValueSizeInBits() == VT.getSizeInBits()) {
57132 unsigned NumElts = VT.getVectorNumElements();
57133 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57134 EVT EltVT = In.getOperand(0).getValueType();
57135 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57136 for (unsigned I = 0; I != NumElts; ++I)
57137 Elts[I * Scale] = In.getOperand(I);
57138 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57139 }
57140
57141 // Attempt to combine as a shuffle on SSE41+ targets.
57142 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57143 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57144 Subtarget.hasSSE41()) {
57145 SDValue Op(N, 0);
57146 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57147 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57148 return Res;
57149 }
57150
57151 return SDValue();
57152}
57153
57154static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57155 TargetLowering::DAGCombinerInfo &DCI) {
57156 EVT VT = N->getValueType(0);
57157
57158 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57159 return DAG.getConstant(0, SDLoc(N), VT);
57160
57161 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57162 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57163 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57164 return SDValue(N, 0);
57165
57166 return SDValue();
57167}
57168
57169// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57170 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
57171 // extra instructions between the conversions due to going to scalar and back.
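// i.e. (f32 fp16_to_fp(fp_to_fp16(X))) becomes roughly:
//   extractelt(cvtph2ps(cvtps2ph(scalar_to_vector(X), 4)), 0)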
57172static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57173 const X86Subtarget &Subtarget) {
57174 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57175 return SDValue();
57176
57177 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57178 return SDValue();
57179
57180 if (N->getValueType(0) != MVT::f32 ||
57181 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57182 return SDValue();
57183
57184 SDLoc dl(N);
57185 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57186 N->getOperand(0).getOperand(0));
57187 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57188 DAG.getTargetConstant(4, dl, MVT::i32));
57189 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57190 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57191 DAG.getIntPtrConstant(0, dl));
57192}
57193
57194static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57195 const X86Subtarget &Subtarget) {
57196 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57197 return SDValue();
57198
57199 if (Subtarget.hasFP16())
57200 return SDValue();
57201
57202 bool IsStrict = N->isStrictFPOpcode();
57203 EVT VT = N->getValueType(0);
57204 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57205 EVT SrcVT = Src.getValueType();
57206
57207 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57208 return SDValue();
57209
57210 if (VT.getVectorElementType() != MVT::f32 &&
57211 VT.getVectorElementType() != MVT::f64)
57212 return SDValue();
57213
57214 unsigned NumElts = VT.getVectorNumElements();
57215 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57216 return SDValue();
57217
57218 SDLoc dl(N);
57219
57220 // Convert the input to vXi16.
57221 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57222 Src = DAG.getBitcast(IntVT, Src);
57223
57224 // Widen to at least 8 input elements.
57225 if (NumElts < 8) {
57226 unsigned NumConcats = 8 / NumElts;
57227 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57228 : DAG.getConstant(0, dl, IntVT);
57229 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57230 Ops[0] = Src;
57231 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57232 }
57233
57234 // Destination is vXf32 with at least 4 elements.
57235 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57236 std::max(4U, NumElts));
57237 SDValue Cvt, Chain;
57238 if (IsStrict) {
57239 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57240 {N->getOperand(0), Src});
57241 Chain = Cvt.getValue(1);
57242 } else {
57243 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57244 }
57245
57246 if (NumElts < 4) {
57247 assert(NumElts == 2 && "Unexpected size");
57248 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57249 DAG.getIntPtrConstant(0, dl));
57250 }
57251
57252 if (IsStrict) {
57253 // Extend to the original VT if necessary.
57254 if (Cvt.getValueType() != VT) {
57255 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57256 {Chain, Cvt});
57257 Chain = Cvt.getValue(1);
57258 }
57259 return DAG.getMergeValues({Cvt, Chain}, dl);
57260 }
57261
57262 // Extend to the original VT if necessary.
57263 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57264}
57265
57266// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57267// from. Limit this to cases where the loads have the same input chain and the
57268// output chains are unused. This avoids any memory ordering issues.
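// For example, if a v4f32 and a v8f32 broadcast load share the same pointer,
// chain and memory VT, the narrower node can reuse the wider one and simply
// extract its low half.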
57269static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57270 TargetLowering::DAGCombinerInfo &DCI) {
57271 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57272 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57273 "Unknown broadcast load type");
57274
57275 // Only do this if the chain result is unused.
57276 if (N->hasAnyUseOfValue(1))
57277 return SDValue();
57278
57279 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57280
57281 SDValue Ptr = MemIntrin->getBasePtr();
57282 SDValue Chain = MemIntrin->getChain();
57283 EVT VT = N->getSimpleValueType(0);
57284 EVT MemVT = MemIntrin->getMemoryVT();
57285
57286 // Look at other users of our base pointer and try to find a wider broadcast.
57287 // The input chain and the size of the memory VT must match.
57288 for (SDNode *User : Ptr->uses())
57289 if (User != N && User->getOpcode() == N->getOpcode() &&
57290 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57291 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57292 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57293 MemVT.getSizeInBits() &&
57294 !User->hasAnyUseOfValue(1) &&
57295 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57296 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57297 VT.getSizeInBits());
57298 Extract = DAG.getBitcast(VT, Extract);
57299 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57300 }
57301
57302 return SDValue();
57303}
57304
57305static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57306 const X86Subtarget &Subtarget) {
57307 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57308 return SDValue();
57309
57310 bool IsStrict = N->isStrictFPOpcode();
57311 EVT VT = N->getValueType(0);
57312 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57313 EVT SrcVT = Src.getValueType();
57314
57315 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57316 SrcVT.getVectorElementType() != MVT::f32)
57317 return SDValue();
57318
57319 SDLoc dl(N);
57320
57321 SDValue Cvt, Chain;
57322 unsigned NumElts = VT.getVectorNumElements();
57323 if (Subtarget.hasFP16()) {
57324 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57325 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57326 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57327 SDValue Cvt0, Cvt1;
57328 SDValue Op0 = Src.getOperand(0);
57329 SDValue Op1 = Src.getOperand(1);
57330 bool IsOp0Strict = Op0->isStrictFPOpcode();
57331 if (Op0.getOpcode() != Op1.getOpcode() ||
57332 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57333 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57334 return SDValue();
57335 }
57336 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57337 if (IsStrict) {
57338 assert(IsOp0Strict && "Op0 must be strict node");
57339 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57340 ? X86ISD::STRICT_CVTSI2P
57341 : X86ISD::STRICT_CVTUI2P;
57342 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57343 {Op0.getOperand(0), Op0.getOperand(1)});
57344 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57345 {Op1.getOperand(0), Op1.getOperand(1)});
57346 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57347 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57348 }
57349 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57350 : X86ISD::CVTUI2P;
57351 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57352 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57353 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57354 }
57355 return SDValue();
57356 }
57357
57358 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57359 return SDValue();
57360
57361 // Widen to at least 4 input elements.
57362 if (NumElts < 4)
57363 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57364 DAG.getConstantFP(0.0, dl, SrcVT));
57365
57366 // Destination is v8i16 with at least 8 elements.
57367 EVT CvtVT =
57368 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57369 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57370 if (IsStrict) {
57371 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57372 {N->getOperand(0), Src, Rnd});
57373 Chain = Cvt.getValue(1);
57374 } else {
57375 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57376 }
57377
57378 // Extract down to real number of elements.
57379 if (NumElts < 8) {
57380 EVT IntVT = VT.changeVectorElementTypeToInteger();
57381 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57382 DAG.getIntPtrConstant(0, dl));
57383 }
57384
57385 Cvt = DAG.getBitcast(VT, Cvt);
57386
57387 if (IsStrict)
57388 return DAG.getMergeValues({Cvt, Chain}, dl);
57389
57390 return Cvt;
57391}
57392
57393static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57394 SDValue Src = N->getOperand(0);
57395
57396 // Turn MOVDQ2Q+simple_load into an mmx load.
57397 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57398 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57399
57400 if (LN->isSimple()) {
57401 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57402 LN->getBasePtr(),
57403 LN->getPointerInfo(),
57404 LN->getOriginalAlign(),
57405 LN->getMemOperand()->getFlags());
57406 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57407 return NewLd;
57408 }
57409 }
57410
57411 return SDValue();
57412}
57413
57414static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57415 TargetLowering::DAGCombinerInfo &DCI) {
57416 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57417 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57418 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57419 return SDValue(N, 0);
57420
57421 return SDValue();
57422}
57423
57424SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57425 DAGCombinerInfo &DCI) const {
57426 SelectionDAG &DAG = DCI.DAG;
57427 switch (N->getOpcode()) {
57428 default: break;
57429 case ISD::SCALAR_TO_VECTOR:
57430 return combineScalarToVector(N, DAG);
57431 case ISD::EXTRACT_VECTOR_ELT:
57432 case X86ISD::PEXTRW:
57433 case X86ISD::PEXTRB:
57434 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57435 case ISD::CONCAT_VECTORS:
57436 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57437 case ISD::INSERT_SUBVECTOR:
57438 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57439 case ISD::EXTRACT_SUBVECTOR:
57440 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57441 case ISD::VSELECT:
57442 case ISD::SELECT:
57443 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57444 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57445 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57446 case X86ISD::CMP: return combineCMP(N, DAG);
57447 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57448 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57449 case X86ISD::ADD:
57450 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57451 case X86ISD::SBB: return combineSBB(N, DAG);
57452 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57453 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57454 case ISD::SHL: return combineShiftLeft(N, DAG);
57455 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57456 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57457 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57458 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57459 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57460 case X86ISD::BEXTR:
57461 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57462 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57463 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57464 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57465 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57466 case X86ISD::VEXTRACT_STORE:
57467 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57468 case ISD::SINT_TO_FP:
57469 case ISD::STRICT_SINT_TO_FP:
57470 return combineSIntToFP(N, DAG, DCI, Subtarget);
57471 case ISD::UINT_TO_FP:
57472 case ISD::STRICT_UINT_TO_FP:
57473 return combineUIntToFP(N, DAG, Subtarget);
57474 case ISD::FADD:
57475 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57476 case X86ISD::VFCMULC:
57477 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57478 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57479 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57480 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57481 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57482 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57483 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57484 case X86ISD::FXOR:
57485 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57486 case X86ISD::FMIN:
57487 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57488 case ISD::FMINNUM:
57489 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57490 case X86ISD::CVTSI2P:
57491 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57492 case X86ISD::CVTP2SI:
57493 case X86ISD::CVTP2UI:
57494 case X86ISD::STRICT_CVTTP2SI:
57495 case X86ISD::CVTTP2SI:
57496 case X86ISD::STRICT_CVTTP2UI:
57497 case X86ISD::CVTTP2UI:
57498 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57499 case X86ISD::STRICT_CVTPH2PS:
57500 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57501 case X86ISD::BT: return combineBT(N, DAG, DCI);
57502 case ISD::ANY_EXTEND:
57503 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57504 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57505 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57506 case ISD::ANY_EXTEND_VECTOR_INREG:
57507 case ISD::SIGN_EXTEND_VECTOR_INREG:
57508 case ISD::ZERO_EXTEND_VECTOR_INREG:
57509 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57510 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57511 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57512 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57513 case X86ISD::PACKSS:
57514 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57515 case X86ISD::HADD:
57516 case X86ISD::HSUB:
57517 case X86ISD::FHADD:
57518 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57519 case X86ISD::VSHL:
57520 case X86ISD::VSRA:
57521 case X86ISD::VSRL:
57522 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57523 case X86ISD::VSHLI:
57524 case X86ISD::VSRAI:
57525 case X86ISD::VSRLI:
57526 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57527 case ISD::INSERT_VECTOR_ELT:
57528 case X86ISD::PINSRB:
57529 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57530 case X86ISD::SHUFP: // Handle all target specific shuffles
57531 case X86ISD::INSERTPS:
57532 case X86ISD::EXTRQI:
57533 case X86ISD::INSERTQI:
57534 case X86ISD::VALIGN:
57535 case X86ISD::PALIGNR:
57536 case X86ISD::VSHLDQ:
57537 case X86ISD::VSRLDQ:
57538 case X86ISD::BLENDI:
57539 case X86ISD::UNPCKH:
57540 case X86ISD::UNPCKL:
57541 case X86ISD::MOVHLPS:
57542 case X86ISD::MOVLHPS:
57543 case X86ISD::PSHUFB:
57544 case X86ISD::PSHUFD:
57545 case X86ISD::PSHUFHW:
57546 case X86ISD::PSHUFLW:
57547 case X86ISD::MOVSHDUP:
57548 case X86ISD::MOVSLDUP:
57549 case X86ISD::MOVDDUP:
57550 case X86ISD::MOVSS:
57551 case X86ISD::MOVSD:
57552 case X86ISD::MOVSH:
57553 case X86ISD::VBROADCAST:
57554 case X86ISD::VPPERM:
57555 case X86ISD::VPERMI:
57556 case X86ISD::VPERMV:
57557 case X86ISD::VPERMV3:
57558 case X86ISD::VPERMIL2:
57559 case X86ISD::VPERMILPI:
57560 case X86ISD::VPERMILPV:
57561 case X86ISD::VPERM2X128:
57562 case X86ISD::SHUF128:
57563 case X86ISD::VZEXT_MOVL:
57564 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
57565 case X86ISD::FMADD_RND:
57566 case X86ISD::FMSUB:
57567 case X86ISD::STRICT_FMSUB:
57568 case X86ISD::FMSUB_RND:
57569 case X86ISD::FNMADD:
57570 case X86ISD::STRICT_FNMADD:
57571 case X86ISD::FNMADD_RND:
57572 case X86ISD::FNMSUB:
57573 case X86ISD::STRICT_FNMSUB:
57574 case X86ISD::FNMSUB_RND:
57575 case ISD::FMA:
57576 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57577 case X86ISD::FMADDSUB_RND:
57578 case X86ISD::FMSUBADD_RND:
57579 case X86ISD::FMADDSUB:
57580 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57581 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57582 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57583 case X86ISD::MGATHER:
57584 case X86ISD::MSCATTER:
57585 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57586 case ISD::MGATHER:
57587 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57588 case X86ISD::PCMPEQ:
57589 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57590 case X86ISD::PMULDQ:
57591 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57592 case X86ISD::VPMADDUBSW:
57593 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57594 case X86ISD::KSHIFTL:
57595 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57596 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57597 case ISD::STRICT_FP_EXTEND:
57598 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57599 case ISD::STRICT_FP_ROUND:
57600 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57601 case X86ISD::VBROADCAST_LOAD:
57602 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57603 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57604 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57605 }
57606
57607 return SDValue();
57608}
57609
57610bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57611 return false;
57612}
57613
57614bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57615 if (!isTypeLegal(VT))
57616 return false;
57617
57618 // There are no vXi8 shifts.
57619 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57620 return false;
57621
57622 // TODO: Almost no 8-bit ops are desirable because they have no actual
57623 // size/speed advantages vs. 32-bit ops, but they do have a major
57624 // potential disadvantage by causing partial register stalls.
57625 //
57626 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57627 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57628 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57629 // check for a constant operand to the multiply.
57630 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57631 return false;
57632
57633 // i16 instruction encodings are longer and some i16 instructions are slow,
57634 // so those are not desirable.
57635 if (VT == MVT::i16) {
57636 switch (Opc) {
57637 default:
57638 break;
57639 case ISD::LOAD:
57640 case ISD::SIGN_EXTEND:
57641 case ISD::ZERO_EXTEND:
57642 case ISD::ANY_EXTEND:
57643 case ISD::SHL:
57644 case ISD::SRA:
57645 case ISD::SRL:
57646 case ISD::SUB:
57647 case ISD::ADD:
57648 case ISD::MUL:
57649 case ISD::AND:
57650 case ISD::OR:
57651 case ISD::XOR:
57652 return false;
57653 }
57654 }
57655
57656 // Any legal type not explicitly accounted for above here is desirable.
57657 return true;
57658}
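
As a condensed illustration of the filtering above (a sketch only, not code from this file): isDesirableSketch and its string-based opcode/type names are hypothetical stand-ins, and the sketch deliberately omits the legality check, the extend opcodes and the vXi8 shift rule.

#include <cassert>
#include <set>
#include <string>

// Sketch: which (opcode, type) pairs the hook above rejects (subset only).
static bool isDesirableSketch(const std::string &Opc, const std::string &VT) {
  // 8-bit MUL/SHL are rejected outright.
  if ((Opc == "MUL" || Opc == "SHL") && VT == "i8")
    return false;
  // Most common i16 ops are rejected so they get promoted to i32 instead.
  static const std::set<std::string> BadI16 = {"LOAD", "SHL", "SRA", "SRL",
                                               "ADD",  "SUB", "MUL", "AND",
                                               "OR",   "XOR"};
  if (VT == "i16" && BadI16.count(Opc))
    return false;
  return true;
}

int main() {
  assert(!isDesirableSketch("ADD", "i16")); // undesirable: promote to i32
  assert(isDesirableSketch("ADD", "i32"));  // 32-bit ops are fine
}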
57659
57660SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57661 SDValue Value, SDValue Addr,
57662 SelectionDAG &DAG) const {
57663 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57664 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57665 if (IsCFProtectionSupported) {
57666 // In case control-flow branch protection is enabled, we need to add a
57667 // notrack prefix to the indirect branch.
57668 // In order to do that, we create an NT_BRIND SDNode.
57669 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
57670 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57671 }
57672
57673 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57674}
57675
57676TargetLowering::AndOrSETCCFoldKind
57677X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57678 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57679 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57680 EVT VT = LogicOp->getValueType(0);
57681 EVT OpVT = SETCC0->getOperand(0).getValueType();
57682 if (!VT.isInteger())
57683 return AndOrSETCCFoldKind::None;
57684
57685 if (VT.isVector())
57686 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57687 (isOperationLegal(ISD::ABS, OpVT)
57688 ? AndOrSETCCFoldKind::ABS
57689 : AndOrSETCCFoldKind::None));
57690
57691 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57692 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57693 // `NotAnd` applies, `AddAnd` does as well.
57694 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57695 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57696 return AndOrSETCCFoldKind::AddAnd;
57697}
57698
57699bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57700 EVT VT = Op.getValueType();
57701 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57702 isa<ConstantSDNode>(Op.getOperand(1));
57703
57704 // i16 is legal, but undesirable since i16 instruction encodings are longer
57705 // and some i16 instructions are slow.
57706 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57707 // using LEA and/or other ALU ops.
57708 if (VT != MVT::i16 && !Is8BitMulByConstant)
57709 return false;
57710
57711 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57712 if (!Op.hasOneUse())
57713 return false;
57714 SDNode *User = *Op->use_begin();
57715 if (!ISD::isNormalStore(User))
57716 return false;
57717 auto *Ld = cast<LoadSDNode>(Load);
57718 auto *St = cast<StoreSDNode>(User);
57719 return Ld->getBasePtr() == St->getBasePtr();
57720 };
57721
57722 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57723 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57724 return false;
57725 if (!Op.hasOneUse())
57726 return false;
57727 SDNode *User = *Op->use_begin();
57728 if (User->getOpcode() != ISD::ATOMIC_STORE)
57729 return false;
57730 auto *Ld = cast<AtomicSDNode>(Load);
57731 auto *St = cast<AtomicSDNode>(User);
57732 return Ld->getBasePtr() == St->getBasePtr();
57733 };
57734
57735 bool Commute = false;
57736 switch (Op.getOpcode()) {
57737 default: return false;
57738 case ISD::SIGN_EXTEND:
57739 case ISD::ZERO_EXTEND:
57740 case ISD::ANY_EXTEND:
57741 break;
57742 case ISD::SHL:
57743 case ISD::SRA:
57744 case ISD::SRL: {
57745 SDValue N0 = Op.getOperand(0);
57746 // Look out for (store (shl (load), x)).
57747 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57748 return false;
57749 break;
57750 }
57751 case ISD::ADD:
57752 case ISD::MUL:
57753 case ISD::AND:
57754 case ISD::OR:
57755 case ISD::XOR:
57756 Commute = true;
57757 [[fallthrough]];
57758 case ISD::SUB: {
57759 SDValue N0 = Op.getOperand(0);
57760 SDValue N1 = Op.getOperand(1);
57761 // Avoid disabling potential load folding opportunities.
57762 if (X86::mayFoldLoad(N1, Subtarget) &&
57763 (!Commute || !isa<ConstantSDNode>(N0) ||
57764 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57765 return false;
57766 if (X86::mayFoldLoad(N0, Subtarget) &&
57767 ((Commute && !isa<ConstantSDNode>(N1)) ||
57768 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57769 return false;
57770 if (IsFoldableAtomicRMW(N0, Op) ||
57771 (Commute && IsFoldableAtomicRMW(N1, Op)))
57772 return false;
57773 }
57774 }
57775
57776 PVT = MVT::i32;
57777 return true;
57778}
57779
57780//===----------------------------------------------------------------------===//
57781// X86 Inline Assembly Support
57782//===----------------------------------------------------------------------===//
57783
57784// Helper to match a string separated by whitespace.
57785static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57786 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57787
57788 for (StringRef Piece : Pieces) {
57789 if (!S.startswith(Piece)) // Check if the piece matches.
57790 return false;
57791
57792 S = S.substr(Piece.size());
57793 StringRef::size_type Pos = S.find_first_not_of(" \t");
57794 if (Pos == 0) // We matched a prefix.
57795 return false;
57796
57797 S = S.substr(Pos);
57798 }
57799
57800 return S.empty();
57801}
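
A rough standalone sketch of the matching loop above, using plain std::string rather than StringRef; matchAsmSketch is a hypothetical name and the behaviour shown is a simplified approximation.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Sketch: succeed only if S consists of exactly the given pieces separated by
// whitespace; matching only a prefix of a piece (no whitespace after) fails.
static bool matchAsmSketch(std::string S,
                           const std::vector<std::string> &Pieces) {
  S = S.substr(std::min(S.find_first_not_of(" \t"), S.size())); // trim lead
  for (const std::string &Piece : Pieces) {
    if (S.compare(0, Piece.size(), Piece) != 0)
      return false;
    S = S.substr(Piece.size());
    std::string::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // no whitespace follows, so only a prefix matched
      return false;
    S = S.substr(std::min(Pos, S.size()));
  }
  return S.empty();
}

int main() {
  assert(matchAsmSketch("bswap $0", {"bswap", "$0"}));
  assert(!matchAsmSketch("bswapx $0", {"bswap", "$0"}));
}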
57802
57803static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57804
57805 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57806 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57807 llvm::is_contained(AsmPieces, "~{flags}") &&
57808 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57809
57810 if (AsmPieces.size() == 3)
57811 return true;
57812 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57813 return true;
57814 }
57815 }
57816 return false;
57817}
57818
57819bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57820 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57821
57822 const std::string &AsmStr = IA->getAsmString();
57823
57824 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57825 if (!Ty || Ty->getBitWidth() % 16 != 0)
57826 return false;
57827
57828 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57829 SmallVector<StringRef, 4> AsmPieces;
57830 SplitString(AsmStr, AsmPieces, ";\n");
57831
57832 switch (AsmPieces.size()) {
57833 default: return false;
57834 case 1:
57835 // FIXME: this should verify that we are targeting a 486 or better. If not,
57836 // we will turn this bswap into something that will be lowered to logical
57837 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57838 // lower so don't worry about this.
57839 // bswap $0
57840 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57841 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57842 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57843 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57844 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57845 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57846 // No need to check constraints, nothing other than the equivalent of
57847 // "=r,0" would be valid here.
57848 return IntrinsicLowering::LowerToByteSwap(CI);
57849 }
57850
57851 // rorw $$8, ${0:w} --> llvm.bswap.i16
57852 if (CI->getType()->isIntegerTy(16) &&
57853 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57854 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57855 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57856 AsmPieces.clear();
57857 StringRef ConstraintsStr = IA->getConstraintString();
57858 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57859 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57860 if (clobbersFlagRegisters(AsmPieces))
57861 return IntrinsicLowering::LowerToByteSwap(CI);
57862 }
57863 break;
57864 case 3:
57865 if (CI->getType()->isIntegerTy(32) &&
57866 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57867 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57868 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57869 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57870 AsmPieces.clear();
57871 StringRef ConstraintsStr = IA->getConstraintString();
57872 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57873 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57874 if (clobbersFlagRegisters(AsmPieces))
57875 return IntrinsicLowering::LowerToByteSwap(CI);
57876 }
57877
57878 if (CI->getType()->isIntegerTy(64)) {
57879 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57880 if (Constraints.size() >= 2 &&
57881 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57882 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57883 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57884 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57885 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57886 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57887 return IntrinsicLowering::LowerToByteSwap(CI);
57888 }
57889 }
57890 break;
57891 }
57892 return false;
57893}
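
To make the constraint-string handling above concrete, here is a minimal standalone sketch of the split-then-check step; splitSketch and clobbersSketch are hypothetical helpers, and the "~{cc},~{dirflag},~{fpsr},~{flags}" clobber list is an assumed example of what remains after dropping the leading "=r,0," prefix.

#include <algorithm>
#include <cassert>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: split a string on a single delimiter.
static std::vector<std::string> splitSketch(const std::string &S, char Delim) {
  std::vector<std::string> Out;
  std::stringstream SS(S);
  std::string Piece;
  while (std::getline(SS, Piece, Delim))
    Out.push_back(Piece);
  return Out;
}

// Sketch of the clobber check: require ~{cc}, ~{flags} and ~{fpsr}, and, when
// a fourth piece is present, ~{dirflag} as well.
static bool clobbersSketch(const std::vector<std::string> &Pieces) {
  auto Has = [&](const char *P) {
    return std::find(Pieces.begin(), Pieces.end(), P) != Pieces.end();
  };
  if (Pieces.size() != 3 && Pieces.size() != 4)
    return false;
  if (!Has("~{cc}") || !Has("~{flags}") || !Has("~{fpsr}"))
    return false;
  return Pieces.size() == 3 || Has("~{dirflag}");
}

int main() {
  // Assumed example clobber list for a rorw/rolw byte-swap pattern.
  auto Pieces = splitSketch("~{cc},~{dirflag},~{fpsr},~{flags}", ',');
  assert(clobbersSketch(Pieces));
}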
57894
57895static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57896 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57897 .Case("{@cca}", X86::COND_A)
57898 .Case("{@ccae}", X86::COND_AE)
57899 .Case("{@ccb}", X86::COND_B)
57900 .Case("{@ccbe}", X86::COND_BE)
57901 .Case("{@ccc}", X86::COND_B)
57902 .Case("{@cce}", X86::COND_E)
57903 .Case("{@ccz}", X86::COND_E)
57904 .Case("{@ccg}", X86::COND_G)
57905 .Case("{@ccge}", X86::COND_GE)
57906 .Case("{@ccl}", X86::COND_L)
57907 .Case("{@ccle}", X86::COND_LE)
57908 .Case("{@ccna}", X86::COND_BE)
57909 .Case("{@ccnae}", X86::COND_B)
57910 .Case("{@ccnb}", X86::COND_AE)
57911 .Case("{@ccnbe}", X86::COND_A)
57912 .Case("{@ccnc}", X86::COND_AE)
57913 .Case("{@ccne}", X86::COND_NE)
57914 .Case("{@ccnz}", X86::COND_NE)
57915 .Case("{@ccng}", X86::COND_LE)
57916 .Case("{@ccnge}", X86::COND_L)
57917 .Case("{@ccnl}", X86::COND_GE)
57918 .Case("{@ccnle}", X86::COND_G)
57919 .Case("{@ccno}", X86::COND_NO)
57920 .Case("{@ccnp}", X86::COND_NP)
57921 .Case("{@ccns}", X86::COND_NS)
57922 .Case("{@cco}", X86::COND_O)
57923 .Case("{@ccp}", X86::COND_P)
57924 .Case("{@ccs}", X86::COND_S)
57925 .Default(X86::COND_INVALID);
57926 return Cond;
57927}
57928
57929/// Given a constraint letter, return the type of constraint for this target.
57930X86TargetLowering::ConstraintType
57931X86TargetLowering::getConstraintType(StringRef Constraint) const {
57932 if (Constraint.size() == 1) {
57933 switch (Constraint[0]) {
57934 case 'R':
57935 case 'q':
57936 case 'Q':
57937 case 'f':
57938 case 't':
57939 case 'u':
57940 case 'y':
57941 case 'x':
57942 case 'v':
57943 case 'l':
57944 case 'k': // AVX512 masking registers.
57945 return C_RegisterClass;
57946 case 'a':
57947 case 'b':
57948 case 'c':
57949 case 'd':
57950 case 'S':
57951 case 'D':
57952 case 'A':
57953 return C_Register;
57954 case 'I':
57955 case 'J':
57956 case 'K':
57957 case 'N':
57958 case 'G':
57959 case 'L':
57960 case 'M':
57961 return C_Immediate;
57962 case 'C':
57963 case 'e':
57964 case 'Z':
57965 return C_Other;
57966 default:
57967 break;
57968 }
57969 }
57970 else if (Constraint.size() == 2) {
57971 switch (Constraint[0]) {
57972 default:
57973 break;
57974 case 'Y':
57975 switch (Constraint[1]) {
57976 default:
57977 break;
57978 case 'z':
57979 return C_Register;
57980 case 'i':
57981 case 'm':
57982 case 'k':
57983 case 't':
57984 case '2':
57985 return C_RegisterClass;
57986 }
57987 }
57988 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57989 return C_Other;
57990 return TargetLowering::getConstraintType(Constraint);
57991}
57992
57993/// Examine constraint type and operand type and determine a weight value.
57994/// This object must already have been set up with the operand type
57995/// and the current alternative constraint selected.
57996TargetLowering::ConstraintWeight
57997 X86TargetLowering::getSingleConstraintMatchWeight(
57998 AsmOperandInfo &info, const char *constraint) const {
57999 ConstraintWeight weight = CW_Invalid;
58000 Value *CallOperandVal = info.CallOperandVal;
58001 // If we don't have a value, we can't do a match,
58002 // but allow it at the lowest weight.
58003 if (!CallOperandVal)
58004 return CW_Default;
58005 Type *type = CallOperandVal->getType();
58006 // Look at the constraint type.
58007 switch (*constraint) {
58008 default:
58009 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58010 [[fallthrough]];
58011 case 'R':
58012 case 'q':
58013 case 'Q':
58014 case 'a':
58015 case 'b':
58016 case 'c':
58017 case 'd':
58018 case 'S':
58019 case 'D':
58020 case 'A':
58021 if (CallOperandVal->getType()->isIntegerTy())
58022 weight = CW_SpecificReg;
58023 break;
58024 case 'f':
58025 case 't':
58026 case 'u':
58027 if (type->isFloatingPointTy())
58028 weight = CW_SpecificReg;
58029 break;
58030 case 'y':
58031 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58032 weight = CW_SpecificReg;
58033 break;
58034 case 'Y':
58035 if (StringRef(constraint).size() != 2)
58036 break;
58037 switch (constraint[1]) {
58038 default:
58039 return CW_Invalid;
58040 // XMM0
58041 case 'z':
58042 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58043 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58044 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58045 return CW_SpecificReg;
58046 return CW_Invalid;
58047 // Conditional OpMask regs (AVX512)
58048 case 'k':
58049 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58050 return CW_Register;
58051 return CW_Invalid;
58052 // Any MMX reg
58053 case 'm':
58054 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58055 return weight;
58056 return CW_Invalid;
58057 // Any SSE reg when ISA >= SSE2, same as 'x'
58058 case 'i':
58059 case 't':
58060 case '2':
58061 if (!Subtarget.hasSSE2())
58062 return CW_Invalid;
58063 break;
58064 }
58065 break;
58066 case 'v':
58067 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58068 weight = CW_Register;
58069 [[fallthrough]];
58070 case 'x':
58071 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58072 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58073 weight = CW_Register;
58074 break;
58075 case 'k':
58076 // Enable conditional vector operations using %k<#> registers.
58077 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58078 weight = CW_Register;
58079 break;
58080 case 'I':
58081 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58082 if (C->getZExtValue() <= 31)
58083 weight = CW_Constant;
58084 }
58085 break;
58086 case 'J':
58087 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58088 if (C->getZExtValue() <= 63)
58089 weight = CW_Constant;
58090 }
58091 break;
58092 case 'K':
58093 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58094 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58095 weight = CW_Constant;
58096 }
58097 break;
58098 case 'L':
58099 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58100 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58101 weight = CW_Constant;
58102 }
58103 break;
58104 case 'M':
58105 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58106 if (C->getZExtValue() <= 3)
58107 weight = CW_Constant;
58108 }
58109 break;
58110 case 'N':
58111 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58112 if (C->getZExtValue() <= 0xff)
58113 weight = CW_Constant;
58114 }
58115 break;
58116 case 'G':
58117 case 'C':
58118 if (isa<ConstantFP>(CallOperandVal)) {
58119 weight = CW_Constant;
58120 }
58121 break;
58122 case 'e':
58123 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58124 if ((C->getSExtValue() >= -0x80000000LL) &&
58125 (C->getSExtValue() <= 0x7fffffffLL))
58126 weight = CW_Constant;
58127 }
58128 break;
58129 case 'Z':
58130 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58131 if (C->getZExtValue() <= 0xffffffff)
58132 weight = CW_Constant;
58133 }
58134 break;
58135 }
58136 return weight;
58137}
58138
58139/// Try to replace an X constraint, which matches anything, with another that
58140/// has more specific requirements based on the type of the corresponding
58141/// operand.
58142const char *X86TargetLowering::
58143LowerXConstraint(EVT ConstraintVT) const {
58144 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58145 // 'f' like normal targets.
58146 if (ConstraintVT.isFloatingPoint()) {
58147 if (Subtarget.hasSSE1())
58148 return "x";
58149 }
58150
58151 return TargetLowering::LowerXConstraint(ConstraintVT);
58152}
58153
58154// Lower @cc targets via setcc.
58155SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58156 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58157 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58158 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58159 if (Cond == X86::COND_INVALID)
58160 return SDValue();
58161 // Check that return type is valid.
58162 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58163 OpInfo.ConstraintVT.getSizeInBits() < 8)
58164 report_fatal_error("Glue output operand is of invalid type");
58165
58166 // Get EFLAGS register. Only update chain when copyfrom is glued.
58167 if (Glue.getNode()) {
58168 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58169 Chain = Glue.getValue(1);
58170 } else
58171 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58172 // Extract CC code.
58173 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58174 // Extend to 32-bits
58175 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58176
58177 return Result;
58178}
58179
58180/// Lower the specified operand into the Ops vector.
58181/// If it is invalid, don't add anything to Ops.
58182void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58183 std::string &Constraint,
58184 std::vector<SDValue>&Ops,
58185 SelectionDAG &DAG) const {
58186 SDValue Result;
58187
58188 // Only support length 1 constraints for now.
58189 if (Constraint.length() > 1) return;
58190
58191 char ConstraintLetter = Constraint[0];
58192 switch (ConstraintLetter) {
58193 default: break;
58194 case 'I':
58195 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58196 if (C->getZExtValue() <= 31) {
58197 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58198 Op.getValueType());
58199 break;
58200 }
58201 }
58202 return;
58203 case 'J':
58204 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58205 if (C->getZExtValue() <= 63) {
58206 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58207 Op.getValueType());
58208 break;
58209 }
58210 }
58211 return;
58212 case 'K':
58213 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58214 if (isInt<8>(C->getSExtValue())) {
58215 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58216 Op.getValueType());
58217 break;
58218 }
58219 }
58220 return;
58221 case 'L':
58222 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58223 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58224 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58225 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58226 Op.getValueType());
58227 break;
58228 }
58229 }
58230 return;
58231 case 'M':
58232 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58233 if (C->getZExtValue() <= 3) {
58234 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58235 Op.getValueType());
58236 break;
58237 }
58238 }
58239 return;
58240 case 'N':
58241 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58242 if (C->getZExtValue() <= 255) {
58243 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58244 Op.getValueType());
58245 break;
58246 }
58247 }
58248 return;
58249 case 'O':
58250 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58251 if (C->getZExtValue() <= 127) {
58252 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58253 Op.getValueType());
58254 break;
58255 }
58256 }
58257 return;
58258 case 'e': {
58259 // 32-bit signed value
58260 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58261 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58262 C->getSExtValue())) {
58263 // Widen to 64 bits here to get it sign extended.
58264 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58265 break;
58266 }
58267 // FIXME gcc accepts some relocatable values here too, but only in certain
58268 // memory models; it's complicated.
58269 }
58270 return;
58271 }
58272 case 'Z': {
58273 // 32-bit unsigned value
58274 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58275 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58276 C->getZExtValue())) {
58277 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58278 Op.getValueType());
58279 break;
58280 }
58281 }
58282 // FIXME gcc accepts some relocatable values here too, but only in certain
58283 // memory models; it's complicated.
58284 return;
58285 }
58286 case 'i': {
58287 // Literal immediates are always ok.
58288 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58289 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58290 BooleanContent BCont = getBooleanContents(MVT::i64);
58291 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58292 : ISD::SIGN_EXTEND;
58293 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58294 : CST->getSExtValue();
58295 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58296 break;
58297 }
58298
58299 // In any sort of PIC mode addresses need to be computed at runtime by
58300 // adding in a register or some sort of table lookup. These can't
58301 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58302 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58303 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58304 return;
58305
58306 // If we are in non-pic codegen mode, we allow the address of a global (with
58307 // an optional displacement) to be used with 'i'.
58308 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58309 // If we require an extra load to get this address, as in PIC mode, we
58310 // can't accept it.
58311 if (isGlobalStubReference(
58312 Subtarget.classifyGlobalReference(GA->getGlobal())))
58313 return;
58314 break;
58315 }
58316 }
58317
58318 if (Result.getNode()) {
58319 Ops.push_back(Result);
58320 return;
58321 }
58322 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58323}
58324
58325/// Check if \p RC is a general purpose register class.
58326 /// I.e., GR* or one of their variants.
58327static bool isGRClass(const TargetRegisterClass &RC) {
58328 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58329 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58330 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58331 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58332 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58333}
58334
58335/// Check if \p RC is a vector register class.
58336 /// I.e., FR* / VR* or one of their variants.
58337static bool isFRClass(const TargetRegisterClass &RC) {
58338 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58339 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58340 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58341 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58342 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58343 RC.hasSuperClassEq(&X86::VR512RegClass);
58344}
58345
58346/// Check if \p RC is a mask register class.
58347 /// I.e., VK* or one of their variants.
58348static bool isVKClass(const TargetRegisterClass &RC) {
58349 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58350 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58351 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58352 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58353 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58354 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58355 RC.hasSuperClassEq(&X86::VK64RegClass);
58356}
58357
58358std::pair<unsigned, const TargetRegisterClass *>
58359X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58360 StringRef Constraint,
58361 MVT VT) const {
58362 // First, see if this is a constraint that directly corresponds to an LLVM
58363 // register class.
58364 if (Constraint.size() == 1) {
58365 // GCC Constraint Letters
58366 switch (Constraint[0]) {
58367 default: break;
58368 // 'A' means [ER]AX + [ER]DX.
58369 case 'A':
58370 if (Subtarget.is64Bit())
58371 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58372 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58373 "Expecting 64, 32 or 16 bit subtarget");
58374 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58375
58376 // TODO: Slight differences here in allocation order and leaving
58377 // RIP in the class. Do they matter any more here than they do
58378 // in the normal allocation?
58379 case 'k':
58380 if (Subtarget.hasAVX512()) {
58381 if (VT == MVT::i1)
58382 return std::make_pair(0U, &X86::VK1RegClass);
58383 if (VT == MVT::i8)
58384 return std::make_pair(0U, &X86::VK8RegClass);
58385 if (VT == MVT::i16)
58386 return std::make_pair(0U, &X86::VK16RegClass);
58387 }
58388 if (Subtarget.hasBWI()) {
58389 if (VT == MVT::i32)
58390 return std::make_pair(0U, &X86::VK32RegClass);
58391 if (VT == MVT::i64)
58392 return std::make_pair(0U, &X86::VK64RegClass);
58393 }
58394 break;
58395 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58396 if (Subtarget.is64Bit()) {
58397 if (VT == MVT::i8 || VT == MVT::i1)
58398 return std::make_pair(0U, &X86::GR8RegClass);
58399 if (VT == MVT::i16)
58400 return std::make_pair(0U, &X86::GR16RegClass);
58401 if (VT == MVT::i32 || VT == MVT::f32)
58402 return std::make_pair(0U, &X86::GR32RegClass);
58403 if (VT != MVT::f80 && !VT.isVector())
58404 return std::make_pair(0U, &X86::GR64RegClass);
58405 break;
58406 }
58407 [[fallthrough]];
58408 // 32-bit fallthrough
58409 case 'Q': // Q_REGS
58410 if (VT == MVT::i8 || VT == MVT::i1)
58411 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58412 if (VT == MVT::i16)
58413 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58414 if (VT == MVT::i32 || VT == MVT::f32 ||
58415 (!VT.isVector() && !Subtarget.is64Bit()))
58416 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58417 if (VT != MVT::f80 && !VT.isVector())
58418 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58419 break;
58420 case 'r': // GENERAL_REGS
58421 case 'l': // INDEX_REGS
58422 if (VT == MVT::i8 || VT == MVT::i1)
58423 return std::make_pair(0U, &X86::GR8RegClass);
58424 if (VT == MVT::i16)
58425 return std::make_pair(0U, &X86::GR16RegClass);
58426 if (VT == MVT::i32 || VT == MVT::f32 ||
58427 (!VT.isVector() && !Subtarget.is64Bit()))
58428 return std::make_pair(0U, &X86::GR32RegClass);
58429 if (VT != MVT::f80 && !VT.isVector())
58430 return std::make_pair(0U, &X86::GR64RegClass);
58431 break;
58432 case 'R': // LEGACY_REGS
58433 if (VT == MVT::i8 || VT == MVT::i1)
58434 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58435 if (VT == MVT::i16)
58436 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58437 if (VT == MVT::i32 || VT == MVT::f32 ||
58438 (!VT.isVector() && !Subtarget.is64Bit()))
58439 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58440 if (VT != MVT::f80 && !VT.isVector())
58441 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58442 break;
58443 case 'f': // FP Stack registers.
58444 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58445 // value to the correct fpstack register class.
58446 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58447 return std::make_pair(0U, &X86::RFP32RegClass);
58448 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58449 return std::make_pair(0U, &X86::RFP64RegClass);
58450 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58451 return std::make_pair(0U, &X86::RFP80RegClass);
58452 break;
58453 case 'y': // MMX_REGS if MMX allowed.
58454 if (!Subtarget.hasMMX()) break;
58455 return std::make_pair(0U, &X86::VR64RegClass);
58456 case 'v':
58457 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58458 if (!Subtarget.hasSSE1()) break;
58459 bool VConstraint = (Constraint[0] == 'v');
58460
58461 switch (VT.SimpleTy) {
58462 default: break;
58463 // Scalar SSE types.
58464 case MVT::f16:
58465 if (VConstraint && Subtarget.hasFP16())
58466 return std::make_pair(0U, &X86::FR16XRegClass);
58467 break;
58468 case MVT::f32:
58469 case MVT::i32:
58470 if (VConstraint && Subtarget.hasVLX())
58471 return std::make_pair(0U, &X86::FR32XRegClass);
58472 return std::make_pair(0U, &X86::FR32RegClass);
58473 case MVT::f64:
58474 case MVT::i64:
58475 if (VConstraint && Subtarget.hasVLX())
58476 return std::make_pair(0U, &X86::FR64XRegClass);
58477 return std::make_pair(0U, &X86::FR64RegClass);
58478 case MVT::i128:
58479 if (Subtarget.is64Bit()) {
58480 if (VConstraint && Subtarget.hasVLX())
58481 return std::make_pair(0U, &X86::VR128XRegClass);
58482 return std::make_pair(0U, &X86::VR128RegClass);
58483 }
58484 break;
58485 // Vector types and fp128.
58486 case MVT::v8f16:
58487 if (!Subtarget.hasFP16())
58488 break;
58489 [[fallthrough]];
58490 case MVT::f128:
58491 case MVT::v16i8:
58492 case MVT::v8i16:
58493 case MVT::v4i32:
58494 case MVT::v2i64:
58495 case MVT::v4f32:
58496 case MVT::v2f64:
58497 if (VConstraint && Subtarget.hasVLX())
58498 return std::make_pair(0U, &X86::VR128XRegClass);
58499 return std::make_pair(0U, &X86::VR128RegClass);
58500 // AVX types.
58501 case MVT::v16f16:
58502 if (!Subtarget.hasFP16())
58503 break;
58504 [[fallthrough]];
58505 case MVT::v32i8:
58506 case MVT::v16i16:
58507 case MVT::v8i32:
58508 case MVT::v4i64:
58509 case MVT::v8f32:
58510 case MVT::v4f64:
58511 if (VConstraint && Subtarget.hasVLX())
58512 return std::make_pair(0U, &X86::VR256XRegClass);
58513 if (Subtarget.hasAVX())
58514 return std::make_pair(0U, &X86::VR256RegClass);
58515 break;
58516 case MVT::v32f16:
58517 if (!Subtarget.hasFP16())
58518 break;
58519 [[fallthrough]];
58520 case MVT::v64i8:
58521 case MVT::v32i16:
58522 case MVT::v8f64:
58523 case MVT::v16f32:
58524 case MVT::v16i32:
58525 case MVT::v8i64:
58526 if (!Subtarget.hasAVX512()) break;
58527 if (VConstraint)
58528 return std::make_pair(0U, &X86::VR512RegClass);
58529 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58530 }
58531 break;
58532 }
58533 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58534 switch (Constraint[1]) {
58535 default:
58536 break;
58537 case 'i':
58538 case 't':
58539 case '2':
58540 return getRegForInlineAsmConstraint(TRI, "x", VT);
58541 case 'm':
58542 if (!Subtarget.hasMMX()) break;
58543 return std::make_pair(0U, &X86::VR64RegClass);
58544 case 'z':
58545 if (!Subtarget.hasSSE1()) break;
58546 switch (VT.SimpleTy) {
58547 default: break;
58548 // Scalar SSE types.
58549 case MVT::f16:
58550 if (!Subtarget.hasFP16())
58551 break;
58552 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58553 case MVT::f32:
58554 case MVT::i32:
58555 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58556 case MVT::f64:
58557 case MVT::i64:
58558 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58559 case MVT::v8f16:
58560 if (!Subtarget.hasFP16())
58561 break;
58562 [[fallthrough]];
58563 case MVT::f128:
58564 case MVT::v16i8:
58565 case MVT::v8i16:
58566 case MVT::v4i32:
58567 case MVT::v2i64:
58568 case MVT::v4f32:
58569 case MVT::v2f64:
58570 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58571 // AVX types.
58572 case MVT::v16f16:
58573 if (!Subtarget.hasFP16())
58574 break;
58575 [[fallthrough]];
58576 case MVT::v32i8:
58577 case MVT::v16i16:
58578 case MVT::v8i32:
58579 case MVT::v4i64:
58580 case MVT::v8f32:
58581 case MVT::v4f64:
58582 if (Subtarget.hasAVX())
58583 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58584 break;
58585 case MVT::v32f16:
58586 if (!Subtarget.hasFP16())
58587 break;
58588 [[fallthrough]];
58589 case MVT::v64i8:
58590 case MVT::v32i16:
58591 case MVT::v8f64:
58592 case MVT::v16f32:
58593 case MVT::v16i32:
58594 case MVT::v8i64:
58595 if (Subtarget.hasAVX512())
58596 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58597 break;
58598 }
58599 break;
58600 case 'k':
58601 // This register class doesn't allocate k0 for masked vector operation.
58602 if (Subtarget.hasAVX512()) {
58603 if (VT == MVT::i1)
58604 return std::make_pair(0U, &X86::VK1WMRegClass);
58605 if (VT == MVT::i8)
58606 return std::make_pair(0U, &X86::VK8WMRegClass);
58607 if (VT == MVT::i16)
58608 return std::make_pair(0U, &X86::VK16WMRegClass);
58609 }
58610 if (Subtarget.hasBWI()) {
58611 if (VT == MVT::i32)
58612 return std::make_pair(0U, &X86::VK32WMRegClass);
58613 if (VT == MVT::i64)
58614 return std::make_pair(0U, &X86::VK64WMRegClass);
58615 }
58616 break;
58617 }
58618 }
58619
58620 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58621 return std::make_pair(0U, &X86::GR32RegClass);
58622
58623 // Use the default implementation in TargetLowering to convert the register
58624 // constraint into a member of a register class.
58625 std::pair<Register, const TargetRegisterClass*> Res;
58626 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58627
58628 // Not found as a standard register?
58629 if (!Res.second) {
58630 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58631 // to/from f80.
58632 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58633 // Map st(0) -> st(7) -> ST0
58634 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58635 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58636 Constraint[3] == '(' &&
58637 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58638 Constraint[5] == ')' && Constraint[6] == '}') {
58639 // st(7) is not allocatable and thus not a member of RFP80. Return
58640 // singleton class in cases where we have a reference to it.
58641 if (Constraint[4] == '7')
58642 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58643 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58644 &X86::RFP80RegClass);
58645 }
58646
58647 // GCC allows "st(0)" to be called just plain "st".
58648 if (StringRef("{st}").equals_insensitive(Constraint))
58649 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58650 }
58651
58652 // flags -> EFLAGS
58653 if (StringRef("{flags}").equals_insensitive(Constraint))
58654 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58655
58656 // dirflag -> DF
58657 // Only allow for clobber.
58658 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58659 VT == MVT::Other)
58660 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58661
58662 // fpsr -> FPSW
58663 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58664 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58665
58666 return Res;
58667 }
58668
58669 // Make sure it isn't a register that requires 64-bit mode.
58670 if (!Subtarget.is64Bit() &&
58671 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58672 TRI->getEncodingValue(Res.first) >= 8) {
58673 // Register requires REX prefix, but we're in 32-bit mode.
58674 return std::make_pair(0, nullptr);
58675 }
58676
58677 // Make sure it isn't a register that requires AVX512.
58678 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58679 TRI->getEncodingValue(Res.first) & 0x10) {
58680 // Register requires EVEX prefix.
58681 return std::make_pair(0, nullptr);
58682 }
58683
58684 // Otherwise, check to see if this is a register class of the wrong value
58685 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58686 // turn into {ax},{dx}.
58687 // MVT::Other is used to specify clobber names.
58688 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58689 return Res; // Correct type already, nothing to do.
58690
58691 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58692 // return "eax". This should even work for things like getting 64bit integer
58693 // registers when given an f64 type.
58694 const TargetRegisterClass *Class = Res.second;
58695 // The generic code will match the first register class that contains the
58696 // given register. Thus, based on the ordering of the tablegened file,
58697 // the "plain" GR classes might not come first.
58698 // Therefore, use a helper method.
58699 if (isGRClass(*Class)) {
58700 unsigned Size = VT.getSizeInBits();
58701 if (Size == 1) Size = 8;
58702 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58703 return std::make_pair(0, nullptr);
58704 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58705 if (DestReg.isValid()) {
58706 bool is64Bit = Subtarget.is64Bit();
58707 const TargetRegisterClass *RC =
58708 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58709 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58710 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58711 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58712 if (Size == 64 && !is64Bit) {
58713 // Model GCC's behavior here and select a fixed pair of 32-bit
58714 // registers.
58715 switch (DestReg) {
58716 case X86::RAX:
58717 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58718 case X86::RDX:
58719 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58720 case X86::RCX:
58721 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58722 case X86::RBX:
58723 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58724 case X86::RSI:
58725 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58726 case X86::RDI:
58727 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58728 case X86::RBP:
58729 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58730 default:
58731 return std::make_pair(0, nullptr);
58732 }
58733 }
58734 if (RC && RC->contains(DestReg))
58735 return std::make_pair(DestReg, RC);
58736 return Res;
58737 }
58738 // No register found/type mismatch.
58739 return std::make_pair(0, nullptr);
58740 } else if (isFRClass(*Class)) {
58741 // Handle references to XMM physical registers that got mapped into the
58742 // wrong class. This can happen with constraints like {xmm0} where the
58743 // target independent register mapper will just pick the first match it can
58744 // find, ignoring the required type.
58745
58746 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58747 if (VT == MVT::f16)
58748 Res.second = &X86::FR16XRegClass;
58749 else if (VT == MVT::f32 || VT == MVT::i32)
58750 Res.second = &X86::FR32XRegClass;
58751 else if (VT == MVT::f64 || VT == MVT::i64)
58752 Res.second = &X86::FR64XRegClass;
58753 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58754 Res.second = &X86::VR128XRegClass;
58755 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58756 Res.second = &X86::VR256XRegClass;
58757 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58758 Res.second = &X86::VR512RegClass;
58759 else {
58760 // Type mismatch and not a clobber: Return an error;
58761 Res.first = 0;
58762 Res.second = nullptr;
58763 }
58764 } else if (isVKClass(*Class)) {
58765 if (VT == MVT::i1)
58766 Res.second = &X86::VK1RegClass;
58767 else if (VT == MVT::i8)
58768 Res.second = &X86::VK8RegClass;
58769 else if (VT == MVT::i16)
58770 Res.second = &X86::VK16RegClass;
58771 else if (VT == MVT::i32)
58772 Res.second = &X86::VK32RegClass;
58773 else if (VT == MVT::i64)
58774 Res.second = &X86::VK64RegClass;
58775 else {
58776 // Type mismatch and not a clobber: Return an error;
58777 Res.first = 0;
58778 Res.second = nullptr;
58779 }
58780 }
58781
58782 return Res;
58783}
58784
58785bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58786 // Integer division on x86 is expensive. However, when aggressively optimizing
58787 // for code size, we prefer to use a div instruction, as it is usually smaller
58788 // than the alternative sequence.
58789 // The exception to this is vector division. Since x86 doesn't have vector
58790 // integer division, leaving the division as-is is a loss even in terms of
58791 // size, because it will have to be scalarized, while the alternative code
58792 // sequence can be performed in vector form.
58793 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58794 return OptSize && !VT.isVector();
58795}
58796
58797void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58798 if (!Subtarget.is64Bit())
58799 return;
58800
58801 // Update IsSplitCSR in X86MachineFunctionInfo.
58802 X86MachineFunctionInfo *AFI =
58803 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58804 AFI->setIsSplitCSR(true);
58805}
58806
58807void X86TargetLowering::insertCopiesSplitCSR(
58808 MachineBasicBlock *Entry,
58809 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58810 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58811 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58812 if (!IStart)
58813 return;
58814
58815 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58816 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58817 MachineBasicBlock::iterator MBBI = Entry->begin();
58818 for (const MCPhysReg *I = IStart; *I; ++I) {
58819 const TargetRegisterClass *RC = nullptr;
58820 if (X86::GR64RegClass.contains(*I))
58821 RC = &X86::GR64RegClass;
58822 else
58823 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58824
58825 Register NewVR = MRI->createVirtualRegister(RC);
58826 // Create copy from CSR to a virtual register.
58827 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58828 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58829 // nounwind. If we want to generalize this later, we may need to emit
58830 // CFI pseudo-instructions.
58831 assert(
58832 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58833 "Function should be nounwind in insertCopiesSplitCSR!");
58834 Entry->addLiveIn(*I);
58835 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
58836 .addReg(*I);
58837
58838 // Insert the copy-back instructions right before the terminator.
58839 for (auto *Exit : Exits)
58840 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
58841 TII->get(TargetOpcode::COPY), *I)
58842 .addReg(NewVR);
58843 }
58844}
58845
58846bool X86TargetLowering::supportSwiftError() const {
58847 return Subtarget.is64Bit();
58848}
58849
58850/// Returns true if stack probing through a function call is requested.
58851bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58852 return !getStackProbeSymbolName(MF).empty();
58853}
58854
58855/// Returns true if stack probing through inline assembly is requested.
58856bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58857
58858 // No inline stack probe for Windows, they have their own mechanism.
58859 if (Subtarget.isOSWindows() ||
58860 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58861 return false;
58862
58863 // If the function specifically requests inline stack probes, emit them.
58864 if (MF.getFunction().hasFnAttribute("probe-stack"))
58865 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58866 "inline-asm";
58867
58868 return false;
58869}
58870
58871/// Returns the name of the symbol used to emit stack probes or the empty
58872/// string if not applicable.
58873StringRef
58874X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58875 // Inline stack probes disable the stack probe call.
58876 if (hasInlineStackProbe(MF))
58877 return "";
58878
58879 // If the function specifically requests stack probes, emit them.
58880 if (MF.getFunction().hasFnAttribute("probe-stack"))
58881 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58882
58883 // Generally, if we aren't on Windows, the platform ABI does not include
58884 // support for stack probes, so don't emit them.
58885 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58886 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58887 return "";
58888
58889 // We need a stack probe to conform to the Windows ABI. Choose the right
58890 // symbol.
58891 if (Subtarget.is64Bit())
58892 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58893 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58894}
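
The final chkstk symbol choice reduces to a small two-by-two table; probeSymbolSketch below is a hypothetical standalone sketch of just that selection.

#include <cassert>
#include <string>

// Sketch of the Windows probe-symbol selection above: 64-bit vs. 32-bit,
// MinGW/Cygwin ("CygMing") vs. MSVC-style environments.
static std::string probeSymbolSketch(bool Is64Bit, bool IsCygMing) {
  if (Is64Bit)
    return IsCygMing ? "___chkstk_ms" : "__chkstk";
  return IsCygMing ? "_alloca" : "_chkstk";
}

int main() {
  assert(probeSymbolSketch(true, false) == "__chkstk");
  assert(probeSymbolSketch(false, true) == "_alloca");
}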
58895
58896unsigned
58897X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58898 // The default stack probe size is 4096 if the function has no stackprobesize
58899 // attribute.
58900 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58901 4096);
58902}
58903
58904Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58905 if (ML->isInnermost() &&
58906 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58907 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58908 return TargetLowering::getPrefLoopAlignment();
58909}