Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 17442, column 31
Division by zero
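
The expression flagged at line 17442, column 31 lies outside the excerpt reproduced below. For context, this diagnostic is produced by the core checkers enabled in the analyzer invocation that follows (core.DivideZero), which report an integer division or remainder whose divisor can be zero along some path the analyzer explores. The sketch below is a minimal, hypothetical reduction of that pattern together with the usual guard; the function names and the modulo operation are illustrative assumptions, not code taken from X86ISelLowering.cpp.

// Hypothetical reduction of a "Division by zero" finding; not the actual
// code at X86ISelLowering.cpp:17442.
#include <cassert>
#include <cstdio>

// If a caller can pass NumElts == 0, the modulo below divides by zero,
// which is the condition the analyzer reports.
static unsigned pickLane(unsigned Index, unsigned NumElts) {
  return Index % NumElts; // warning fires here when NumElts == 0
}

// Typical remediation: assert the invariant and bail out before dividing,
// so no path reaches the operation with a zero divisor.
static unsigned pickLaneGuarded(unsigned Index, unsigned NumElts) {
  assert(NumElts != 0 && "divisor must be non-zero");
  if (NumElts == 0)
    return 0; // defensive path for NDEBUG builds
  return Index % NumElts;
}

int main() {
  std::printf("%u %u\n", pickLane(5, 4), pickLaneGuarded(5, 4)); // prints "1 1"
  return 0;
}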

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1674602410 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-01-25-024556-16494-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// params/returns.
110static bool shouldDisableCalleeSavedRegisterCC(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
122 const X86Subtarget &STI)
123 : TargetLowering(TM), Subtarget(STI) {
124 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
125 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
126
127 // Set up the TargetLowering object.
128
129 // X86 is weird. It always uses i8 for shift amounts and setcc results.
130 setBooleanContents(ZeroOrOneBooleanContent);
131 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
132 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
133
134 // For 64-bit, since we have so many registers, use the ILP scheduler.
135 // For 32-bit, use the register pressure specific scheduling.
136 // For Atom, always use ILP scheduling.
137 if (Subtarget.isAtom())
138 setSchedulingPreference(Sched::ILP);
139 else if (Subtarget.is64Bit())
140 setSchedulingPreference(Sched::ILP);
141 else
142 setSchedulingPreference(Sched::RegPressure);
143 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
144 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
145
146 // Bypass expensive divides and use cheaper ones.
147 if (TM.getOptLevel() >= CodeGenOpt::Default) {
148 if (Subtarget.hasSlowDivide32())
149 addBypassSlowDiv(32, 8);
150 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
151 addBypassSlowDiv(64, 32);
152 }
153
154 // Setup Windows compiler runtime calls.
155 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
156 static const struct {
157 const RTLIB::Libcall Op;
158 const char * const Name;
159 const CallingConv::ID CC;
160 } LibraryCalls[] = {
161 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
162 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
163 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
164 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
165 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
166 };
167
168 for (const auto &LC : LibraryCalls) {
169 setLibcallName(LC.Op, LC.Name);
170 setLibcallCallingConv(LC.Op, LC.CC);
171 }
172 }
173
174 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
175 // MSVCRT doesn't have powi; fall back to pow
176 setLibcallName(RTLIB::POWI_F32, nullptr);
177 setLibcallName(RTLIB::POWI_F64, nullptr);
178 }
179
180 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
181 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
182 // FIXME: Should we be limiting the atomic size on other configs? Default is
183 // 1024.
184 if (!Subtarget.canUseCMPXCHG8B())
185 setMaxAtomicSizeInBitsSupported(32);
186
187 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
188
189 setMaxLargeFPConvertBitWidthSupported(128);
190
191 // Set up the register classes.
192 addRegisterClass(MVT::i8, &X86::GR8RegClass);
193 addRegisterClass(MVT::i16, &X86::GR16RegClass);
194 addRegisterClass(MVT::i32, &X86::GR32RegClass);
195 if (Subtarget.is64Bit())
196 addRegisterClass(MVT::i64, &X86::GR64RegClass);
197
198 for (MVT VT : MVT::integer_valuetypes())
199 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
200
201 // We don't accept any truncstore of integer registers.
202 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
203 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
204 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
205 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
206 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
207 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
208
209 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
210
211 // SETOEQ and SETUNE require checking two conditions.
212 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
213 setCondCodeAction(ISD::SETOEQ, VT, Expand);
214 setCondCodeAction(ISD::SETUNE, VT, Expand);
215 }
216
217 // Integer absolute.
218 if (Subtarget.canUseCMOV()) {
219 setOperationAction(ISD::ABS , MVT::i16 , Custom);
220 setOperationAction(ISD::ABS , MVT::i32 , Custom);
221 if (Subtarget.is64Bit())
222 setOperationAction(ISD::ABS , MVT::i64 , Custom);
223 }
224
225 // Signed saturation subtraction.
226 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
227 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
228 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
229 if (Subtarget.is64Bit())
230 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
231
232 // Funnel shifts.
233 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
234 // For slow shld targets we only lower for code size.
235 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
236
237 setOperationAction(ShiftOp , MVT::i8 , Custom);
238 setOperationAction(ShiftOp , MVT::i16 , Custom);
239 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
240 if (Subtarget.is64Bit())
241 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
242 }
243
244 if (!Subtarget.useSoftFloat()) {
245 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
246 // operation.
247 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
248 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
249 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
250 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
251 // We have an algorithm for SSE2, and we turn this into a 64-bit
252 // FILD or VCVTUSI2SS/SD for other targets.
253 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
254 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
255 // We have an algorithm for SSE2->double, and we turn this into a
256 // 64-bit FILD followed by conditional FADD for other targets.
257 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
258 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
259
260 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
261 // this operation.
262 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
263 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
264 // SSE has no i16 to fp conversion, only i32. We promote in the handler
265 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
267 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
268 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
271 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
272 // are Legal, f80 is custom lowered.
273 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
275
276 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
277 // this operation.
278 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
279 // FIXME: This doesn't generate invalid exception when it should. PR44019.
280 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
281 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
282 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
283 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
284 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
285 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
286 // are Legal, f80 is custom lowered.
287 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
288 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
289
290 // Handle FP_TO_UINT by promoting the destination to a larger signed
291 // conversion.
292 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
293 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
295 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
296 // FIXME: This doesn't generate invalid exception when it should. PR44019.
297 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
298 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
299 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
300 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
302
303 setOperationAction(ISD::LRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LRINT, MVT::f64, Custom);
305 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
306 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
307
308 if (!Subtarget.is64Bit()) {
309 setOperationAction(ISD::LRINT, MVT::i64, Custom);
310 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
311 }
312 }
313
314 if (Subtarget.hasSSE2()) {
315 // Custom lowering for saturating float to int conversions.
316 // We handle promotion to larger result types manually.
317 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
319 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
320 }
321 if (Subtarget.is64Bit()) {
322 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
323 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
324 }
325 }
326
327 // Handle address space casts between mixed sized pointers.
328 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
329 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
330
331 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
332 if (!Subtarget.hasSSE2()) {
333 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
334 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
335 if (Subtarget.is64Bit()) {
336 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
337 // Without SSE, i64->f64 goes through memory.
338 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
339 }
340 } else if (!Subtarget.is64Bit())
341 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
342
343 // Scalar integer divide and remainder are lowered to use operations that
344 // produce two results, to match the available instructions. This exposes
345 // the two-result form to trivial CSE, which is able to combine x/y and x%y
346 // into a single instruction.
347 //
348 // Scalar integer multiply-high is also lowered to use two-result
349 // operations, to match the available instructions. However, plain multiply
350 // (low) operations are left as Legal, as there are single-result
351 // instructions for this in x86. Using the two-result multiply instructions
352 // when both high and low results are needed must be arranged by dagcombine.
353 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
354 setOperationAction(ISD::MULHS, VT, Expand);
355 setOperationAction(ISD::MULHU, VT, Expand);
356 setOperationAction(ISD::SDIV, VT, Expand);
357 setOperationAction(ISD::UDIV, VT, Expand);
358 setOperationAction(ISD::SREM, VT, Expand);
359 setOperationAction(ISD::UREM, VT, Expand);
360 }
361
362 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
363 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
364 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
365 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
366 setOperationAction(ISD::BR_CC, VT, Expand);
367 setOperationAction(ISD::SELECT_CC, VT, Expand);
368 }
369 if (Subtarget.is64Bit())
370 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
372 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
373 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
374
375 setOperationAction(ISD::FREM , MVT::f32 , Expand);
376 setOperationAction(ISD::FREM , MVT::f64 , Expand);
377 setOperationAction(ISD::FREM , MVT::f80 , Expand);
378 setOperationAction(ISD::FREM , MVT::f128 , Expand);
379
380 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
381 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
382 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
383 }
384
385 // Promote the i8 variants and force them on up to i32 which has a shorter
386 // encoding.
387 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
388 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
389 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
390 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
391 // promote that too.
392 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
393 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
394
395 if (!Subtarget.hasBMI()) {
396 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
397 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
398 if (Subtarget.is64Bit()) {
399 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
400 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
401 }
402 }
403
404 if (Subtarget.hasLZCNT()) {
405 // When promoting the i8 variants, force them to i32 for a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
408 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
409 } else {
410 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
411 if (VT == MVT::i64 && !Subtarget.is64Bit())
412 continue;
413 setOperationAction(ISD::CTLZ , VT, Custom);
414 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
415 }
416 }
417
418 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
419 ISD::STRICT_FP_TO_FP16}) {
420 // Special handling for half-precision floating point conversions.
421 // If we don't have F16C support, then lower half float conversions
422 // into library calls.
423 setOperationAction(
424 Op, MVT::f32,
425 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
426 // There's never any support for operations beyond MVT::f32.
427 setOperationAction(Op, MVT::f64, Expand);
428 setOperationAction(Op, MVT::f80, Expand);
429 setOperationAction(Op, MVT::f128, Expand);
430 }
431
432 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
433 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
434 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
435 setTruncStoreAction(VT, MVT::f16, Expand);
436 setTruncStoreAction(VT, MVT::bf16, Expand);
437
438 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
439 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
440 }
441
442 setOperationAction(ISD::PARITY, MVT::i8, Custom);
443 setOperationAction(ISD::PARITY, MVT::i16, Custom);
444 setOperationAction(ISD::PARITY, MVT::i32, Custom);
445 if (Subtarget.is64Bit())
446 setOperationAction(ISD::PARITY, MVT::i64, Custom);
447 if (Subtarget.hasPOPCNT()) {
448 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
449 // popcntw is longer to encode than popcntl and also has a false dependency
450 // on the dest that popcntl hasn't had since Cannon Lake.
451 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
452 } else {
453 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
454 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
455 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
456 if (Subtarget.is64Bit())
457 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
458 else
459 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
460 }
461
462 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
463
464 if (!Subtarget.hasMOVBE())
465 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
466
467 // X86 wants to expand cmov itself.
468 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
469 setOperationAction(ISD::SELECT, VT, Custom);
470 setOperationAction(ISD::SETCC, VT, Custom);
471 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
472 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
473 }
474 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
475 if (VT == MVT::i64 && !Subtarget.is64Bit())
476 continue;
477 setOperationAction(ISD::SELECT, VT, Custom);
478 setOperationAction(ISD::SETCC, VT, Custom);
479 }
480
481 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
482 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
483 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
484
485 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
486 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
487 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
488 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
489 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
490 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
491 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
492 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
493
494 // Darwin ABI issue.
495 for (auto VT : { MVT::i32, MVT::i64 }) {
496 if (VT == MVT::i64 && !Subtarget.is64Bit())
497 continue;
498 setOperationAction(ISD::ConstantPool , VT, Custom);
499 setOperationAction(ISD::JumpTable , VT, Custom);
500 setOperationAction(ISD::GlobalAddress , VT, Custom);
501 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
502 setOperationAction(ISD::ExternalSymbol , VT, Custom);
503 setOperationAction(ISD::BlockAddress , VT, Custom);
504 }
505
506 // 64-bit shl, sra, srl (iff 32-bit x86)
507 for (auto VT : { MVT::i32, MVT::i64 }) {
508 if (VT == MVT::i64 && !Subtarget.is64Bit())
509 continue;
510 setOperationAction(ISD::SHL_PARTS, VT, Custom);
511 setOperationAction(ISD::SRA_PARTS, VT, Custom);
512 setOperationAction(ISD::SRL_PARTS, VT, Custom);
513 }
514
515 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
516 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
517
518 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
519
520 // Expand certain atomics
521 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
522 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
523 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
524 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
525 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
526 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
527 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
528 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
529 }
530
531 if (!Subtarget.is64Bit())
532 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
533
534 if (Subtarget.canUseCMPXCHG16B())
535 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
536
537 // FIXME - use subtarget debug flags
538 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
539 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
540 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
541 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
542 }
543
544 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
545 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
546
547 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
548 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
549
550 setOperationAction(ISD::TRAP, MVT::Other, Legal);
551 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
552 if (Subtarget.isTargetPS())
553 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
554 else
555 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
556
557 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
558 setOperationAction(ISD::VASTART , MVT::Other, Custom);
559 setOperationAction(ISD::VAEND , MVT::Other, Expand);
560 bool Is64Bit = Subtarget.is64Bit();
561 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
562 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
563
564 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
565 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
566
567 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
568
569 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
570 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
571 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
572
573 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
574
575 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
576 setOperationAction(ISD::FABS, VT, Action);
577 setOperationAction(ISD::FNEG, VT, Action);
578 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
579 setOperationAction(ISD::FREM, VT, Action);
580 setOperationAction(ISD::FMA, VT, Action);
581 setOperationAction(ISD::FMINNUM, VT, Action);
582 setOperationAction(ISD::FMAXNUM, VT, Action);
583 setOperationAction(ISD::FMINIMUM, VT, Action);
584 setOperationAction(ISD::FMAXIMUM, VT, Action);
585 setOperationAction(ISD::FSIN, VT, Action);
586 setOperationAction(ISD::FCOS, VT, Action);
587 setOperationAction(ISD::FSINCOS, VT, Action);
588 setOperationAction(ISD::FSQRT, VT, Action);
589 setOperationAction(ISD::FPOW, VT, Action);
590 setOperationAction(ISD::FLOG, VT, Action);
591 setOperationAction(ISD::FLOG2, VT, Action);
592 setOperationAction(ISD::FLOG10, VT, Action);
593 setOperationAction(ISD::FEXP, VT, Action);
594 setOperationAction(ISD::FEXP2, VT, Action);
595 setOperationAction(ISD::FCEIL, VT, Action);
596 setOperationAction(ISD::FFLOOR, VT, Action);
597 setOperationAction(ISD::FNEARBYINT, VT, Action);
598 setOperationAction(ISD::FRINT, VT, Action);
599 setOperationAction(ISD::BR_CC, VT, Action);
600 setOperationAction(ISD::SETCC, VT, Action);
601 setOperationAction(ISD::SELECT, VT, Custom);
602 setOperationAction(ISD::SELECT_CC, VT, Action);
603 setOperationAction(ISD::FROUND, VT, Action);
604 setOperationAction(ISD::FROUNDEVEN, VT, Action);
605 setOperationAction(ISD::FTRUNC, VT, Action);
606 };
607
608 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
609 // f16, f32 and f64 use SSE.
610 // Set up the FP register classes.
611 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
612 : &X86::FR16RegClass);
613 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
614 : &X86::FR32RegClass);
615 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
616 : &X86::FR64RegClass);
617
618 // Disable f32->f64 extload as we can only generate this in one instruction
619 // under optsize. So it's easier to pattern match (fpext (load)) for that
620 // case instead of needing to emit 2 instructions for extload in the
621 // non-optsize case.
622 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
623
624 for (auto VT : { MVT::f32, MVT::f64 }) {
625 // Use ANDPD to simulate FABS.
626 setOperationAction(ISD::FABS, VT, Custom);
627
628 // Use XORP to simulate FNEG.
629 setOperationAction(ISD::FNEG, VT, Custom);
630
631 // Use ANDPD and ORPD to simulate FCOPYSIGN.
632 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
633
634 // These might be better off as horizontal vector ops.
635 setOperationAction(ISD::FADD, VT, Custom);
636 setOperationAction(ISD::FSUB, VT, Custom);
637
638 // We don't support sin/cos/fmod
639 setOperationAction(ISD::FSIN , VT, Expand);
640 setOperationAction(ISD::FCOS , VT, Expand);
641 setOperationAction(ISD::FSINCOS, VT, Expand);
642 }
643
644 // Half type will be promoted by default.
645 setF16Action(MVT::f16, Promote);
646 setOperationAction(ISD::FADD, MVT::f16, Promote);
647 setOperationAction(ISD::FSUB, MVT::f16, Promote);
648 setOperationAction(ISD::FMUL, MVT::f16, Promote);
649 setOperationAction(ISD::FDIV, MVT::f16, Promote);
650 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
651 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
652 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
653
654 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
655 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
656 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
657 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
658 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
659 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
660 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
661 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
680 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
681 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
682
683 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
684 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
685
686 // Lower this to MOVMSK plus an AND.
687 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
688 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
689
690 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
691 (UseX87 || Is64Bit)) {
692 // Use SSE for f32, x87 for f64.
693 // Set up the FP register classes.
694 addRegisterClass(MVT::f32, &X86::FR32RegClass);
695 if (UseX87)
696 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
697
698 // Use ANDPS to simulate FABS.
699 setOperationAction(ISD::FABS , MVT::f32, Custom);
700
701 // Use XORP to simulate FNEG.
702 setOperationAction(ISD::FNEG , MVT::f32, Custom);
703
704 if (UseX87)
705 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
706
707 // Use ANDPS and ORPS to simulate FCOPYSIGN.
708 if (UseX87)
709 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
710 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
711
712 // We don't support sin/cos/fmod
713 setOperationAction(ISD::FSIN , MVT::f32, Expand);
714 setOperationAction(ISD::FCOS , MVT::f32, Expand);
715 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
716
717 if (UseX87) {
718 // Always expand sin/cos functions even though x87 has an instruction.
719 setOperationAction(ISD::FSIN, MVT::f64, Expand);
720 setOperationAction(ISD::FCOS, MVT::f64, Expand);
721 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
722 }
723 } else if (UseX87) {
724 // f32 and f64 in x87.
725 // Set up the FP register classes.
726 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
727 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
728
729 for (auto VT : { MVT::f32, MVT::f64 }) {
730 setOperationAction(ISD::UNDEF, VT, Expand);
731 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
732
733 // Always expand sin/cos functions even though x87 has an instruction.
734 setOperationAction(ISD::FSIN , VT, Expand);
735 setOperationAction(ISD::FCOS , VT, Expand);
736 setOperationAction(ISD::FSINCOS, VT, Expand);
737 }
738 }
739
740 // Expand FP32 immediates into loads from the stack, save special cases.
741 if (isTypeLegal(MVT::f32)) {
742 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
743 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
744 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
745 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
746 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
747 } else // SSE immediates.
748 addLegalFPImmediate(APFloat(+0.0f)); // xorps
749 }
750 // Expand FP64 immediates into loads from the stack, save special cases.
751 if (isTypeLegal(MVT::f64)) {
752 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
753 addLegalFPImmediate(APFloat(+0.0)); // FLD0
754 addLegalFPImmediate(APFloat(+1.0)); // FLD1
755 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
756 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
757 } else // SSE immediates.
758 addLegalFPImmediate(APFloat(+0.0)); // xorpd
759 }
760 // Support fp16 0 immediate.
761 if (isTypeLegal(MVT::f16))
762 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
763
764 // Handle constrained floating-point operations of scalar.
765 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
766 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
767 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
768 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
769 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
770 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
771 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
772 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
773 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
774 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
775 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
776 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
777
778 // We don't support FMA.
779 setOperationAction(ISD::FMA, MVT::f64, Expand);
780 setOperationAction(ISD::FMA, MVT::f32, Expand);
781
782 // f80 always uses X87.
783 if (UseX87) {
784 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
785 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
786 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
787 {
788 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
789 addLegalFPImmediate(TmpFlt); // FLD0
790 TmpFlt.changeSign();
791 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
792
793 bool ignored;
794 APFloat TmpFlt2(+1.0);
795 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
796 &ignored);
797 addLegalFPImmediate(TmpFlt2); // FLD1
798 TmpFlt2.changeSign();
799 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
800 }
801
802 // Always expand sin/cos functions even though x87 has an instruction.
803 setOperationAction(ISD::FSIN , MVT::f80, Expand);
804 setOperationAction(ISD::FCOS , MVT::f80, Expand);
805 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
806
807 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
808 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
809 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
810 setOperationAction(ISD::FRINT, MVT::f80, Expand);
811 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
812 setOperationAction(ISD::FMA, MVT::f80, Expand);
813 setOperationAction(ISD::LROUND, MVT::f80, Expand);
814 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
815 setOperationAction(ISD::LRINT, MVT::f80, Custom);
816 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
817
818 // Handle constrained floating-point operations of scalar.
819 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
820 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
821 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
822 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
823 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
824 if (isTypeLegal(MVT::f16)) {
825 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
826 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
827 } else {
828 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
829 }
830 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
831 // as Custom.
832 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
833 }
834
835 // f128 uses xmm registers, but most operations require libcalls.
836 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
837 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
838 : &X86::VR128RegClass);
839
840 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
841
842 setOperationAction(ISD::FADD, MVT::f128, LibCall);
843 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
844 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
845 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
846 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
847 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
848 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
849 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
850 setOperationAction(ISD::FMA, MVT::f128, LibCall);
851 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
852
853 setOperationAction(ISD::FABS, MVT::f128, Custom);
854 setOperationAction(ISD::FNEG, MVT::f128, Custom);
855 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
856
857 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
859 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
860 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
861 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
862 // No STRICT_FSINCOS
863 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
864 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
865
866 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
867 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
868 // We need to custom handle any FP_ROUND with an f128 input, but
869 // LegalizeDAG uses the result type to know when to run a custom handler.
870 // So we have to list all legal floating point result types here.
871 if (isTypeLegal(MVT::f32)) {
872 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
873 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
874 }
875 if (isTypeLegal(MVT::f64)) {
876 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
877 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
878 }
879 if (isTypeLegal(MVT::f80)) {
880 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
881 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
882 }
883
884 setOperationAction(ISD::SETCC, MVT::f128, Custom);
885
886 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
887 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
888 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
889 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
890 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
891 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
892 }
893
894 // Always use a library call for pow.
895 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
896 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
897 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
898 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
899
900 setOperationAction(ISD::FLOG, MVT::f80, Expand);
901 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
902 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
903 setOperationAction(ISD::FEXP, MVT::f80, Expand);
904 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
905 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
906 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
907
908 // Some FP actions are always expanded for vector types.
909 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
910 MVT::v4f32, MVT::v8f32, MVT::v16f32,
911 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
912 setOperationAction(ISD::FSIN, VT, Expand);
913 setOperationAction(ISD::FSINCOS, VT, Expand);
914 setOperationAction(ISD::FCOS, VT, Expand);
915 setOperationAction(ISD::FREM, VT, Expand);
916 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
917 setOperationAction(ISD::FPOW, VT, Expand);
918 setOperationAction(ISD::FLOG, VT, Expand);
919 setOperationAction(ISD::FLOG2, VT, Expand);
920 setOperationAction(ISD::FLOG10, VT, Expand);
921 setOperationAction(ISD::FEXP, VT, Expand);
922 setOperationAction(ISD::FEXP2, VT, Expand);
923 }
924
925 // First set operation action for all vector types to either promote
926 // (for widening) or expand (for scalarization). Then we will selectively
927 // turn on ones that can be effectively codegen'd.
928 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
929 setOperationAction(ISD::SDIV, VT, Expand);
930 setOperationAction(ISD::UDIV, VT, Expand);
931 setOperationAction(ISD::SREM, VT, Expand);
932 setOperationAction(ISD::UREM, VT, Expand);
933 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
934 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
935 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
936 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
937 setOperationAction(ISD::FMA, VT, Expand);
938 setOperationAction(ISD::FFLOOR, VT, Expand);
939 setOperationAction(ISD::FCEIL, VT, Expand);
940 setOperationAction(ISD::FTRUNC, VT, Expand);
941 setOperationAction(ISD::FRINT, VT, Expand);
942 setOperationAction(ISD::FNEARBYINT, VT, Expand);
943 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
944 setOperationAction(ISD::MULHS, VT, Expand);
945 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
946 setOperationAction(ISD::MULHU, VT, Expand);
947 setOperationAction(ISD::SDIVREM, VT, Expand);
948 setOperationAction(ISD::UDIVREM, VT, Expand);
949 setOperationAction(ISD::CTPOP, VT, Expand);
950 setOperationAction(ISD::CTTZ, VT, Expand);
951 setOperationAction(ISD::CTLZ, VT, Expand);
952 setOperationAction(ISD::ROTL, VT, Expand);
953 setOperationAction(ISD::ROTR, VT, Expand);
954 setOperationAction(ISD::BSWAP, VT, Expand);
955 setOperationAction(ISD::SETCC, VT, Expand);
956 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
957 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
958 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
959 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
960 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
961 setOperationAction(ISD::TRUNCATE, VT, Expand);
962 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
963 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
964 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
965 setOperationAction(ISD::SELECT_CC, VT, Expand);
966 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
967 setTruncStoreAction(InnerVT, VT, Expand);
968
969 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
970 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
971
972 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
973 // types, we have to deal with them whether we ask for Expansion or not.
974 // Setting Expand causes its own optimisation problems though, so leave
975 // them legal.
976 if (VT.getVectorElementType() == MVT::i1)
977 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
978
979 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
980 // split/scalarized right now.
981 if (VT.getVectorElementType() == MVT::f16 ||
982 VT.getVectorElementType() == MVT::bf16)
983 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
984 }
985 }
986
987 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
988 // with -msoft-float, disable use of MMX as well.
989 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
990 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
991 // No operations on x86mmx supported, everything uses intrinsics.
992 }
993
994 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
995 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
996 : &X86::VR128RegClass);
997
998 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
999 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1000 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1001 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1002 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1003 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1004 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1005 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1006
1007 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1008 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1009
1010 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1011 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1012 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1013 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1014 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1015 }
1016
1017 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1018 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1019 : &X86::VR128RegClass);
1020
1021 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1022 // registers cannot be used even for integer operations.
1023 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1024 : &X86::VR128RegClass);
1025 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1028 : &X86::VR128RegClass);
1029 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1030 : &X86::VR128RegClass);
1031 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1032 : &X86::VR128RegClass);
1033
1034 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1035 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1036 setOperationAction(ISD::SDIV, VT, Custom);
1037 setOperationAction(ISD::SREM, VT, Custom);
1038 setOperationAction(ISD::UDIV, VT, Custom);
1039 setOperationAction(ISD::UREM, VT, Custom);
1040 }
1041
1042 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1043 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1044 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1045
1046 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1047 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1048 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1049 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1050 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1051 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1052 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1053 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1054 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1055 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1056 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1057 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1058
1059 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1060 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1061 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1064 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1065 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1066
1067 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1068 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1069 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1070 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1071 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1072 }
1073
1074 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1075 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1076 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1077 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1078 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1079 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1080 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1081 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1082 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1083 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1084
1085 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1086 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1087 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1088 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1089
1090 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1091 setOperationAction(ISD::SETCC, VT, Custom);
1092 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1093 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1094 setOperationAction(ISD::CTPOP, VT, Custom);
1095 setOperationAction(ISD::ABS, VT, Custom);
1096
1097 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1098 // setcc all the way to isel and prefer SETGT in some isel patterns.
1099 setCondCodeAction(ISD::SETLT, VT, Custom);
1100 setCondCodeAction(ISD::SETLE, VT, Custom);
1101 }
1102
1103 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1104 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1105 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1106 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1107 setOperationAction(ISD::VSELECT, VT, Custom);
1108 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1109 }
1110
1111 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1112 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1113 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1114 setOperationAction(ISD::VSELECT, VT, Custom);
1115
1116 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1117 continue;
1118
1119 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1120 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1121 }
1122 setF16Action(MVT::v8f16, Expand);
1123 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1124 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1125 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1126 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1127
1128 // Custom lower v2i64 and v2f64 selects.
1129 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1130 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1131 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1132 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1133 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1134 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1135
1136 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1137 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1138 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1139 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1140 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1141 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1142
1143 // Custom legalize these to avoid over promotion or custom promotion.
1144 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1145 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1146 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1147 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1148 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1149 }
1150
1151 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1152 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1153 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1154 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1157 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1158
1159 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1160 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1161
1162 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1165 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1166 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1167
1168 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1169 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1170 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1171 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1172
1173 // We want to legalize this to an f64 load rather than an i64 load on
1174 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1175 // store.
1176 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1177 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1178 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1179 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1180 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1181 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1182
1183 // Add 32-bit vector stores to help vectorization opportunities.
1184 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1185 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1186
1187 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1188 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1189 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1190 if (!Subtarget.hasAVX512())
1191 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1192
1193 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1194 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1195 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1196
1197 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1198
1199 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1200 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1201 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1202 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1203 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1204 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1205
1206 // In the customized shift lowering, the legal v4i32/v2i64 cases
1207 // in AVX2 will be recognized.
1208 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1209 setOperationAction(ISD::SRL, VT, Custom);
1210 setOperationAction(ISD::SHL, VT, Custom);
1211 setOperationAction(ISD::SRA, VT, Custom);
1212 if (VT == MVT::v2i64) continue;
1213 setOperationAction(ISD::ROTL, VT, Custom);
1214 setOperationAction(ISD::ROTR, VT, Custom);
1215 setOperationAction(ISD::FSHL, VT, Custom);
1216 setOperationAction(ISD::FSHR, VT, Custom);
1217 }
1218
1219 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1220 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1221 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1222 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1223 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1224 }
1225
1226 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1227 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1228 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1229 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1230 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1231 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1232 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1233 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1234 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1235
1236 // These might be better off as horizontal vector ops.
1237 setOperationAction(ISD::ADD, MVT::i16, Custom);
1238 setOperationAction(ISD::ADD, MVT::i32, Custom);
1239 setOperationAction(ISD::SUB, MVT::i16, Custom);
1240 setOperationAction(ISD::SUB, MVT::i32, Custom);
1241 }
1242
1243 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1244 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1245 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1246 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1247 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1248 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1249 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1250 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1251 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1252 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1253 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1254 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1255 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1257
1258 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1259 }
1260
1261 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1262 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1263 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1264 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1265 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1266 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1267 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1268 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1269
1270 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1271 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1272 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1273
1274 // FIXME: Do we need to handle scalar-to-vector here?
1275 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1276 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1277
1278 // We directly match byte blends in the backend as they match the VSELECT
1279 // condition form.
1280 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1281
1282 // SSE41 brings specific instructions for doing vector sign extend even in
1283 // cases where we don't have SRA.
1284 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1285 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1286 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1287 }
1288
1289 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1290 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1291 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1292 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1293 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1294 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1295 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1296 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1297 }
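    // Marking these extending loads Legal lets instruction selection fold the
    // load into the PMOV[SZ]X instruction itself, so that, roughly, a
    // sign-extending v8i8 -> v8i16 load becomes a single pmovsxbw with a
    // memory operand rather than a separate load plus a register-form extend.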
1298
1299 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1300      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1301 // do the pre and post work in the vector domain.
1302 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1303 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1304 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1305 // so that DAG combine doesn't try to turn it into uint_to_fp.
1306 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1307 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1308 }
1309 }
1310
1311 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1312 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1313 }
1314
1315 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1316 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1317 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1318 setOperationAction(ISD::ROTL, VT, Custom);
1319 setOperationAction(ISD::ROTR, VT, Custom);
1320 }
1321
1322 // XOP can efficiently perform BITREVERSE with VPPERM.
1323 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1324 setOperationAction(ISD::BITREVERSE, VT, Custom);
1325
1326 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1327 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1328 setOperationAction(ISD::BITREVERSE, VT, Custom);
1329 }
1330
1331 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1332 bool HasInt256 = Subtarget.hasInt256();
1333
1334 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1335 : &X86::VR256RegClass);
1336 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1337 : &X86::VR256RegClass);
1338 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1339 : &X86::VR256RegClass);
1340 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1341 : &X86::VR256RegClass);
1342 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1343 : &X86::VR256RegClass);
1344 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1345 : &X86::VR256RegClass);
1346 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1347 : &X86::VR256RegClass);
1348
1349 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1350 setOperationAction(ISD::FFLOOR, VT, Legal);
1351 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1352 setOperationAction(ISD::FCEIL, VT, Legal);
1353 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1354 setOperationAction(ISD::FTRUNC, VT, Legal);
1355 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1356 setOperationAction(ISD::FRINT, VT, Legal);
1357 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1358 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1359 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1360 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1361 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1362
1363 setOperationAction(ISD::FROUND, VT, Custom);
1364
1365 setOperationAction(ISD::FNEG, VT, Custom);
1366 setOperationAction(ISD::FABS, VT, Custom);
1367 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1368 }
1369
1370 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1371 // even though v8i16 is a legal type.
1372 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1373 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1374 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1375 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1376 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1377 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1378 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1379
1380 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1381 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1382 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1383 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1384 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1385 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1386
1387 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1388 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1389 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1390 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1391 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1392 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1393 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1394 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1395 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1396 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1397 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1398
1399 if (!Subtarget.hasAVX512())
1400 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1401
1402 // In the customized shift lowering, the legal v8i32/v4i64 cases
1403 // in AVX2 will be recognized.
1404 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1405 setOperationAction(ISD::SRL, VT, Custom);
1406 setOperationAction(ISD::SHL, VT, Custom);
1407 setOperationAction(ISD::SRA, VT, Custom);
1408 if (VT == MVT::v4i64) continue;
1409 setOperationAction(ISD::ROTL, VT, Custom);
1410 setOperationAction(ISD::ROTR, VT, Custom);
1411 setOperationAction(ISD::FSHL, VT, Custom);
1412 setOperationAction(ISD::FSHR, VT, Custom);
1413 }
1414
1415 // These types need custom splitting if their input is a 128-bit vector.
1416 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1417 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1418 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1419 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1420
1421 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1422 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1423 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1424 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1425 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1426 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1427 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1428
1429 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1430 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1431 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1432 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1433 }
1434
1435 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1436 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1437 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1438 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1439
1440 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1441 setOperationAction(ISD::SETCC, VT, Custom);
1442 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1443 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1444 setOperationAction(ISD::CTPOP, VT, Custom);
1445 setOperationAction(ISD::CTLZ, VT, Custom);
1446
1447 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1448 // setcc all the way to isel and prefer SETGT in some isel patterns.
1449 setCondCodeAction(ISD::SETLT, VT, Custom);
1450 setCondCodeAction(ISD::SETLE, VT, Custom);
1451 }
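    // The Custom condition-code entries mean these predicates are rewritten
    // during lowering, e.g. SETLT(a, b) can be handled as SETGT(b, a), because
    // the SSE/AVX packed integer compares only provide equality and signed
    // greater-than directly.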
1452
1453 if (Subtarget.hasAnyFMA()) {
1454 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1455 MVT::v2f64, MVT::v4f64 }) {
1456 setOperationAction(ISD::FMA, VT, Legal);
1457 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1458 }
1459 }
1460
1461 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1462 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1463 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1464 }
1465
1466 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1467 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1468 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1469 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1470
1471 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1472 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1473 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1474 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1475 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1476 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1477 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1478 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1479
1480 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1481 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1482
1483 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1484 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1485 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1486 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1487 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1488
1489 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1490 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1493 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1494 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1496 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1497 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1498 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1499 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1500 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1501
1502 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1503 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1504 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1505 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1506 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1508 }
1509
1510 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1511 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1512 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1513 }
1514
1515 if (HasInt256) {
1516 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1517      // when we have a 256-bit wide blend with immediate.
1518 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1519 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1520
1521 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1522 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1523 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1524 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1525 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1526 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1527 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1528 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1529 }
1530 }
1531
1532 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1533 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1534 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1535 setOperationAction(ISD::MSTORE, VT, Legal);
1536 }
1537
1538 // Extract subvector is special because the value type
1539 // (result) is 128-bit but the source is 256-bit wide.
1540 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1541 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1542 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1543 }
1544
1545 // Custom lower several nodes for 256-bit types.
1546 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1547 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1550 setOperationAction(ISD::VSELECT, VT, Custom);
1551 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1552 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1553 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1554 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1555 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1556 setOperationAction(ISD::STORE, VT, Custom);
1557 }
1558 setF16Action(MVT::v16f16, Expand);
1559 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1560 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1561 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1562 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1563
1564 if (HasInt256) {
1565 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1566
1567 // Custom legalize 2x32 to get a little better code.
1568 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1569 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1570
1571 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1572 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1573 setOperationAction(ISD::MGATHER, VT, Custom);
1574 }
1575 }
1576
1577 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1578 Subtarget.hasF16C()) {
1579 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1580 setOperationAction(ISD::FP_ROUND, VT, Custom);
1581 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1582 }
1583 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1584 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1585 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1586 }
1587 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1588 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1589 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1590 }
1591
1592 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1593 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1594 }
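  // F16C only provides half<->single conversion instructions
  // (VCVTPH2PS/VCVTPS2PH) and no f16 arithmetic, which is why the conversions
  // above are custom-lowered while FADD/FSUB/FMUL/FDIV are promoted to the
  // corresponding f32 vector types.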
1595
1596 // This block controls legalization of the mask vector sizes that are
1597 // available with AVX512. 512-bit vectors are in a separate block controlled
1598 // by useAVX512Regs.
1599 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1600 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1601 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1602 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1603 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1604 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1605
1606 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1607 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1608 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1609
1610 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1611 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1612 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1613 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1614 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1615 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1616 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1617 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1618 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1619 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1620 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1621 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1622
1623 // There is no byte sized k-register load or store without AVX512DQ.
1624 if (!Subtarget.hasDQI()) {
1625 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1626 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1627 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1628 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1629
1630 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1631 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1632 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1633 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1634 }
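    // Background for the block above: AVX512F only has KMOVW (16-bit mask
    // moves); the byte-sized KMOVB needs AVX512DQ, so without DQI the mask
    // vectors of 8 or fewer elements are custom-lowered (presumably via a
    // wider mask or a GPR) instead of using a 1-byte k-register memory access.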
1635
1636 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1637 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1638 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1639 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1640 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1641 }
1642
1643 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1644 setOperationAction(ISD::VSELECT, VT, Expand);
1645
1646 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1647 setOperationAction(ISD::SETCC, VT, Custom);
1648 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1649 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1650 setOperationAction(ISD::SELECT, VT, Custom);
1651 setOperationAction(ISD::TRUNCATE, VT, Custom);
1652
1653 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1654 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1655 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1656 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1657 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1658 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1659 }
1660
1661 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1662 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1663 }
1664
1665  // This block controls legalization for 512-bit operations with 32/64-bit
1666  // elements. 512-bit use can be disabled based on the prefer-vector-width
1667  // and required-vector-width function attributes.
1668 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1669 bool HasBWI = Subtarget.hasBWI();
1670
1671 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1672 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1673 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1674 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1675 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1676 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1677 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1678
1679 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1680 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1681 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1682 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1683 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1684 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1685 if (HasBWI)
1686 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1687 }
1688
1689 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1690 setOperationAction(ISD::FNEG, VT, Custom);
1691 setOperationAction(ISD::FABS, VT, Custom);
1692 setOperationAction(ISD::FMA, VT, Legal);
1693 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1694 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1695 }
1696
1697 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1698 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1699 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1700 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1701 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1702 }
1703 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom);
1704 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom);
1705 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom);
1706 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom);
1707 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1708 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1709 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1710 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1711 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1712 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1713
1714 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1715 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1716 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1717 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1718 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1719 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1720 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1721 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1722 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1723 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1724 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1725
1726 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1727 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1728 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1729 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1730 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1731 if (HasBWI)
1732 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1733
1734 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1735 // to 512-bit rather than use the AVX2 instructions so that we can use
1736 // k-masks.
1737 if (!Subtarget.hasVLX()) {
1738 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1739 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1740 setOperationAction(ISD::MLOAD, VT, Custom);
1741 setOperationAction(ISD::MSTORE, VT, Custom);
1742 }
1743 }
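    // Without VLX there is no EVEX k-register masking for 128/256-bit memory
    // ops, so the alternatives are widening to a 512-bit masked load/store
    // (keeping the mask in a k-register) or falling back to the AVX/AVX2
    // VMASKMOV-style instructions, which take the mask in a vector register;
    // the comment above explains why widening is preferred here.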
1744
1745 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1746 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1747 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1748 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1749 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1750 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1751 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1752 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1753 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1754 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1755 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1756 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1757 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1758
1759 if (HasBWI) {
1760 // Extends from v64i1 masks to 512-bit vectors.
1761 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1762 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1763 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1764 }
1765
1766 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1767 setOperationAction(ISD::FFLOOR, VT, Legal);
1768 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1769 setOperationAction(ISD::FCEIL, VT, Legal);
1770 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1771 setOperationAction(ISD::FTRUNC, VT, Legal);
1772 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1773 setOperationAction(ISD::FRINT, VT, Legal);
1774 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1775 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1776 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1777 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1778 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1779
1780 setOperationAction(ISD::FROUND, VT, Custom);
1781 }
1782
1783 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1784 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1785 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1786 }
1787
1788 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1789 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1790 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1791 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1792
1793 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1794 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1795 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1796 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1797
1798 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1799 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1800 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1801 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1802 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1803 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1804 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1805 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1806
1807 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1808 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1809
1810 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1811
1812 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1813 setOperationAction(ISD::SRL, VT, Custom);
1814 setOperationAction(ISD::SHL, VT, Custom);
1815 setOperationAction(ISD::SRA, VT, Custom);
1816 setOperationAction(ISD::ROTL, VT, Custom);
1817 setOperationAction(ISD::ROTR, VT, Custom);
1818 setOperationAction(ISD::SETCC, VT, Custom);
1819
1820 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1821 // setcc all the way to isel and prefer SETGT in some isel patterns.
1822 setCondCodeAction(ISD::SETLT, VT, Custom);
1823 setCondCodeAction(ISD::SETLE, VT, Custom);
1824 }
1825 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1826 setOperationAction(ISD::SMAX, VT, Legal);
1827 setOperationAction(ISD::UMAX, VT, Legal);
1828 setOperationAction(ISD::SMIN, VT, Legal);
1829 setOperationAction(ISD::UMIN, VT, Legal);
1830 setOperationAction(ISD::ABS, VT, Legal);
1831 setOperationAction(ISD::CTPOP, VT, Custom);
1832 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1833 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1834 }
1835
1836 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1837 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1839 setOperationAction(ISD::CTLZ, VT, Custom);
1840 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1841 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1842 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1843 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1844 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1845 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1846 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1847 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1848 }
1849
1850 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1851 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1852 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1853 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1854 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1855 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1856
1857 if (Subtarget.hasDQI()) {
1858 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1859 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1860 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1861 setOperationAction(Opc, MVT::v8i64, Custom);
1862 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1863 }
1864
1865 if (Subtarget.hasCDI()) {
1866      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1867 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1868 setOperationAction(ISD::CTLZ, VT, Legal);
1869 }
1870 } // Subtarget.hasCDI()
1871
1872 if (Subtarget.hasVPOPCNTDQ()) {
1873 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1874 setOperationAction(ISD::CTPOP, VT, Legal);
1875 }
1876
1877 // Extract subvector is special because the value type
1878 // (result) is 256-bit but the source is 512-bit wide.
1879 // 128-bit was made Legal under AVX1.
1880 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1881 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1882 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1883
1884 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1885 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1886 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1887 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1888 setOperationAction(ISD::SELECT, VT, Custom);
1889 setOperationAction(ISD::VSELECT, VT, Custom);
1890 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1891 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1892 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1893 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1894 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1895 }
1896 setF16Action(MVT::v32f16, Expand);
1897 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1898 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1899 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1900 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1901 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1902 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1903 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1904 }
1905
1906 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1907 setOperationAction(ISD::MLOAD, VT, Legal);
1908 setOperationAction(ISD::MSTORE, VT, Legal);
1909 setOperationAction(ISD::MGATHER, VT, Custom);
1910 setOperationAction(ISD::MSCATTER, VT, Custom);
1911 }
1912 if (HasBWI) {
1913 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1914 setOperationAction(ISD::MLOAD, VT, Legal);
1915 setOperationAction(ISD::MSTORE, VT, Legal);
1916 }
1917 } else {
1918 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1919 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1920 }
1921
1922 if (Subtarget.hasVBMI2()) {
1923 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1924 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1925 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1926 setOperationAction(ISD::FSHL, VT, Custom);
1927 setOperationAction(ISD::FSHR, VT, Custom);
1928 }
1929
1930 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1931 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1932 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1933 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1934 }
1935 }// useAVX512Regs
1936
1937 // This block controls legalization for operations that don't have
1938 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1939 // narrower widths.
1940 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1941 // These operations are handled on non-VLX by artificially widening in
1942 // isel patterns.
1943
1944 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1945 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1946 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1947
1948 if (Subtarget.hasDQI()) {
1949 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1950 // v2f32 UINT_TO_FP is already custom under SSE2.
1951      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1952             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1953             "Unexpected operation action!");
1954 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1955 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1956 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1957 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1958 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1959 }
1960
1961 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1962 setOperationAction(ISD::SMAX, VT, Legal);
1963 setOperationAction(ISD::UMAX, VT, Legal);
1964 setOperationAction(ISD::SMIN, VT, Legal);
1965 setOperationAction(ISD::UMIN, VT, Legal);
1966 setOperationAction(ISD::ABS, VT, Legal);
1967 }
1968
1969 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1970 setOperationAction(ISD::ROTL, VT, Custom);
1971 setOperationAction(ISD::ROTR, VT, Custom);
1972 }
1973
1974 // Custom legalize 2x32 to get a little better code.
1975 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1976 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1977
1978 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1979 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1980 setOperationAction(ISD::MSCATTER, VT, Custom);
1981
1982 if (Subtarget.hasDQI()) {
1983 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1984 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1985 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
1986 setOperationAction(Opc, MVT::v2i64, Custom);
1987 setOperationAction(Opc, MVT::v4i64, Custom);
1988 }
1989 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1990 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1991 }
1992
1993 if (Subtarget.hasCDI()) {
1994 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1995 setOperationAction(ISD::CTLZ, VT, Legal);
1996 }
1997 } // Subtarget.hasCDI()
1998
1999 if (Subtarget.hasVPOPCNTDQ()) {
2000 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2001 setOperationAction(ISD::CTPOP, VT, Legal);
2002 }
2003 }
2004
2005  // This block controls legalization of v32i1/v64i1, which are available with
2006 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2007 // useBWIRegs.
2008 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2009 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2010 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2011
2012 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2013 setOperationAction(ISD::VSELECT, VT, Expand);
2014 setOperationAction(ISD::TRUNCATE, VT, Custom);
2015 setOperationAction(ISD::SETCC, VT, Custom);
2016 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2017 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2018 setOperationAction(ISD::SELECT, VT, Custom);
2019 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2020 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2021 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2022 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2023 }
2024
2025 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2026 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2027
2028 // Extends from v32i1 masks to 256-bit vectors.
2029 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2030 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2031 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2032
2033 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2034 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2035 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2036 }
2037
2038 // These operations are handled on non-VLX by artificially widening in
2039 // isel patterns.
2040 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2041
2042 if (Subtarget.hasBITALG()) {
2043 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2044 setOperationAction(ISD::CTPOP, VT, Legal);
2045 }
2046 }
2047
2048 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2049 auto setGroup = [&] (MVT VT) {
2050 setOperationAction(ISD::FADD, VT, Legal);
2051 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2052 setOperationAction(ISD::FSUB, VT, Legal);
2053 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2054 setOperationAction(ISD::FMUL, VT, Legal);
2055 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2056 setOperationAction(ISD::FDIV, VT, Legal);
2057 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2058 setOperationAction(ISD::FSQRT, VT, Legal);
2059 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2060
2061 setOperationAction(ISD::FFLOOR, VT, Legal);
2062 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2063 setOperationAction(ISD::FCEIL, VT, Legal);
2064 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2065 setOperationAction(ISD::FTRUNC, VT, Legal);
2066 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2067 setOperationAction(ISD::FRINT, VT, Legal);
2068 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2069 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2070 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2071
2072 setOperationAction(ISD::LOAD, VT, Legal);
2073 setOperationAction(ISD::STORE, VT, Legal);
2074
2075 setOperationAction(ISD::FMA, VT, Legal);
2076 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2077 setOperationAction(ISD::VSELECT, VT, Legal);
2078 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2079 setOperationAction(ISD::SELECT, VT, Custom);
2080
2081 setOperationAction(ISD::FNEG, VT, Custom);
2082 setOperationAction(ISD::FABS, VT, Custom);
2083 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2084 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2085 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2086 };
2087
2088 // AVX512_FP16 scalar operations
2089 setGroup(MVT::f16);
2090 setOperationAction(ISD::FREM, MVT::f16, Promote);
2091 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2092 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2093 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2094 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2095 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2096 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2097 setOperationAction(ISD::FROUND, MVT::f16, Custom);
2098 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2099 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2100 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2101 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2102 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2103 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2104 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2105
2106 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2107 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2108
2109 if (Subtarget.useAVX512Regs()) {
2110 setGroup(MVT::v32f16);
2111 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2112 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2113 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2114 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2115 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2116 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2117 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2118 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2119 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2120 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2121 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2122 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2123
2124 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2125 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2126 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2127 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2128 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2129 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2130 MVT::v32i16);
2131 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2132 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2133 MVT::v32i16);
2134 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2135 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2136 MVT::v32i16);
2137 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2138 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2139 MVT::v32i16);
2140
2141 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2142 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2143 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2144
2145 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2146 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2147
2148 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2149 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2150 }
2151
2152 if (Subtarget.hasVLX()) {
2153 setGroup(MVT::v8f16);
2154 setGroup(MVT::v16f16);
2155
2156 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2157 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2158 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2159 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2160 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2161 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2162 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2163 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2164 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2165 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2166
2167 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2168 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2169 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2170 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2171 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2172 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2173 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2174 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2175 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2176 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2177
2178 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2179 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2180 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2181
2182 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2183 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2184 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2185
2186 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2187 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2188 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2189 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2190
2191 // Need to custom widen these to prevent scalarization.
2192 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2193 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2194 }
2195 }
2196
2197 if (!Subtarget.useSoftFloat() &&
2198 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2199 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2200 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2201 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2202    // provide a method to promote BUILD_VECTOR, so set its operation action
2203    // to Custom and do the customization later.
2204 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2205 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2206 setF16Action(VT, Expand);
2207 setOperationAction(ISD::FADD, VT, Expand);
2208 setOperationAction(ISD::FSUB, VT, Expand);
2209 setOperationAction(ISD::FMUL, VT, Expand);
2210 setOperationAction(ISD::FDIV, VT, Expand);
2211 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2212 }
2213 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2214 }
2215
2216 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2217 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2218 setF16Action(MVT::v32bf16, Expand);
2219 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2220 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2221 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2222 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2223 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2224 }
2225
2226 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2227 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2228 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2229 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2230 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2231 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2232
2233 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2234 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2235 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2236 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2237 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2238
2239 if (Subtarget.hasBWI()) {
2240 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2241 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2242 }
2243
2244 if (Subtarget.hasFP16()) {
2245 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2246 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2247 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2248 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2249 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2250 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2251 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2252 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2253 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2254 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2255 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2256 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2257 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2258 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2259 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2260 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2261 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2262 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2263 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2264 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2265 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2266 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2267 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2268 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2269 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2270 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2271 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2272 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2273 }
2274
2275 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2276 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2277 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2278 }
2279
2280 if (Subtarget.hasAMXTILE()) {
2281 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2282 }
2283
2284 // We want to custom lower some of our intrinsics.
2285 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2286 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2287 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2288 if (!Subtarget.is64Bit()) {
2289 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2290 }
2291
2292  // Only custom-lower i64 SADDO and friends on 64-bit targets because we don't
2293 // handle type legalization for these operations here.
2294 //
2295 // FIXME: We really should do custom legalization for addition and
2296 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2297 // than generic legalization for 64-bit multiplication-with-overflow, though.
2298 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2299 if (VT == MVT::i64 && !Subtarget.is64Bit())
2300 continue;
2301 // Add/Sub/Mul with overflow operations are custom lowered.
2302 setOperationAction(ISD::SADDO, VT, Custom);
2303 setOperationAction(ISD::UADDO, VT, Custom);
2304 setOperationAction(ISD::SSUBO, VT, Custom);
2305 setOperationAction(ISD::USUBO, VT, Custom);
2306 setOperationAction(ISD::SMULO, VT, Custom);
2307 setOperationAction(ISD::UMULO, VT, Custom);
2308
2309 // Support carry in as value rather than glue.
2310 setOperationAction(ISD::ADDCARRY, VT, Custom);
2311 setOperationAction(ISD::SUBCARRY, VT, Custom);
2312 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2313 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2314 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2315 }
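  // These Custom entries back the overflow intrinsics; a rough IR-level view
  // of what reaches this lowering:
  //   %pair = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  // which X86 can turn into a flag-producing add plus a use of the overflow
  // flag (e.g. SETO) rather than a compare-based expansion.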
2316
2317 if (!Subtarget.is64Bit()) {
2318 // These libcalls are not available in 32-bit.
2319 setLibcallName(RTLIB::SHL_I128, nullptr);
2320 setLibcallName(RTLIB::SRL_I128, nullptr);
2321 setLibcallName(RTLIB::SRA_I128, nullptr);
2322 setLibcallName(RTLIB::MUL_I128, nullptr);
2323 // The MULO libcall is not part of libgcc, only compiler-rt.
2324 setLibcallName(RTLIB::MULO_I64, nullptr);
2325 }
2326 // The MULO libcall is not part of libgcc, only compiler-rt.
2327 setLibcallName(RTLIB::MULO_I128, nullptr);
2328
2329 // Combine sin / cos into _sincos_stret if it is available.
2330 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2331 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2332 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2333 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2334 }
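  // __sincos_stret is an Apple libm entry point that returns sin and cos
  // together; its libcall name is only set up for targets that actually
  // provide it, so the guard above effectively limits this to Darwin.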
2335
2336 if (Subtarget.isTargetWin64()) {
2337 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2338 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2339 setOperationAction(ISD::SREM, MVT::i128, Custom);
2340 setOperationAction(ISD::UREM, MVT::i128, Custom);
2341 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2342 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2343 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2344 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2345 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2346 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2347 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2348 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2349 }
2350
2351  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2352 // is. We should promote the value to 64-bits to solve this.
2353 // This is what the CRT headers do - `fmodf` is an inline header
2354 // function casting to f64 and calling `fmod`.
2355 if (Subtarget.is32Bit() &&
2356 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2357 for (ISD::NodeType Op :
2358 {ISD::FCEIL, ISD::STRICT_FCEIL,
2359 ISD::FCOS, ISD::STRICT_FCOS,
2360 ISD::FEXP, ISD::STRICT_FEXP,
2361 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2362 ISD::FREM, ISD::STRICT_FREM,
2363 ISD::FLOG, ISD::STRICT_FLOG,
2364 ISD::FLOG10, ISD::STRICT_FLOG10,
2365 ISD::FPOW, ISD::STRICT_FPOW,
2366 ISD::FSIN, ISD::STRICT_FSIN})
2367 if (isOperationExpand(Op, MVT::f32))
2368 setOperationAction(Op, MVT::f32, Promote);
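// Illustrative note (not part of the upstream source): with the Promote
// action set above, an f32 FREM on 32-bit MSVC is widened to f64, lowered to
// the fmod libcall, and the result is truncated back to f32, which matches
// what the CRT's inline fmodf wrapper does.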
2369
2370 // We have target-specific dag combine patterns for the following nodes:
2371 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2372 ISD::SCALAR_TO_VECTOR,
2373 ISD::INSERT_VECTOR_ELT,
2374 ISD::EXTRACT_VECTOR_ELT,
2375 ISD::CONCAT_VECTORS,
2376 ISD::INSERT_SUBVECTOR,
2377 ISD::EXTRACT_SUBVECTOR,
2378 ISD::BITCAST,
2379 ISD::VSELECT,
2380 ISD::SELECT,
2381 ISD::SHL,
2382 ISD::SRA,
2383 ISD::SRL,
2384 ISD::OR,
2385 ISD::AND,
2386 ISD::ADD,
2387 ISD::FADD,
2388 ISD::FSUB,
2389 ISD::FNEG,
2390 ISD::FMA,
2391 ISD::STRICT_FMA,
2392 ISD::FMINNUM,
2393 ISD::FMAXNUM,
2394 ISD::SUB,
2395 ISD::LOAD,
2396 ISD::MLOAD,
2397 ISD::STORE,
2398 ISD::MSTORE,
2399 ISD::TRUNCATE,
2400 ISD::ZERO_EXTEND,
2401 ISD::ANY_EXTEND,
2402 ISD::SIGN_EXTEND,
2403 ISD::SIGN_EXTEND_INREG,
2404 ISD::ANY_EXTEND_VECTOR_INREG,
2405 ISD::SIGN_EXTEND_VECTOR_INREG,
2406 ISD::ZERO_EXTEND_VECTOR_INREG,
2407 ISD::SINT_TO_FP,
2408 ISD::UINT_TO_FP,
2409 ISD::STRICT_SINT_TO_FP,
2410 ISD::STRICT_UINT_TO_FP,
2411 ISD::SETCC,
2412 ISD::MUL,
2413 ISD::XOR,
2414 ISD::MSCATTER,
2415 ISD::MGATHER,
2416 ISD::FP16_TO_FP,
2417 ISD::FP_EXTEND,
2418 ISD::STRICT_FP_EXTEND,
2419 ISD::FP_ROUND,
2420 ISD::STRICT_FP_ROUND});
2421
2422 computeRegisterProperties(Subtarget.getRegisterInfo());
2423
2424 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2425 MaxStoresPerMemsetOptSize = 8;
2426 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2427 MaxStoresPerMemcpyOptSize = 4;
2428 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2429 MaxStoresPerMemmoveOptSize = 4;
2430
2431 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2432 // that needs to be benchmarked and balanced with the potential use of vector
2433 // load/store types (PR33329, PR33914).
2434 MaxLoadsPerMemcmp = 2;
2435 MaxLoadsPerMemcmpOptSize = 2;
2436
2437 // Default loop alignment, which can be overridden by -align-loops.
2438 setPrefLoopAlignment(Align(16));
2439
2440 // An out-of-order CPU can speculatively execute past a predictable branch,
2441 // but a conditional move could be stalled by an expensive earlier operation.
2442 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2443 EnableExtLdPromotion = true;
2444 setPrefFunctionAlignment(Align(16));
2445
2446 verifyIntrinsicTables();
2447
2448 // Default to having -disable-strictnode-mutation on
2449 IsStrictFPEnabled = true;
2450}
2451
2452// This has so far only been implemented for 64-bit MachO.
2453bool X86TargetLowering::useLoadStackGuardNode() const {
2454 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2455}
2456
2457bool X86TargetLowering::useStackGuardXorFP() const {
2458 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2459 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2460}
2461
2462SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2463 const SDLoc &DL) const {
2464 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2465 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2466 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2467 return SDValue(Node, 0);
2468}
2469
2470TargetLoweringBase::LegalizeTypeAction
2471X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2472 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2473 !Subtarget.hasBWI())
2474 return TypeSplitVector;
2475
2476 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2477 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2478 return TypeSplitVector;
2479
2480 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2481 VT.getVectorElementType() != MVT::i1)
2482 return TypeWidenVector;
2483
2484 return TargetLoweringBase::getPreferredVectorAction(VT);
2485}
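// Illustrative note (not part of the upstream source): under this hook,
// v32i1/v64i1 are split on AVX-512 targets without BWI (no wide k-register
// support), f16 vectors are split when F16C is unavailable, and most other
// short vectors with non-i1 elements are widened to the next legal width.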
2486
2487static std::pair<MVT, unsigned>
2488handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2489 const X86Subtarget &Subtarget) {
2490 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2491 // convention is one that uses k registers.
2492 if (NumElts == 2)
2493 return {MVT::v2i64, 1};
2494 if (NumElts == 4)
2495 return {MVT::v4i32, 1};
2496 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2497 CC != CallingConv::Intel_OCL_BI)
2498 return {MVT::v8i16, 1};
2499 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2500 CC != CallingConv::Intel_OCL_BI)
2501 return {MVT::v16i8, 1};
2502 // v32i1 passes in ymm unless we have BWI and the calling convention is
2503 // regcall.
2504 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2505 return {MVT::v32i8, 1};
2506 // Split v64i1 vectors if we don't have v64i8 available.
2507 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2508 if (Subtarget.useAVX512Regs())
2509 return {MVT::v64i8, 1};
2510 return {MVT::v32i8, 2};
2511 }
2512
2513 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2514 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2515 NumElts > 64)
2516 return {MVT::i8, NumElts};
2517
2518 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2519}
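// Illustrative note (not part of the upstream source): a hedged usage sketch
// of the helper above, assuming the C calling convention on an AVX-512
// subtarget without BWI:
//   MVT RegVT; unsigned NumRegs;
//   std::tie(RegVT, NumRegs) =
//       handleMaskRegisterForCallingConv(32, CallingConv::C, Subtarget);
//   // RegVT == MVT::v32i8, NumRegs == 1; with NumElts == 64 the same call
//   // would instead return {MVT::i8, 64}, since v64i8 needs BWI.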
2520
2521MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2522 CallingConv::ID CC,
2523 EVT VT) const {
2524 if (VT.isVector()) {
2525 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2526 unsigned NumElts = VT.getVectorNumElements();
2527
2528 MVT RegisterVT;
2529 unsigned NumRegisters;
2530 std::tie(RegisterVT, NumRegisters) =
2531 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2532 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2533 return RegisterVT;
2534 }
2535
2536 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2537 return MVT::v8f16;
2538 }
2539
2540 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2541 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2542 !Subtarget.hasX87())
2543 return MVT::i32;
2544
2545 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2546 return getRegisterTypeForCallingConv(Context, CC,
2547 VT.changeVectorElementTypeToInteger());
2548
2549 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2550}
2551
2552unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2553 CallingConv::ID CC,
2554 EVT VT) const {
2555 if (VT.isVector()) {
2556 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2557 unsigned NumElts = VT.getVectorNumElements();
2558
2559 MVT RegisterVT;
2560 unsigned NumRegisters;
2561 std::tie(RegisterVT, NumRegisters) =
2562 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2563 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2564 return NumRegisters;
2565 }
2566
2567 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2568 return 1;
2569 }
2570
2571 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2572 // x87 is disabled.
2573 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2574 if (VT == MVT::f64)
2575 return 2;
2576 if (VT == MVT::f80)
2577 return 3;
2578 }
2579
2580 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2581 return getNumRegistersForCallingConv(Context, CC,
2582 VT.changeVectorElementTypeToInteger());
2583
2584 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2585}
2586
2587unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2588 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2589 unsigned &NumIntermediates, MVT &RegisterVT) const {
2590 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2591 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2592 Subtarget.hasAVX512() &&
2593 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2594 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2595 VT.getVectorNumElements() > 64)) {
2596 RegisterVT = MVT::i8;
2597 IntermediateVT = MVT::i1;
2598 NumIntermediates = VT.getVectorNumElements();
2599 return NumIntermediates;
2600 }
2601
2602 // Split v64i1 vectors if we don't have v64i8 available.
2603 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2604 CC != CallingConv::X86_RegCall) {
2605 RegisterVT = MVT::v32i8;
2606 IntermediateVT = MVT::v32i1;
2607 NumIntermediates = 2;
2608 return 2;
2609 }
2610
2611 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2612 NumIntermediates, RegisterVT);
2613}
2614
2615EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2616 LLVMContext& Context,
2617 EVT VT) const {
2618 if (!VT.isVector())
2619 return MVT::i8;
2620
2621 if (Subtarget.hasAVX512()) {
2622 // Figure out what this type will be legalized to.
2623 EVT LegalVT = VT;
2624 while (getTypeAction(Context, LegalVT) != TypeLegal)
2625 LegalVT = getTypeToTransformTo(Context, LegalVT);
2626
2627 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2628 if (LegalVT.getSimpleVT().is512BitVector())
2629 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2630
2631 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2632 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2633 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2634 // vXi16/vXi8.
2635 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2636 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2637 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2638 }
2639 }
2640
2641 return VT.changeVectorElementTypeToInteger();
2642}
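// Illustrative note (not part of the upstream source): for example, a setcc
// on v8i32 yields v8i1 with AVX512VL (the compare lands in a k register) but
// v8i32 on plain AVX2, where vector compares produce full-width element masks.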
2643
2644/// Helper for getByValTypeAlignment to determine
2645/// the desired ByVal argument alignment.
2646static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2647 if (MaxAlign == 16)
2648 return;
2649 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2650 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2651 MaxAlign = Align(16);
2652 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2653 Align EltAlign;
2654 getMaxByValAlign(ATy->getElementType(), EltAlign);
2655 if (EltAlign > MaxAlign)
2656 MaxAlign = EltAlign;
2657 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2658 for (auto *EltTy : STy->elements()) {
2659 Align EltAlign;
2660 getMaxByValAlign(EltTy, EltAlign);
2661 if (EltAlign > MaxAlign)
2662 MaxAlign = EltAlign;
2663 if (MaxAlign == 16)
2664 break;
2665 }
2666 }
2667}
2668
2669/// Return the desired alignment for ByVal aggregate
2670/// function arguments in the caller parameter area. For X86, aggregates
2671/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2672/// are at 4-byte boundaries.
2673uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2674 const DataLayout &DL) const {
2675 if (Subtarget.is64Bit()) {
2676 // Max of 8 and alignment of type.
2677 Align TyAlign = DL.getABITypeAlign(Ty);
2678 if (TyAlign > 8)
2679 return TyAlign.value();
2680 return 8;
2681 }
2682
2683 Align Alignment(4);
2684 if (Subtarget.hasSSE1())
2685 getMaxByValAlign(Ty, Alignment);
2686 return Alignment.value();
2687}
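// Illustrative note (not part of the upstream source): on 32-bit x86 with
// SSE, a byval struct containing a 128-bit vector member is bumped to 16-byte
// alignment by getMaxByValAlign, while a struct of plain integers keeps the
// default 4; on x86-64 the result is max(8, ABI alignment of the type).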
2688
2689/// It returns EVT::Other if the type should be determined using generic
2690/// target-independent logic.
2691/// For vector ops we check that the overall size isn't larger than our
2692/// preferred vector width.
2693EVT X86TargetLowering::getOptimalMemOpType(
2694 const MemOp &Op, const AttributeList &FuncAttributes) const {
2695 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2696 if (Op.size() >= 16 &&
2697 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2698 // FIXME: Check if unaligned 64-byte accesses are slow.
2699 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2700 (Subtarget.getPreferVectorWidth() >= 512)) {
2701 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2702 }
2703 // FIXME: Check if unaligned 32-byte accesses are slow.
2704 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2705 Subtarget.useLight256BitInstructions()) {
2706 // Although this isn't a well-supported type for AVX1, we'll let
2707 // legalization and shuffle lowering produce the optimal codegen. If we
2708 // choose an optimal type with a vector element larger than a byte,
2709 // getMemsetStores() may create an intermediate splat (using an integer
2710 // multiply) before we splat as a vector.
2711 return MVT::v32i8;
2712 }
2713 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2714 return MVT::v16i8;
2715 // TODO: Can SSE1 handle a byte vector?
2716 // If we have SSE1 registers we should be able to use them.
2717 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2718 (Subtarget.getPreferVectorWidth() >= 128))
2719 return MVT::v4f32;
2720 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2721 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2722 // Do not use f64 to lower memcpy if source is string constant. It's
2723 // better to use i32 to avoid the loads.
2724 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2725 // The gymnastics of splatting a byte value into an XMM register and then
2726 // only using 8-byte stores (because this is a CPU with slow unaligned
2727 // 16-byte accesses) makes that a loser.
2728 return MVT::f64;
2729 }
2730 }
2731 // This is a compromise. If we reach here, unaligned accesses may be slow on
2732 // this target. However, creating smaller, aligned accesses could be even
2733 // slower and would certainly be a lot more code.
2734 if (Subtarget.is64Bit() && Op.size() >= 8)
2735 return MVT::i64;
2736 return MVT::i32;
2737}
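// Illustrative note (not part of the upstream source): e.g. a 64-byte memset
// is emitted with v64i8 stores on an AVX-512BW target that prefers 512-bit
// vectors, with v16i8 on a baseline SSE2 target, and with i64/i32 stores when
// vector registers are unavailable (e.g. under NoImplicitFloat).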
2738
2739bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2740 if (VT == MVT::f32)
2741 return Subtarget.hasSSE1();
2742 if (VT == MVT::f64)
2743 return Subtarget.hasSSE2();
2744 return true;
2745}
2746
2747static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2748 return (8 * Alignment.value()) % SizeInBits == 0;
2749}
2750
2751bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2752 if (isBitAligned(Alignment, VT.getSizeInBits()))
2753 return true;
2754 switch (VT.getSizeInBits()) {
2755 default:
2756 // 8-byte and under are always assumed to be fast.
2757 return true;
2758 case 128:
2759 return !Subtarget.isUnalignedMem16Slow();
2760 case 256:
2761 return !Subtarget.isUnalignedMem32Slow();
2762 // TODO: What about AVX-512 (512-bit) accesses?
2763 }
2764}
2765
2766bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2767 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2768 unsigned *Fast) const {
2769 if (Fast)
2770 *Fast = isMemoryAccessFast(VT, Alignment);
2771 // NonTemporal vector memory ops must be aligned.
2772 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2773 // NT loads can only be vector aligned, so if it's less aligned than the
2774 // minimum vector size (which we can split the vector down to), we might as
2775 // well use a regular unaligned vector load.
2776 // We don't have any NT loads pre-SSE41.
2777 if (!!(Flags & MachineMemOperand::MOLoad))
2778 return (Alignment < 16 || !Subtarget.hasSSE41());
2779 return false;
2780 }
2781 // Misaligned accesses of any size are always allowed.
2782 return true;
2783}
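// Illustrative note (not part of the upstream source): the net effect above
// is that misaligned non-temporal vector stores are never allowed, and
// misaligned NT loads are "allowed" only when they will be lowered as
// ordinary unaligned loads anyway (alignment below 16 bytes or no SSE4.1).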
2784
2785bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2786 const DataLayout &DL, EVT VT,
2787 unsigned AddrSpace, Align Alignment,
2788 MachineMemOperand::Flags Flags,
2789 unsigned *Fast) const {
2790 if (Fast)
2791 *Fast = isMemoryAccessFast(VT, Alignment);
2792 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2793 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2794 /*Fast=*/nullptr))
2795 return true;
2796 // NonTemporal vector memory ops are special, and must be aligned.
2797 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2798 return false;
2799 switch (VT.getSizeInBits()) {
2800 case 128:
2801 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2802 return true;
2803 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2804 return true;
2805 return false;
2806 case 256:
2807 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2808 return true;
2809 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2810 return true;
2811 return false;
2812 case 512:
2813 if (Subtarget.hasAVX512())
2814 return true;
2815 return false;
2816 default:
2817 return false; // Don't have NonTemporal vector memory ops of this size.
2818 }
2819 }
2820 return true;
2821}
2822
2823/// Return the entry encoding for a jump table in the
2824/// current function. The returned value is a member of the
2825/// MachineJumpTableInfo::JTEntryKind enum.
2826unsigned X86TargetLowering::getJumpTableEncoding() const {
2827 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2828 // symbol.
2829 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2830 return MachineJumpTableInfo::EK_Custom32;
2831
2832 // Otherwise, use the normal jump table encoding heuristics.
2833 return TargetLowering::getJumpTableEncoding();
2834}
2835
2836bool X86TargetLowering::splitValueIntoRegisterParts(
2837 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2838 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2839 bool IsABIRegCopy = CC.has_value();
2840 EVT ValueVT = Val.getValueType();
2841 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2842 unsigned ValueBits = ValueVT.getSizeInBits();
2843 unsigned PartBits = PartVT.getSizeInBits();
2844 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2845 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2846 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2847 Parts[0] = Val;
2848 return true;
2849 }
2850 return false;
2851}
2852
2853SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2854 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2855 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2856 bool IsABIRegCopy = CC.has_value();
2857 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2858 unsigned ValueBits = ValueVT.getSizeInBits();
2859 unsigned PartBits = PartVT.getSizeInBits();
2860 SDValue Val = Parts[0];
2861
2862 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2863 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2864 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2865 return Val;
2866 }
2867 return SDValue();
2868}
2869
2870bool X86TargetLowering::useSoftFloat() const {
2871 return Subtarget.useSoftFloat();
2872}
2873
2874void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2875 ArgListTy &Args) const {
2876
2877 // Only relabel X86-32 for C / Stdcall CCs.
2878 if (Subtarget.is64Bit())
2879 return;
2880 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2881 return;
2882 unsigned ParamRegs = 0;
2883 if (auto *M = MF->getFunction().getParent())
2884 ParamRegs = M->getNumberRegisterParameters();
2885
2886 // Mark the first N integer arguments as being passed in registers.
2887 for (auto &Arg : Args) {
2888 Type *T = Arg.Ty;
2889 if (T->isIntOrPtrTy())
2890 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2891 unsigned numRegs = 1;
2892 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2893 numRegs = 2;
2894 if (ParamRegs < numRegs)
2895 return;
2896 ParamRegs -= numRegs;
2897 Arg.IsInReg = true;
2898 }
2899 }
2900}
2901
2902const MCExpr *
2903X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2904 const MachineBasicBlock *MBB,
2905 unsigned uid,MCContext &Ctx) const{
2906 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2907 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2908 // entries.
2909 return MCSymbolRefExpr::create(MBB->getSymbol(),
2910 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2911}
2912
2913/// Returns relocation base for the given PIC jumptable.
2914SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2915 SelectionDAG &DAG) const {
2916 if (!Subtarget.is64Bit())
2917 // This doesn't have SDLoc associated with it, but is not really the
2918 // same as a Register.
2919 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2920 getPointerTy(DAG.getDataLayout()));
2921 return Table;
2922}
2923
2924/// This returns the relocation base for the given PIC jumptable,
2925/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2926const MCExpr *X86TargetLowering::
2927getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2928 MCContext &Ctx) const {
2929 // X86-64 uses RIP relative addressing based on the jump table label.
2930 if (Subtarget.isPICStyleRIPRel())
2931 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2932
2933 // Otherwise, the reference is relative to the PIC base.
2934 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2935}
2936
2937std::pair<const TargetRegisterClass *, uint8_t>
2938X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2939 MVT VT) const {
2940 const TargetRegisterClass *RRC = nullptr;
2941 uint8_t Cost = 1;
2942 switch (VT.SimpleTy) {
2943 default:
2944 return TargetLowering::findRepresentativeClass(TRI, VT);
2945 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2946 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2947 break;
2948 case MVT::x86mmx:
2949 RRC = &X86::VR64RegClass;
2950 break;
2951 case MVT::f32: case MVT::f64:
2952 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2953 case MVT::v4f32: case MVT::v2f64:
2954 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2955 case MVT::v8f32: case MVT::v4f64:
2956 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2957 case MVT::v16f32: case MVT::v8f64:
2958 RRC = &X86::VR128XRegClass;
2959 break;
2960 }
2961 return std::make_pair(RRC, Cost);
2962}
2963
2964unsigned X86TargetLowering::getAddressSpace() const {
2965 if (Subtarget.is64Bit())
2966 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2967 return 256;
2968}
2969
2970static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2971 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2972 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2973}
2974
2975static Constant* SegmentOffset(IRBuilderBase &IRB,
2976 int Offset, unsigned AddressSpace) {
2977 return ConstantExpr::getIntToPtr(
2978 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2979 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2980}
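// Illustrative note (not part of the upstream source): SegmentOffset(IRB,
// 0x28, 257) produces the constant "inttoptr (i32 40 to i8* addrspace(257)*)",
// i.e. a pointer-sized slot at %fs:0x28, since address spaces 256 and 257
// correspond to %gs and %fs on x86.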
2981
2982Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2983 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2984 // tcbhead_t; use it instead of the usual global variable (see
2985 // sysdeps/{i386,x86_64}/nptl/tls.h)
2986 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2987 if (Subtarget.isTargetFuchsia()) {
2988 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2989 return SegmentOffset(IRB, 0x10, getAddressSpace());
2990 } else {
2991 unsigned AddressSpace = getAddressSpace();
2992 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2993 // Note that some users may customize the base register and offset.
2994 int Offset = M->getStackProtectorGuardOffset();
2995 // If -stack-protector-guard-offset was not specified, the default is
2996 // %fs:0x28, unless we're using a Kernel code model, in which case
2997 // it's %gs:0x28. On i386 it's %gs:0x14.
2998 if (Offset == INT_MAX)
2999 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3000
3001 StringRef GuardReg = M->getStackProtectorGuardReg();
3002 if (GuardReg == "fs")
3003 AddressSpace = X86AS::FS;
3004 else if (GuardReg == "gs")
3005 AddressSpace = X86AS::GS;
3006
3007 // Use the guard symbol if the user specified one.
3008 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3009 if (!GuardSymb.empty()) {
3010 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3011 if (!GV) {
3012 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3013 : Type::getInt32Ty(M->getContext());
3014 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3015 nullptr, GuardSymb, nullptr,
3016 GlobalValue::NotThreadLocal, AddressSpace);
3017 }
3018 return GV;
3019 }
3020
3021 return SegmentOffset(IRB, Offset, AddressSpace);
3022 }
3023 }
3024 return TargetLowering::getIRStackGuard(IRB);
3025}
3026
3027void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3028 // MSVC CRT provides functionalities for stack protection.
3029 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3030 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3031 // MSVC CRT has a global variable holding security cookie.
3032 M.getOrInsertGlobal("__security_cookie",
3033 Type::getInt8PtrTy(M.getContext()));
3034
3035 // MSVC CRT has a function to validate security cookie.
3036 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3037 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3038 Type::getInt8PtrTy(M.getContext()));
3039 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3040 F->setCallingConv(CallingConv::X86_FastCall);
3041 F->addParamAttr(0, Attribute::AttrKind::InReg);
3042 }
3043 return;
3044 }
3045
3046 StringRef GuardMode = M.getStackProtectorGuard();
3047
3048 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3049 if ((GuardMode == "tls" || GuardMode.empty()) &&
3050 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3051 return;
3052 TargetLowering::insertSSPDeclarations(M);
3053}
3054
3055Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3056 // MSVC CRT has a global variable holding security cookie.
3057 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3058 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3059 return M.getGlobalVariable("__security_cookie");
3060 }
3061 return TargetLowering::getSDagStackGuard(M);
3062}
3063
3064Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3065 // MSVC CRT has a function to validate security cookie.
3066 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3067 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3068 return M.getFunction("__security_check_cookie");
3069 }
3070 return TargetLowering::getSSPStackGuardCheck(M);
3071}
3072
3073Value *
3074X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3075 if (Subtarget.getTargetTriple().isOSContiki())
3076 return getDefaultSafeStackPointerLocation(IRB, false);
3077
3078 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3079 // definition of TLS_SLOT_SAFESTACK in
3080 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3081 if (Subtarget.isTargetAndroid()) {
3082 // %fs:0x48, unless we're using a Kernel code model, in which case it's
3083 // %gs:0x48. On i386 it's %gs:0x24.
3084 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3085 return SegmentOffset(IRB, Offset, getAddressSpace());
3086 }
3087
3088 // Fuchsia is similar.
3089 if (Subtarget.isTargetFuchsia()) {
3090 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3091 return SegmentOffset(IRB, 0x18, getAddressSpace());
3092 }
3093
3094 return TargetLowering::getSafeStackPointerLocation(IRB);
3095}
3096
3097//===----------------------------------------------------------------------===//
3098// Return Value Calling Convention Implementation
3099//===----------------------------------------------------------------------===//
3100
3101bool X86TargetLowering::CanLowerReturn(
3102 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3103 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3104 SmallVector<CCValAssign, 16> RVLocs;
3105 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3106 return CCInfo.CheckReturn(Outs, RetCC_X86);
3107}
3108
3109const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3110 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3111 return ScratchRegs;
3112}
3113
3114/// Lowers mask values (v*i1) to the local register values.
3115/// \returns the DAG node after lowering to the register type
3116static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3117 const SDLoc &Dl, SelectionDAG &DAG) {
3118 EVT ValVT = ValArg.getValueType();
3119
3120 if (ValVT == MVT::v1i1)
3121 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3122 DAG.getIntPtrConstant(0, Dl));
3123
3124 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3125 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3126 // Two stage lowering might be required
3127 // bitcast: v8i1 -> i8 / v16i1 -> i16
3128 // anyextend: i8 -> i32 / i16 -> i32
3129 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3130 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3131 if (ValLoc == MVT::i32)
3132 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3133 return ValToCopy;
3134 }
3135
3136 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3137 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3138 // One stage lowering is required
3139 // bitcast: v32i1 -> i32 / v64i1 -> i64
3140 return DAG.getBitcast(ValLoc, ValArg);
3141 }
3142
3143 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3144}
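// Illustrative note (not part of the upstream source): e.g. a v16i1 mask
// returned in an i32 location goes through bitcast v16i1 -> i16 followed by
// any_extend i16 -> i32, while a v32i1 mask in an i32 location is a single
// bitcast.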
3145
3146/// Breaks v64i1 value into two registers and adds the new node to the DAG
3147static void Passv64i1ArgInRegs(
3148 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3149 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3150 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3151 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3152 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3153 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3154 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3155 "The value should reside in two registers");
3156
3157 // Before splitting the value we cast it to i64
3158 Arg = DAG.getBitcast(MVT::i64, Arg);
3159
3160 // Splitting the value into two i32 types
3161 SDValue Lo, Hi;
3162 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
3163 DAG.getConstant(0, Dl, MVT::i32));
3164 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
3165 DAG.getConstant(1, Dl, MVT::i32));
3166
3167 // Attach the two i32 values to the corresponding registers
3168 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3169 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3170}
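// Illustrative note (not part of the upstream source): on a 32-bit AVX-512BW
// target this bitcasts the v64i1 argument to i64, extracts the lo/hi i32
// halves with EXTRACT_ELEMENT, and pairs them with the two consecutive
// registers assigned by the calling convention.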
3171
3172SDValue
3173X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3174 bool isVarArg,
3175 const SmallVectorImpl<ISD::OutputArg> &Outs,
3176 const SmallVectorImpl<SDValue> &OutVals,
3177 const SDLoc &dl, SelectionDAG &DAG) const {
3178 MachineFunction &MF = DAG.getMachineFunction();
3179 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3180
3181 // In some cases we need to disable registers from the default CSR list.
3182 // For example, when they are used for argument passing.
3183 bool ShouldDisableCalleeSavedRegister =
3184 shouldDisableCalleeSavedRegisterCC(CallConv) ||
3185 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3186
3187 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3188 report_fatal_error("X86 interrupts may not return any value");
3189
3190 SmallVector<CCValAssign, 16> RVLocs;
3191 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3192 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3193
3194 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3195 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3196 ++I, ++OutsIndex) {
3197 CCValAssign &VA = RVLocs[I];
3198 assert(VA.isRegLoc() && "Can only return in registers!");
3199
3200 // Add the register to the CalleeSaveDisableRegs list.
3201 if (ShouldDisableCalleeSavedRegister)
3202 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3203
3204 SDValue ValToCopy = OutVals[OutsIndex];
3205 EVT ValVT = ValToCopy.getValueType();
3206
3207 // Promote values to the appropriate types.
3208 if (VA.getLocInfo() == CCValAssign::SExt)
3209 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3210 else if (VA.getLocInfo() == CCValAssign::ZExt)
3211 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3212 else if (VA.getLocInfo() == CCValAssign::AExt) {
3213 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3214 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3215 else
3216 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3217 }
3218 else if (VA.getLocInfo() == CCValAssign::BCvt)
3219 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3220
3221 assert(VA.getLocInfo() != CCValAssign::FPExt &&
3222 "Unexpected FP-extend for return value.");
3223
3224 // Report an error if we have attempted to return a value via an XMM
3225 // register and SSE was disabled.
3226 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3227 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3228 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3229 } else if (!Subtarget.hasSSE2() &&
3230 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3231 ValVT == MVT::f64) {
3232 // When returning a double via an XMM register, report an error if SSE2 is
3233 // not enabled.
3234 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3235 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3236 }
3237
3238 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3239 // the RET instruction and handled by the FP Stackifier.
3240 if (VA.getLocReg() == X86::FP0 ||
3241 VA.getLocReg() == X86::FP1) {
3242 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3243 // change the value to the FP stack register class.
3244 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3245 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3246 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3247 // Don't emit a copytoreg.
3248 continue;
3249 }
3250
3251 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3252 // which is returned in RAX / RDX.
3253 if (Subtarget.is64Bit()) {
3254 if (ValVT == MVT::x86mmx) {
3255 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3256 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3257 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3258 ValToCopy);
3259 // If we don't have SSE2 available, convert to v4f32 so the generated
3260 // register is legal.
3261 if (!Subtarget.hasSSE2())
3262 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3263 }
3264 }
3265 }
3266
3267 if (VA.needsCustom()) {
3268 assert(VA.getValVT() == MVT::v64i1 &&
3269 "Currently the only custom case is when we split v64i1 to 2 regs");
3270
3271 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3272 Subtarget);
3273
3274 // Add the second register to the CalleeSaveDisableRegs list.
3275 if (ShouldDisableCalleeSavedRegister)
3276 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3277 } else {
3278 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3279 }
3280 }
3281
3282 SDValue Flag;
3283 SmallVector<SDValue, 6> RetOps;
3284 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3285 // Operand #1 = Bytes To Pop
3286 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3287 MVT::i32));
3288
3289 // Copy the result values into the output registers.
3290 for (auto &RetVal : RetVals) {
3291 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3292 RetOps.push_back(RetVal.second);
3293 continue; // Don't emit a copytoreg.
3294 }
3295
3296 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3297 Flag = Chain.getValue(1);
3298 RetOps.push_back(
3299 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3300 }
3301
3302 // The Swift calling convention does not require that we copy the sret argument
3303 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3304
3305 // All x86 ABIs require that for returning structs by value we copy
3306 // the sret argument into %rax/%eax (depending on ABI) for the return.
3307 // We saved the argument into a virtual register in the entry block,
3308 // so now we copy the value out and into %rax/%eax.
3309 //
3310 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3311 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3312 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3313 // either case FuncInfo->setSRetReturnReg() will have been called.
3314 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3315 // When we have both sret and another return value, we should use the
3316 // original Chain stored in RetOps[0], instead of the current Chain updated
3317 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3318
3319 // For the case of sret and another return value, we have
3320 // Chain_0 at the function entry
3321 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3322 // If we use Chain_1 in getCopyFromReg, we will have
3323 // Val = getCopyFromReg(Chain_1)
3324 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3325
3326 // getCopyToReg(Chain_0) will be glued together with
3327 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3328 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3329 // Data dependency from Unit B to Unit A due to usage of Val in
3330 // getCopyToReg(Chain_1, Val)
3331 // Chain dependency from Unit A to Unit B
3332
3333 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3334 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3335 getPointerTy(MF.getDataLayout()));
3336
3337 Register RetValReg
3338 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3339 X86::RAX : X86::EAX;
3340 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3341 Flag = Chain.getValue(1);
3342
3343 // RAX/EAX now acts like a return value.
3344 RetOps.push_back(
3345 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3346
3347 // Add the returned register to the CalleeSaveDisableRegs list.
3348 if (ShouldDisableCalleeSavedRegister)
3349 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3350 }
3351
3352 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3353 const MCPhysReg *I =
3354 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3355 if (I) {
3356 for (; *I; ++I) {
3357 if (X86::GR64RegClass.contains(*I))
3358 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3359 else
3360 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3361 }
3362 }
3363
3364 RetOps[0] = Chain; // Update chain.
3365
3366 // Add the flag if we have it.
3367 if (Flag.getNode())
3368 RetOps.push_back(Flag);
3369
3370 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3371 if (CallConv == CallingConv::X86_INTR)
3372 opcode = X86ISD::IRET;
3373 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3374}
3375
3376bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3377 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3378 return false;
3379
3380 SDValue TCChain = Chain;
3381 SDNode *Copy = *N->use_begin();
3382 if (Copy->getOpcode() == ISD::CopyToReg) {
3383 // If the copy has a glue operand, we conservatively assume it isn't safe to
3384 // perform a tail call.
3385 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3386 return false;
3387 TCChain = Copy->getOperand(0);
3388 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3389 return false;
3390
3391 bool HasRet = false;
3392 for (const SDNode *U : Copy->uses()) {
3393 if (U->getOpcode() != X86ISD::RET_FLAG)
3394 return false;
3395 // If we are returning more than one value, we can definitely
3396 // not make a tail call; see PR19530.
3397 if (U->getNumOperands() > 4)
3398 return false;
3399 if (U->getNumOperands() == 4 &&
3400 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3401 return false;
3402 HasRet = true;
3403 }
3404
3405 if (!HasRet)
3406 return false;
3407
3408 Chain = TCChain;
3409 return true;
3410}
3411
3412EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3413 ISD::NodeType ExtendKind) const {
3414 MVT ReturnMVT = MVT::i32;
3415
3416 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3417 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3418 // The ABI does not require i1, i8 or i16 to be extended.
3419 //
3420 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3421 // always extending i8/i16 return values, so keep doing that for now.
3422 // (PR26665).
3423 ReturnMVT = MVT::i8;
3424 }
3425
3426 EVT MinVT = getRegisterType(Context, ReturnMVT);
3427 return VT.bitsLT(MinVT) ? MinVT : VT;
3428}
3429
3430/// Reads two 32 bit registers and creates a 64 bit mask value.
3431/// \param VA The current 32 bit value that needs to be assigned.
3432/// \param NextVA The next 32 bit value that needs to be assigned.
3433/// \param Root The parent DAG node.
3434/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
3435/// glue purposes. If the DAG already uses a physical
3436/// register instead of a virtual one, we should glue our
3437/// new SDValue to the InFlag SDValue.
3438/// \return a new SDValue of 64-bit width.
3439static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3440 SDValue &Root, SelectionDAG &DAG,
3441 const SDLoc &Dl, const X86Subtarget &Subtarget,
3442 SDValue *InFlag = nullptr) {
3443 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3444 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3445 assert(VA.getValVT() == MVT::v64i1 &&
3446 "Expecting first location of 64 bit width type");
3447 assert(NextVA.getValVT() == VA.getValVT() &&
3448 "The locations should have the same type");
3449 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3450 "The values should reside in two registers");
3451
3452 SDValue Lo, Hi;
3453 SDValue ArgValueLo, ArgValueHi;
3454
3455 MachineFunction &MF = DAG.getMachineFunction();
3456 const TargetRegisterClass *RC = &X86::GR32RegClass;
3457
3458 // Read a 32 bit value from the registers.
3459 if (nullptr == InFlag) {
3460 // When no physical register is present,
3461 // create an intermediate virtual register.
3462 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3463 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3464 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3465 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3466 } else {
3467 // When a physical register is available read the value from it and glue
3468 // the reads together.
3469 ArgValueLo =
3470 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3471 *InFlag = ArgValueLo.getValue(2);
3472 ArgValueHi =
3473 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3474 *InFlag = ArgValueHi.getValue(2);
3475 }
3476
3477 // Convert the i32 type into v32i1 type.
3478 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3479
3480 // Convert the i32 type into v32i1 type.
3481 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3482
3483 // Concatenate the two values together.
3484 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3485}
3486
3487/// The function will lower a register of various sizes (8/16/32/64)
3488/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3489/// \returns a DAG node that contains the operand after lowering to the mask type.
3490static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3491 const EVT &ValLoc, const SDLoc &Dl,
3492 SelectionDAG &DAG) {
3493 SDValue ValReturned = ValArg;
3494
3495 if (ValVT == MVT::v1i1)
3496 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3497
3498 if (ValVT == MVT::v64i1) {
3499 // On a 32-bit machine this case is handled by getv64i1Argument.
3500 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3501 // On a 64-bit machine there is no need to truncate the value, only bitcast it.
3502 } else {
3503 MVT maskLen;
3504 switch (ValVT.getSimpleVT().SimpleTy) {
3505 case MVT::v8i1:
3506 maskLen = MVT::i8;
3507 break;
3508 case MVT::v16i1:
3509 maskLen = MVT::i16;
3510 break;
3511 case MVT::v32i1:
3512 maskLen = MVT::i32;
3513 break;
3514 default:
3515 llvm_unreachable("Expecting a vector of i1 types");
3516 }
3517
3518 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3519 }
3520 return DAG.getBitcast(ValVT, ValReturned);
3521}
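// Illustrative note (not part of the upstream source): this is the inverse of
// lowerMasksToReg, e.g. an i32 holding a v16i1 result is truncated to i16 and
// bitcast back to v16i1, while a v64i1 result arriving in i64 is just bitcast.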
3522
3523/// Lower the result values of a call into the
3524/// appropriate copies out of appropriate physical registers.
3525///
3526SDValue X86TargetLowering::LowerCallResult(
3527 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3528 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3529 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3530 uint32_t *RegMask) const {
3531
3532 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3533 // Assign locations to each value returned by this call.
3534 SmallVector<CCValAssign, 16> RVLocs;
3535 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3536 *DAG.getContext());
3537 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3538
3539 // Copy all of the result registers out of their specified physreg.
3540 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3541 ++I, ++InsIndex) {
3542 CCValAssign &VA = RVLocs[I];
3543 EVT CopyVT = VA.getLocVT();
3544
3545 // In some calling conventions we need to remove the used registers
3546 // from the register mask.
3547 if (RegMask) {
3548 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3549 SubRegs.isValid(); ++SubRegs)
3550 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3551 }
3552
3553 // Report an error if there was an attempt to return FP values via XMM
3554 // registers.
3555 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3556 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3557 if (VA.getLocReg() == X86::XMM1)
3558 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3559 else
3560 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3561 } else if (!Subtarget.hasSSE2() &&
3562 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3563 CopyVT == MVT::f64) {
3564 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3565 if (VA.getLocReg() == X86::XMM1)
3566 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3567 else
3568 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3569 }
3570
3571 // If we prefer to use the value in xmm registers, copy it out as f80 and
3572 // use a truncate to move it from fp stack reg to xmm reg.
3573 bool RoundAfterCopy = false;
3574 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3575 isScalarFPTypeInSSEReg(VA.getValVT())) {
3576 if (!Subtarget.hasX87())
3577 report_fatal_error("X87 register return with X87 disabled");
3578 CopyVT = MVT::f80;
3579 RoundAfterCopy = (CopyVT != VA.getLocVT());
3580 }
3581
3582 SDValue Val;
3583 if (VA.needsCustom()) {
3584 assert(VA.getValVT() == MVT::v64i1 &&
3585 "Currently the only custom case is when we split v64i1 to 2 regs");
3586 Val =
3587 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3588 } else {
3589 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3590 .getValue(1);
3591 Val = Chain.getValue(0);
3592 InFlag = Chain.getValue(2);
3593 }
3594
3595 if (RoundAfterCopy)
3596 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3597 // This truncation won't change the value.
3598 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3599
3600 if (VA.isExtInLoc()) {
3601 if (VA.getValVT().isVector() &&
3602 VA.getValVT().getScalarType() == MVT::i1 &&
3603 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3604 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3605 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3606 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3607 } else
3608 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3609 }
3610
3611 if (VA.getLocInfo() == CCValAssign::BCvt)
3612 Val = DAG.getBitcast(VA.getValVT(), Val);
3613
3614 InVals.push_back(Val);
3615 }
3616
3617 return Chain;
3618}
3619
3620//===----------------------------------------------------------------------===//
3621// C & StdCall & Fast Calling Convention implementation
3622//===----------------------------------------------------------------------===//
3623// The StdCall calling convention is standard for many Windows API
3624// routines. It differs from the C calling convention only slightly: the
3625// callee cleans up the stack rather than the caller, and symbols are
3626// decorated in a specific way. It doesn't support any vector arguments.
3627// For info on fast calling convention see Fast Calling Convention (tail call)
3628// implementation LowerX86_32FastCCCallTo.
3629
3630/// Determines whether Args, either a set of outgoing arguments to a call, or a
3631/// set of incoming args of a call, contains an sret pointer that the callee
3632/// pops
3633template <typename T>
3634static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3635 const X86Subtarget &Subtarget) {
3636 // Not C++20 (yet), so no concepts available.
3637 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3638 std::is_same_v<T, ISD::InputArg>,
3639 "requires ISD::OutputArg or ISD::InputArg");
3640
3641 // Only 32-bit targets pop the sret. It's a 64-bit world these days, so early-out
3642 // for most compilations.
3643 if (!Subtarget.is32Bit())
3644 return false;
3645
3646 if (Args.empty())
3647 return false;
3648
3649 // Most calls do not have an sret argument, check the arg next.
3650 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3651 if (!Flags.isSRet() || Flags.isInReg())
3652 return false;
3653
3654 // The MSVC ABI does not pop the sret.
3655 if (Subtarget.getTargetTriple().isOSMSVCRT())
3656 return false;
3657
3658 // MCUs don't pop the sret
3659 if (Subtarget.isTargetMCU())
3660 return false;
3661
3662 // Callee pops argument
3663 return true;
3664}
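// Illustrative sketch (not part of this file): the same callee-pops-sret rule
// restated over plain booleans; the parameter names are made up and stand in
// for the Subtarget and argument-flag queries above.
static bool calleePopsSRetSketch(bool Is32Bit, bool FirstArgIsSRet,
                                 bool SRetInReg, bool IsMSVCRT, bool IsMCU) {
  if (!Is32Bit)
    return false;                  // only 32-bit targets pop the sret
  if (!FirstArgIsSRet || SRetInReg)
    return false;                  // no sret, or sret passed in a register
  if (IsMSVCRT || IsMCU)
    return false;                  // MSVC runtime and MCU targets don't pop it
  return true;                     // otherwise the callee pops the pointer
}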
3665
3666/// Make a copy of an aggregate at address specified by "Src" to address
3667/// "Dst" with size and alignment information specified by the specific
3668/// parameter attribute. The copy will be passed as a byval function parameter.
3669static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3670 SDValue Chain, ISD::ArgFlagsTy Flags,
3671 SelectionDAG &DAG, const SDLoc &dl) {
3672 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3673
3674 return DAG.getMemcpy(
3675 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3676 /*isVolatile*/ false, /*AlwaysInline=*/true,
3677 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3678}
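// Illustrative sketch (not part of this file): at the C++ level, the memcpy
// created above is what gives a 'byval' pointer argument its pass-by-value
// semantics. Blob, Consume and Produce are made-up names for the example.
struct Blob { char Bytes[64]; };
int Consume(Blob B) { return B.Bytes[0]; } // B is typically lowered as a byval pointer
int Produce() {
  Blob B{};
  return Consume(B);                       // the callee sees its own copy of B
}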
3679
3680/// Return true if the calling convention is one that we can guarantee TCO for.
3681static bool canGuaranteeTCO(CallingConv::ID CC) {
3682 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3683 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3684 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3685 CC == CallingConv::SwiftTail);
3686}
3687
3688/// Return true if we might ever do TCO for calls with this calling convention.
3689static bool mayTailCallThisCC(CallingConv::ID CC) {
3690 switch (CC) {
3691 // C calling conventions:
3692 case CallingConv::C:
3693 case CallingConv::Win64:
3694 case CallingConv::X86_64_SysV:
3695 // Callee pop conventions:
3696 case CallingConv::X86_ThisCall:
3697 case CallingConv::X86_StdCall:
3698 case CallingConv::X86_VectorCall:
3699 case CallingConv::X86_FastCall:
3700 // Swift:
3701 case CallingConv::Swift:
3702 return true;
3703 default:
3704 return canGuaranteeTCO(CC);
3705 }
3706}
3707
3708/// Return true if the function is being made into a tailcall target by
3709/// changing its ABI.
3710static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3711 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3712 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3713}
3714
3715bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3716 if (!CI->isTailCall())
3717 return false;
3718
3719 CallingConv::ID CalleeCC = CI->getCallingConv();
3720 if (!mayTailCallThisCC(CalleeCC))
3721 return false;
3722
3723 return true;
3724}
3725
3726SDValue
3727X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3728 const SmallVectorImpl<ISD::InputArg> &Ins,
3729 const SDLoc &dl, SelectionDAG &DAG,
3730 const CCValAssign &VA,
3731 MachineFrameInfo &MFI, unsigned i) const {
3732 // Create the nodes corresponding to a load from this parameter slot.
3733 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3734 bool AlwaysUseMutable = shouldGuaranteeTCO(
3735 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3736 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3737 EVT ValVT;
3738 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3739
3740 // If the value is passed by pointer, the address is passed instead of the
3741 // value itself. No need to extend if the mask value and location share the
3742 // same absolute size.
3743 bool ExtendedInMem =
3744 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3745 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3746
3747 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3748 ValVT = VA.getLocVT();
3749 else
3750 ValVT = VA.getValVT();
3751
3752 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3753 // changed with more analysis.
3754 // In case of tail call optimization, mark all arguments mutable, since they
3755 // could be overwritten by the lowering of arguments for a tail call.
3756 if (Flags.isByVal()) {
3757 unsigned Bytes = Flags.getByValSize();
3758 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3759
3760 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3761 // can be improved with deeper analysis.
3762 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3763 /*isAliased=*/true);
3764 return DAG.getFrameIndex(FI, PtrVT);
3765 }
3766
3767 EVT ArgVT = Ins[i].ArgVT;
3768
3769 // If this is a vector that has been split into multiple parts, and the
3770 // scalar size of the parts doesn't match the vector element size, then we can't
3771 // elide the copy. The parts will have padding between them instead of being
3772 // packed like a vector.
3773 bool ScalarizedAndExtendedVector =
3774 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3775 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3776
3777 // This is an argument in memory. We might be able to perform copy elision.
3778 // If the argument is passed directly in memory without any extension, then we
3779 // can perform copy elision. Large vector types, for example, may be passed
3780 // indirectly by pointer.
3781 if (Flags.isCopyElisionCandidate() &&
3782 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3783 !ScalarizedAndExtendedVector) {
3784 SDValue PartAddr;
3785 if (Ins[i].PartOffset == 0) {
3786 // If this is a one-part value or the first part of a multi-part value,
3787 // create a stack object for the entire argument value type and return a
3788 // load from our portion of it. This assumes that if the first part of an
3789 // argument is in memory, the rest will also be in memory.
3790 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3791 /*IsImmutable=*/false);
3792 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3793 return DAG.getLoad(
3794 ValVT, dl, Chain, PartAddr,
3795 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3796 } else {
3797 // This is not the first piece of an argument in memory. See if there is
3798 // already a fixed stack object including this offset. If so, assume it
3799 // was created by the PartOffset == 0 branch above and create a load from
3800 // the appropriate offset into it.
3801 int64_t PartBegin = VA.getLocMemOffset();
3802 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3803 int FI = MFI.getObjectIndexBegin();
3804 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3805 int64_t ObjBegin = MFI.getObjectOffset(FI);
3806 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3807 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3808 break;
3809 }
3810 if (MFI.isFixedObjectIndex(FI)) {
3811 SDValue Addr =
3812 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3813 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3814 return DAG.getLoad(
3815 ValVT, dl, Chain, Addr,
3816 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3817 Ins[i].PartOffset));
3818 }
3819 }
3820 }
3821
3822 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3823 VA.getLocMemOffset(), isImmutable);
3824
3825 // Set SExt or ZExt flag.
3826 if (VA.getLocInfo() == CCValAssign::ZExt) {
3827 MFI.setObjectZExt(FI, true);
3828 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3829 MFI.setObjectSExt(FI, true);
3830 }
3831
3832 MaybeAlign Alignment;
3833 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3834 ValVT != MVT::f80)
3835 Alignment = MaybeAlign(4);
3836 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3837 SDValue Val = DAG.getLoad(
3838 ValVT, dl, Chain, FIN,
3839 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3840 Alignment);
3841 return ExtendedInMem
3842 ? (VA.getValVT().isVector()
3843 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3844 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3845 : Val;
3846}
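// Illustrative sketch (not part of this file): the PartOffset != 0 path above
// scans the fixed stack objects for one whose byte range already covers the
// part being loaded. The same containment search over plain (offset, size)
// pairs, with made-up names:
#include <cstdint>
#include <utility>
#include <vector>

static int findCoveringObject(
    const std::vector<std::pair<int64_t, int64_t>> &Objects, // (offset, size)
    int64_t PartBegin, int64_t PartEnd) {
  for (size_t I = 0; I != Objects.size(); ++I) {
    int64_t ObjBegin = Objects[I].first;
    int64_t ObjEnd = ObjBegin + Objects[I].second;
    if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
      return static_cast<int>(I); // found an object that contains the part
  }
  return -1;                      // none found; a new object must be created
}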
3847
3848// FIXME: Get this from tablegen.
3849static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3850 const X86Subtarget &Subtarget) {
3851 assert(Subtarget.is64Bit());
3852
3853 if (Subtarget.isCallingConvWin64(CallConv)) {
3854 static const MCPhysReg GPR64ArgRegsWin64[] = {
3855 X86::RCX, X86::RDX, X86::R8, X86::R9
3856 };
3857 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3858 }
3859
3860 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3861 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3862 };
3863 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3864}
3865
3866// FIXME: Get this from tablegen.
3867static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3868 CallingConv::ID CallConv,
3869 const X86Subtarget &Subtarget) {
3870 assert(Subtarget.is64Bit());
3871 if (Subtarget.isCallingConvWin64(CallConv)) {
3872 // The XMM registers which might contain var arg parameters are shadowed
3873 // in their paired GPR. So we only need to save the GPR to their home
3874 // slots.
3875 // TODO: __vectorcall will change this.
3876 return std::nullopt;
3877 }
3878
3879 bool isSoftFloat = Subtarget.useSoftFloat();
3880 if (isSoftFloat || !Subtarget.hasSSE1())
3881 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3882 // registers.
3883 return std::nullopt;
3884
3885 static const MCPhysReg XMMArgRegs64Bit[] = {
3886 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3887 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3888 };
3889 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3890}
3891
3892#ifndef NDEBUG
3893static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3894 return llvm::is_sorted(
3895 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3896 return A.getValNo() < B.getValNo();
3897 });
3898}
3899#endif
3900
3901namespace {
3902 /// This is a helper class for lowering variable argument parameters.
3903class VarArgsLoweringHelper {
3904public:
3905 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3906 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3907 CallingConv::ID CallConv, CCState &CCInfo)
3908 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3909 TheMachineFunction(DAG.getMachineFunction()),
3910 TheFunction(TheMachineFunction.getFunction()),
3911 FrameInfo(TheMachineFunction.getFrameInfo()),
3912 FrameLowering(*Subtarget.getFrameLowering()),
3913 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3914 CCInfo(CCInfo) {}
3915
3916 // Lower variable argument parameters.
3917 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3918
3919private:
3920 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3921
3922 void forwardMustTailParameters(SDValue &Chain);
3923
3924 bool is64Bit() const { return Subtarget.is64Bit(); }
3925 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3926
3927 X86MachineFunctionInfo *FuncInfo;
3928 const SDLoc &DL;
3929 SelectionDAG &DAG;
3930 const X86Subtarget &Subtarget;
3931 MachineFunction &TheMachineFunction;
3932 const Function &TheFunction;
3933 MachineFrameInfo &FrameInfo;
3934 const TargetFrameLowering &FrameLowering;
3935 const TargetLowering &TargLowering;
3936 CallingConv::ID CallConv;
3937 CCState &CCInfo;
3938};
3939} // namespace
3940
3941void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3942 SDValue &Chain, unsigned StackSize) {
3943 // If the function takes a variable number of arguments, make a frame index for
3944 // the start of the first vararg value... for expansion of llvm.va_start. We
3945 // can skip this if there are no va_start calls.
3946 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3947 CallConv != CallingConv::X86_ThisCall)) {
3948 FuncInfo->setVarArgsFrameIndex(
3949 FrameInfo.CreateFixedObject(1, StackSize, true));
3950 }
3951
3952 // 64-bit calling conventions support varargs and register parameters, so we
3953 // have to do extra work to spill them in the prologue.
3954 if (is64Bit()) {
3955 // Find the first unallocated argument registers.
3956 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3957 ArrayRef<MCPhysReg> ArgXMMs =
3958 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3959 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3960 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3961
3962 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3963        "SSE register cannot be used when SSE is disabled!");
3964
3965 if (isWin64()) {
3966 // Get to the caller-allocated home save location. Add 8 to account
3967 // for the return address.
3968 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3969 FuncInfo->setRegSaveFrameIndex(
3970 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3971 // Fixup to set vararg frame on shadow area (4 x i64).
3972 if (NumIntRegs < 4)
3973 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3974 } else {
3975 // For X86-64, if there are vararg parameters that are passed via
3976 // registers, then we must store them to their spots on the stack so
3977 // they may be loaded by dereferencing the result of va_next.
3978 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3979 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3980 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3981 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3982 }
3983
3984 SmallVector<SDValue, 6>
3985 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3986 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3987 // keeping live input value
3988 SDValue ALVal; // if applicable keeps SDValue for %al register
3989
3990 // Gather all the live in physical registers.
3991 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3992 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3993 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3994 }
3995 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3996 if (!AvailableXmms.empty()) {
3997 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3998 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3999 for (MCPhysReg Reg : AvailableXmms) {
4000 // FastRegisterAllocator spills virtual registers at basic
4001 // block boundaries. That leads to uses of xmm registers
4002 // outside of the check for %al. Pass physical registers to
4003 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4004 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4005 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4006 }
4007 }
4008
4009 // Store the integer parameter registers.
4010 SmallVector<SDValue, 8> MemOps;
4011 SDValue RSFIN =
4012 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4013 TargLowering.getPointerTy(DAG.getDataLayout()));
4014 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4015 for (SDValue Val : LiveGPRs) {
4016 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4017 TargLowering.getPointerTy(DAG.getDataLayout()),
4018 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4019 SDValue Store =
4020 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4021 MachinePointerInfo::getFixedStack(
4022 DAG.getMachineFunction(),
4023 FuncInfo->getRegSaveFrameIndex(), Offset));
4024 MemOps.push_back(Store);
4025 Offset += 8;
4026 }
4027
4028 // Now store the XMM (fp + vector) parameter registers.
4029 if (!LiveXMMRegs.empty()) {
4030 SmallVector<SDValue, 12> SaveXMMOps;
4031 SaveXMMOps.push_back(Chain);
4032 SaveXMMOps.push_back(ALVal);
4033 SaveXMMOps.push_back(RSFIN);
4034 SaveXMMOps.push_back(
4035 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4036 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4037 MachineMemOperand *StoreMMO =
4038 DAG.getMachineFunction().getMachineMemOperand(
4039 MachinePointerInfo::getFixedStack(
4040 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4041 Offset),
4042 MachineMemOperand::MOStore, 128, Align(16));
4043 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4044 DL, DAG.getVTList(MVT::Other),
4045 SaveXMMOps, MVT::i8, StoreMMO));
4046 }
4047
4048 if (!MemOps.empty())
4049 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4050 }
4051}
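// Illustrative sketch (not part of this file): the SysV x86-64 bookkeeping set
// up above, worked through with concrete numbers. For a call like
// printf(fmt, i, d) the named parameters consume 2 GPRs and 1 XMM register,
// so the va_list offsets and the register save area come out as follows
// (constants only, no LLVM types):
static void regSaveAreaExample() {
  const unsigned NumArgGPRs = 6, NumArgXMMs = 8; // RDI..R9, XMM0..XMM7
  const unsigned NumIntRegsUsed = 2, NumXMMRegsUsed = 1;

  unsigned GPOffset = NumIntRegsUsed * 8;                   // 16
  unsigned FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16; // 64
  unsigned SaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16; // 176 bytes
  (void)GPOffset; (void)FPOffset; (void)SaveAreaSize;
}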
4052
4053void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4054 // Find the largest legal vector type.
4055 MVT VecVT = MVT::Other;
4056 // FIXME: Only some x86_32 calling conventions support AVX512.
4057 if (Subtarget.useAVX512Regs() &&
4058 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4059 CallConv == CallingConv::Intel_OCL_BI)))
4060 VecVT = MVT::v16f32;
4061 else if (Subtarget.hasAVX())
4062 VecVT = MVT::v8f32;
4063 else if (Subtarget.hasSSE2())
4064 VecVT = MVT::v4f32;
4065
4066 // We forward some GPRs and some vector types.
4067 SmallVector<MVT, 2> RegParmTypes;
4068 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4069 RegParmTypes.push_back(IntVT);
4070 if (VecVT != MVT::Other)
4071 RegParmTypes.push_back(VecVT);
4072
4073 // Compute the set of forwarded registers. The rest are scratch.
4074 SmallVectorImpl<ForwardedRegister> &Forwards =
4075 FuncInfo->getForwardedMustTailRegParms();
4076 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4077
4078 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4079 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4080 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4081 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4082 }
4083
4084 // Copy all forwards from physical to virtual registers.
4085 for (ForwardedRegister &FR : Forwards) {
4086 // FIXME: Can we use a less constrained schedule?
4087 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4088 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4089 TargLowering.getRegClassFor(FR.VT));
4090 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4091 }
4092}
4093
4094void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4095 unsigned StackSize) {
4096 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4097 // If necessary, it will be set to the correct value later.
4098 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4099 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4100
4101 if (FrameInfo.hasVAStart())
4102 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4103
4104 if (FrameInfo.hasMustTailInVarArgFunc())
4105 forwardMustTailParameters(Chain);
4106}
4107
4108SDValue X86TargetLowering::LowerFormalArguments(
4109 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4110 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4111 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4112 MachineFunction &MF = DAG.getMachineFunction();
4113 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4114
4115 const Function &F = MF.getFunction();
4116 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4117 F.getName() == "main")
4118 FuncInfo->setForceFramePointer(true);
4119
4120 MachineFrameInfo &MFI = MF.getFrameInfo();
4121 bool Is64Bit = Subtarget.is64Bit();
4122 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4123
4124 assert(
4125     !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4126     "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4127
4128 // Assign locations to all of the incoming arguments.
4129 SmallVector<CCValAssign, 16> ArgLocs;
4130 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4131
4132 // Allocate shadow area for Win64.
4133 if (IsWin64)
4134 CCInfo.AllocateStack(32, Align(8));
4135
4136 CCInfo.AnalyzeArguments(Ins, CC_X86);
4137
4138 // In vectorcall calling convention a second pass is required for the HVA
4139 // types.
4140 if (CallingConv::X86_VectorCall == CallConv) {
4141 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4142 }
4143
4144 // The next loop assumes that the locations are in the same order as the
4145 // input arguments.
4146 assert(isSortedByValueNo(ArgLocs) &&
4147        "Argument Location list must be sorted before lowering");
4148
4149 SDValue ArgValue;
4150 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4151 ++I, ++InsIndex) {
4152 assert(InsIndex < Ins.size() && "Invalid Ins index");
4153 CCValAssign &VA = ArgLocs[I];
4154
4155 if (VA.isRegLoc()) {
4156 EVT RegVT = VA.getLocVT();
4157 if (VA.needsCustom()) {
4158 assert(
4159     VA.getValVT() == MVT::v64i1 &&
4160     "Currently the only custom case is when we split v64i1 to 2 regs");
4161
4162 // v64i1 values, in the regcall calling convention, that are
4163 // compiled for a 32-bit arch are split up into two registers.
4164 ArgValue =
4165 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4166 } else {
4167 const TargetRegisterClass *RC;
4168 if (RegVT == MVT::i8)
4169 RC = &X86::GR8RegClass;
4170 else if (RegVT == MVT::i16)
4171 RC = &X86::GR16RegClass;
4172 else if (RegVT == MVT::i32)
4173 RC = &X86::GR32RegClass;
4174 else if (Is64Bit && RegVT == MVT::i64)
4175 RC = &X86::GR64RegClass;
4176 else if (RegVT == MVT::f16)
4177 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4178 else if (RegVT == MVT::f32)
4179 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4180 else if (RegVT == MVT::f64)
4181 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4182 else if (RegVT == MVT::f80)
4183 RC = &X86::RFP80RegClass;
4184 else if (RegVT == MVT::f128)
4185 RC = &X86::VR128RegClass;
4186 else if (RegVT.is512BitVector())
4187 RC = &X86::VR512RegClass;
4188 else if (RegVT.is256BitVector())
4189 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4190 else if (RegVT.is128BitVector())
4191 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4192 else if (RegVT == MVT::x86mmx)
4193 RC = &X86::VR64RegClass;
4194 else if (RegVT == MVT::v1i1)
4195 RC = &X86::VK1RegClass;
4196 else if (RegVT == MVT::v8i1)
4197 RC = &X86::VK8RegClass;
4198 else if (RegVT == MVT::v16i1)
4199 RC = &X86::VK16RegClass;
4200 else if (RegVT == MVT::v32i1)
4201 RC = &X86::VK32RegClass;
4202 else if (RegVT == MVT::v64i1)
4203 RC = &X86::VK64RegClass;
4204 else
4205 llvm_unreachable("Unknown argument type!");
4206
4207 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4208 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4209 }
4210
4211 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4212 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4213 // right size.
4214 if (VA.getLocInfo() == CCValAssign::SExt)
4215 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4216 DAG.getValueType(VA.getValVT()));
4217 else if (VA.getLocInfo() == CCValAssign::ZExt)
4218 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4219 DAG.getValueType(VA.getValVT()));
4220 else if (VA.getLocInfo() == CCValAssign::BCvt)
4221 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4222
4223 if (VA.isExtInLoc()) {
4224 // Handle MMX values passed in XMM regs.
4225 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4226 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4227 else if (VA.getValVT().isVector() &&
4228 VA.getValVT().getScalarType() == MVT::i1 &&
4229 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4230 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4231 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4232 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4233 } else
4234 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4235 }
4236 } else {
4237 assert(VA.isMemLoc());
4238 ArgValue =
4239 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4240 }
4241
4242 // If the value is passed via a pointer, do a load.
4243 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4244 ArgValue =
4245 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4246
4247 InVals.push_back(ArgValue);
4248 }
4249
4250 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4251 if (Ins[I].Flags.isSwiftAsync()) {
4252 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4253 if (Subtarget.is64Bit())
4254 X86FI->setHasSwiftAsyncContext(true);
4255 else {
4256 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4257 X86FI->setSwiftAsyncContextFrameIdx(FI);
4258 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4259 DAG.getFrameIndex(FI, MVT::i32),
4260 MachinePointerInfo::getFixedStack(MF, FI));
4261 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4262 }
4263 }
4264
4265 // The Swift calling convention does not require us to copy the sret argument
4266 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4267 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4268 continue;
4269
4270 // All x86 ABIs require that for returning structs by value we copy the
4271 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4272 // the argument into a virtual register so that we can access it from the
4273 // return points.
4274 if (Ins[I].Flags.isSRet()) {
4275 assert(!FuncInfo->getSRetReturnReg() &&
4276        "SRet return has already been set");
4277 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4278 Register Reg =
4279 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4280 FuncInfo->setSRetReturnReg(Reg);
4281 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4282 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4283 break;
4284 }
4285 }
4286
4287 unsigned StackSize = CCInfo.getNextStackOffset();
4288 // Align stack specially for tail calls.
4289 if (shouldGuaranteeTCO(CallConv,
4290 MF.getTarget().Options.GuaranteedTailCallOpt))
4291 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4292
4293 if (IsVarArg)
4294 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4295 .lowerVarArgsParameters(Chain, StackSize);
4296
4297 // Some CCs need callee pop.
4298 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4299 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4300 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4301 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4302 // X86 interrupts must pop the error code (and the alignment padding) if
4303 // present.
4304 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4305 } else {
4306 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4307 // If this is an sret function, the return should pop the hidden pointer.
4308 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4309 FuncInfo->setBytesToPopOnReturn(4);
4310 }
4311
4312 if (!Is64Bit) {
4313 // RegSaveFrameIndex is X86-64 only.
4314 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4315 }
4316
4317 FuncInfo->setArgumentStackSize(StackSize);
4318
4319 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4320 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4321 if (Personality == EHPersonality::CoreCLR) {
4322 assert(Is64Bit);
4323 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4324 // that we'd prefer this slot be allocated towards the bottom of the frame
4325 // (i.e. near the stack pointer after allocating the frame). Every
4326 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4327 // offset from the bottom of this and each funclet's frame must be the
4328 // same, so the size of funclets' (mostly empty) frames is dictated by
4329 // how far this slot is from the bottom (since they allocate just enough
4330 // space to accommodate holding this slot at the correct offset).
4331 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4332 EHInfo->PSPSymFrameIdx = PSPSymFI;
4333 }
4334 }
4335
4336 if (shouldDisableCalleeSavedRegisterCC(CallConv) ||
4337 F.hasFnAttribute("no_caller_saved_registers")) {
4338 MachineRegisterInfo &MRI = MF.getRegInfo();
4339 for (std::pair<Register, Register> Pair : MRI.liveins())
4340 MRI.disableCalleeSavedRegister(Pair.first);
4341 }
4342
4343 return Chain;
4344}
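// Illustrative sketch (not part of this file): the SRetReturnReg handling above
// exists because returning an aggregate by value goes through a hidden sret
// pointer, and the x86 ABIs require that pointer to be handed back in %rax
// (or %eax on 32-bit). Big and MakeBig are made-up names.
struct Big { long Payload[8]; };
Big MakeBig() {
  // Lowered as: the caller passes a hidden sret pointer, the callee fills it
  // in and returns that same pointer in %rax/%eax.
  return Big{};
}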
4345
4346SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4347 SDValue Arg, const SDLoc &dl,
4348 SelectionDAG &DAG,
4349 const CCValAssign &VA,
4350 ISD::ArgFlagsTy Flags,
4351 bool isByVal) const {
4352 unsigned LocMemOffset = VA.getLocMemOffset();
4353 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4354 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4355 StackPtr, PtrOff);
4356 if (isByVal)
4357 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4358
4359 MaybeAlign Alignment;
4360 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4361 Arg.getSimpleValueType() != MVT::f80)
4362 Alignment = MaybeAlign(4);
4363 return DAG.getStore(
4364 Chain, dl, Arg, PtrOff,
4365 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4366 Alignment);
4367}
4368
4369/// Emit a load of return address if tail call
4370/// optimization is performed and it is required.
4371SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4372 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4373 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4374 // Adjust the Return address stack slot.
4375 EVT VT = getPointerTy(DAG.getDataLayout());
4376 OutRetAddr = getReturnAddressFrameIndex(DAG);
4377
4378 // Load the "old" Return address.
4379 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4380 return SDValue(OutRetAddr.getNode(), 1);
4381}
4382
4383/// Emit a store of the return address if tail call
4384/// optimization is performed and it is required (FPDiff!=0).
4385static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4386 SDValue Chain, SDValue RetAddrFrIdx,
4387 EVT PtrVT, unsigned SlotSize,
4388 int FPDiff, const SDLoc &dl) {
4389 // Store the return address to the appropriate stack slot.
4390 if (!FPDiff) return Chain;
4391 // Calculate the new stack slot for the return address.
4392 int NewReturnAddrFI =
4393 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4394 false);
4395 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4396 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4397 MachinePointerInfo::getFixedStack(
4398 DAG.getMachineFunction(), NewReturnAddrFI));
4399 return Chain;
4400}
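// Illustrative sketch (not part of this file): the FPDiff arithmetic used by
// the two helpers above, worked through with made-up numbers. If the caller
// reserved 16 bytes of incoming argument space but the guaranteed tail call
// needs 32 bytes, the return address slot has to move:
static void fpDiffExample() {
  int BytesCallerPushed = 16;                // X86Info->getBytesToPopOnReturn()
  int NumBytes = 32;                         // outgoing argument bytes
  int SlotSize = 8;                          // return address slot on x86-64
  int FPDiff = BytesCallerPushed - NumBytes; // -16
  int NewRetAddrOffset = FPDiff - SlotSize;  // -24, the new fixed-object offset
  (void)FPDiff; (void)NewRetAddrOffset;
}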
4401
4402/// Returns a vector_shuffle mask for an movs{s|d}, movd
4403/// operation of specified width.
4404static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4405 SDValue V2) {
4406 unsigned NumElems = VT.getVectorNumElements();
4407 SmallVector<int, 8> Mask;
4408 Mask.push_back(NumElems);
4409 for (unsigned i = 1; i != NumElems; ++i)
4410 Mask.push_back(i);
4411 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4412}
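// Illustrative sketch (not part of this file): the shuffle mask getMOVL builds,
// computed with plain ints. Lane 0 is taken from the second operand and lanes
// 1..N-1 from the first, which is the movss/movsd "merge one scalar into a
// vector" behaviour.
#include <vector>
static std::vector<int> movlMaskSketch(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(static_cast<int>(NumElems)); // element 0 of V2
  for (unsigned I = 1; I != NumElems; ++I)
    Mask.push_back(static_cast<int>(I));      // elements 1..N-1 of V1
  return Mask;                                // NumElems == 4 -> {4, 1, 2, 3}
}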
4413
4414SDValue
4415X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4416 SmallVectorImpl<SDValue> &InVals) const {
4417 SelectionDAG &DAG = CLI.DAG;
4418 SDLoc &dl = CLI.DL;
4419 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4420 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4421 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4422 SDValue Chain = CLI.Chain;
4423 SDValue Callee = CLI.Callee;
4424 CallingConv::ID CallConv = CLI.CallConv;
4425 bool &isTailCall = CLI.IsTailCall;
4426 bool isVarArg = CLI.IsVarArg;
4427 const auto *CB = CLI.CB;
4428
4429 MachineFunction &MF = DAG.getMachineFunction();
4430 bool Is64Bit = Subtarget.is64Bit();
4431 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4432 bool IsSibcall = false;
4433 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4434 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4435 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4436 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4437 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4438 CB->hasFnAttr("no_caller_saved_registers"));
4439 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4440 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4441 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4442 const Module *M = MF.getMMI().getModule();
4443 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4444
4445 MachineFunction::CallSiteInfo CSInfo;
4446 if (CallConv == CallingConv::X86_INTR)
4447 report_fatal_error("X86 interrupts may not be called directly");
4448
4449 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4450 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4451 // If we are using a GOT, disable tail calls to external symbols with
4452 // default visibility. Tail calling such a symbol requires using a GOT
4453 // relocation, which forces early binding of the symbol. This breaks code
4454 // that requires lazy function symbol resolution. Using musttail or
4455 // GuaranteedTailCallOpt will override this.
4456 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4457 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4458 G->getGlobal()->hasDefaultVisibility()))
4459 isTailCall = false;
4460 }
4461
4462 if (isTailCall && !IsMustTail) {
4463 // Check if it's really possible to do a tail call.
4464 isTailCall = IsEligibleForTailCallOptimization(
4465 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4466 Ins, DAG);
4467
4468 // Sibcalls are automatically detected tailcalls which do not require
4469 // ABI changes.
4470 if (!IsGuaranteeTCO && isTailCall)
4471 IsSibcall = true;
4472
4473 if (isTailCall)
4474 ++NumTailCalls;
4475 }
4476
4477 if (IsMustTail && !isTailCall)
4478 report_fatal_error("failed to perform tail call elimination on a call "
4479 "site marked musttail");
4480
4481 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4482        "Var args not supported with calling convention fastcc, ghc or hipe");
4483
4484 // Analyze operands of the call, assigning locations to each operand.
4485 SmallVector<CCValAssign, 16> ArgLocs;
4486 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4487
4488 // Allocate shadow area for Win64.
4489 if (IsWin64)
4490 CCInfo.AllocateStack(32, Align(8));
4491
4492 CCInfo.AnalyzeArguments(Outs, CC_X86);
4493
4494 // In vectorcall calling convention a second pass is required for the HVA
4495 // types.
4496 if (CallingConv::X86_VectorCall == CallConv) {
4497 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4498 }
4499
4500 // Get a count of how many bytes are to be pushed on the stack.
4501 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4502 if (IsSibcall)
4503 // This is a sibcall. The memory operands are available in caller's
4504 // own caller's stack.
4505 NumBytes = 0;
4506 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4507 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4508
4509 int FPDiff = 0;
4510 if (isTailCall &&
4511 shouldGuaranteeTCO(CallConv,
4512 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4513 // Lower arguments at fp - stackoffset + fpdiff.
4514 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4515
4516 FPDiff = NumBytesCallerPushed - NumBytes;
4517
4518 // Set the delta of movement of the returnaddr stackslot.
4519 // But only set if delta is greater than previous delta.
4520 if (FPDiff < X86Info->getTCReturnAddrDelta())
4521 X86Info->setTCReturnAddrDelta(FPDiff);
4522 }
4523
4524 unsigned NumBytesToPush = NumBytes;
4525 unsigned NumBytesToPop = NumBytes;
4526
4527 // If we have an inalloca argument, all stack space has already been allocated
4528 // for us and is right at the top of the stack. We don't support multiple
4529 // arguments passed in memory when using inalloca.
4530 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4531 NumBytesToPush = 0;
4532 if (!ArgLocs.back().isMemLoc())
4533 report_fatal_error("cannot use inalloca attribute on a register "
4534 "parameter");
4535 if (ArgLocs.back().getLocMemOffset() != 0)
4536 report_fatal_error("any parameter with the inalloca attribute must be "
4537 "the only memory argument");
4538 } else if (CLI.IsPreallocated) {
4539 assert(ArgLocs.back().isMemLoc() &&
4540        "cannot use preallocated attribute on a register "
4541        "parameter");
4542 SmallVector<size_t, 4> PreallocatedOffsets;
4543 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4544 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4545 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4546 }
4547 }
4548 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4549 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4550 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4551 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4552 NumBytesToPush = 0;
4553 }
4554
4555 if (!IsSibcall && !IsMustTail)
4556 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4557 NumBytes - NumBytesToPush, dl);
4558
4559 SDValue RetAddrFrIdx;
4560 // Load return address for tail calls.
4561 if (isTailCall && FPDiff)
4562 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4563 Is64Bit, FPDiff, dl);
4564
4565 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4566 SmallVector<SDValue, 8> MemOpChains;
4567 SDValue StackPtr;
4568
4569 // The next loop assumes that the locations are in the same order as the
4570 // input arguments.
4571 assert(isSortedByValueNo(ArgLocs) &&(static_cast <bool> (isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering") ? void
(0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4572, __extension__
__PRETTY_FUNCTION__))
4572 "Argument Location list must be sorted before lowering")(static_cast <bool> (isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering") ? void
(0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4572, __extension__
__PRETTY_FUNCTION__))
;
4573
4574 // Walk the register/memloc assignments, inserting copies/loads. In the case
4575 // of tail call optimization, arguments are handled later.
4576 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4577 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4578 ++I, ++OutIndex) {
4579 assert(OutIndex < Outs.size() && "Invalid Out index");
4580 // Skip inalloca/preallocated arguments, they have already been written.
4581 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4582 if (Flags.isInAlloca() || Flags.isPreallocated())
4583 continue;
4584
4585 CCValAssign &VA = ArgLocs[I];
4586 EVT RegVT = VA.getLocVT();
4587 SDValue Arg = OutVals[OutIndex];
4588 bool isByVal = Flags.isByVal();
4589
4590 // Promote the value if needed.
4591 switch (VA.getLocInfo()) {
4592 default: llvm_unreachable("Unknown loc info!");
4593 case CCValAssign::Full: break;
4594 case CCValAssign::SExt:
4595 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4596 break;
4597 case CCValAssign::ZExt:
4598 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4599 break;
4600 case CCValAssign::AExt:
4601 if (Arg.getValueType().isVector() &&
4602 Arg.getValueType().getVectorElementType() == MVT::i1)
4603 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4604 else if (RegVT.is128BitVector()) {
4605 // Special case: passing MMX values in XMM registers.
4606 Arg = DAG.getBitcast(MVT::i64, Arg);
4607 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4608 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4609 } else
4610 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4611 break;
4612 case CCValAssign::BCvt:
4613 Arg = DAG.getBitcast(RegVT, Arg);
4614 break;
4615 case CCValAssign::Indirect: {
4616 if (isByVal) {
4617 // Memcpy the argument to a temporary stack slot to prevent
4618 // the caller from seeing any modifications the callee may make
4619 // as guaranteed by the `byval` attribute.
4620 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4621 Flags.getByValSize(),
4622 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4623 SDValue StackSlot =
4624 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4625 Chain =
4626 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4627 // From now on treat this as a regular pointer
4628 Arg = StackSlot;
4629 isByVal = false;
4630 } else {
4631 // Store the argument.
4632 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4633 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4634 Chain = DAG.getStore(
4635 Chain, dl, Arg, SpillSlot,
4636 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4637 Arg = SpillSlot;
4638 }
4639 break;
4640 }
4641 }
4642
4643 if (VA.needsCustom()) {
4644 assert(VA.getValVT() == MVT::v64i1 &&
4645        "Currently the only custom case is when we split v64i1 to 2 regs");
4646 // Split v64i1 value into two registers
4647 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4648 } else if (VA.isRegLoc()) {
4649 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4650 const TargetOptions &Options = DAG.getTarget().Options;
4651 if (Options.EmitCallSiteInfo)
4652 CSInfo.emplace_back(VA.getLocReg(), I);
4653 if (isVarArg && IsWin64) {
4654 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4655 // shadow reg if callee is a varargs function.
4656 Register ShadowReg;
4657 switch (VA.getLocReg()) {
4658 case X86::XMM0: ShadowReg = X86::RCX; break;
4659 case X86::XMM1: ShadowReg = X86::RDX; break;
4660 case X86::XMM2: ShadowReg = X86::R8; break;
4661 case X86::XMM3: ShadowReg = X86::R9; break;
4662 }
4663 if (ShadowReg)
4664 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4665 }
4666 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4667 assert(VA.isMemLoc());
4668 if (!StackPtr.getNode())
4669 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4670 getPointerTy(DAG.getDataLayout()));
4671 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4672 dl, DAG, VA, Flags, isByVal));
4673 }
4674 }
4675
4676 if (!MemOpChains.empty())
4677 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4678
4679 if (Subtarget.isPICStyleGOT()) {
4680 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4681 // GOT pointer (except regcall).
4682 if (!isTailCall) {
4683 // An indirect call with the RegCall calling convention may use up all the
4684 // general registers, so it is not suitable to bind the EBX register for the
4685 // GOT address; just let the register allocator handle it.
4686 if (CallConv != CallingConv::X86_RegCall)
4687 RegsToPass.push_back(std::make_pair(
4688 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4689 getPointerTy(DAG.getDataLayout()))));
4690 } else {
4691 // If we are tail calling and generating PIC/GOT style code load the
4692 // address of the callee into ECX. The value in ecx is used as target of
4693 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4694 // for tail calls on PIC/GOT architectures. Normally we would just put the
4695 // address of GOT into ebx and then call target@PLT. But for tail calls
4696 // ebx would be restored (since ebx is callee saved) before jumping to the
4697 // target@PLT.
4698
4699 // Note: The actual moving to ECX is done further down.
4700 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4701 if (G && !G->getGlobal()->hasLocalLinkage() &&
4702 G->getGlobal()->hasDefaultVisibility())
4703 Callee = LowerGlobalAddress(Callee, DAG);
4704 else if (isa<ExternalSymbolSDNode>(Callee))
4705 Callee = LowerExternalSymbol(Callee, DAG);
4706 }
4707 }
4708
4709 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4710 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4711 // From AMD64 ABI document:
4712 // For calls that may call functions that use varargs or stdargs
4713 // (prototype-less calls or calls to functions containing ellipsis (...) in
4714 // the declaration) %al is used as hidden argument to specify the number
4715 // of SSE registers used. The contents of %al do not need to match exactly
4716 // the number of registers, but must be an upper bound on the number of SSE
4717 // registers used and be in the range 0 - 8 inclusive.
4718
4719 // Count the number of XMM registers allocated.
4720 static const MCPhysReg XMMArgRegs[] = {
4721 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4722 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4723 };
4724 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4725 assert((Subtarget.hasSSE1() || !NumXMMRegs) &&
4726        "SSE registers cannot be used when SSE is disabled");
4727 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4728 DAG.getConstant(NumXMMRegs, dl,
4729 MVT::i8)));
4730 }
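// Illustrative sketch (not part of this file, meant as its own small
// translation unit): the %al convention described above as seen from C++. For
// this variadic call the compiler passes 2.5 in %xmm0 and materializes an
// upper bound on the number of vector registers used (1 here) into %al right
// before the call -- the same value copied into X86::AL above.
#include <cstdio>
int main() {
  std::printf("%f\n", 2.5);
  return 0;
}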
4731
4732 if (isVarArg && IsMustTail) {
4733 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4734 for (const auto &F : Forwards) {
4735 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4736 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4737 }
4738 }
4739
4740 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4741 // don't need this because the eligibility check rejects calls that require
4742 // shuffling arguments passed in memory.
4743 if (!IsSibcall && isTailCall) {
4744 // Force all the incoming stack arguments to be loaded from the stack
4745 // before any new outgoing arguments are stored to the stack, because the
4746 // outgoing stack slots may alias the incoming argument stack slots, and
4747 // the alias isn't otherwise explicit. This is slightly more conservative
4748 // than necessary, because it means that each store effectively depends
4749 // on every argument instead of just those arguments it would clobber.
4750 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4751
4752 SmallVector<SDValue, 8> MemOpChains2;
4753 SDValue FIN;
4754 int FI = 0;
4755 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4756 ++I, ++OutsIndex) {
4757 CCValAssign &VA = ArgLocs[I];
4758
4759 if (VA.isRegLoc()) {
4760 if (VA.needsCustom()) {
4761 assert((CallConv == CallingConv::X86_RegCall) &&
4762        "Expecting custom case only in regcall calling convention");
4763 // This means that we are in special case where one argument was
4764 // passed through two register locations - Skip the next location
4765 ++I;
4766 }
4767
4768 continue;
4769 }
4770
4771 assert(VA.isMemLoc());
4772 SDValue Arg = OutVals[OutsIndex];
4773 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4774 // Skip inalloca/preallocated arguments. They don't require any work.
4775 if (Flags.isInAlloca() || Flags.isPreallocated())
4776 continue;
4777 // Create frame index.
4778 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4779 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4780 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4781 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4782
4783 if (Flags.isByVal()) {
4784 // Copy relative to framepointer.
4785 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4786 if (!StackPtr.getNode())
4787 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4788 getPointerTy(DAG.getDataLayout()));
4789 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4790 StackPtr, Source);
4791
4792 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4793 ArgChain,
4794 Flags, DAG, dl));
4795 } else {
4796 // Store relative to framepointer.
4797 MemOpChains2.push_back(DAG.getStore(
4798 ArgChain, dl, Arg, FIN,
4799 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4800 }
4801 }
4802
4803 if (!MemOpChains2.empty())
4804 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4805
4806 // Store the return address to the appropriate stack slot.
4807 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4808 getPointerTy(DAG.getDataLayout()),
4809 RegInfo->getSlotSize(), FPDiff, dl);
4810 }
4811
4812 // Build a sequence of copy-to-reg nodes chained together with token chain
4813 // and flag operands which copy the outgoing args into registers.
4814 SDValue InFlag;
4815 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4816 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4817 RegsToPass[i].second, InFlag);
4818 InFlag = Chain.getValue(1);
4819 }
4820
4821 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4822    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4823 // In the 64-bit large code model, we have to make all calls
4824 // through a register, since the call instruction's 32-bit
4825 // pc-relative offset may not be large enough to hold the whole
4826 // address.
4827 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4828 Callee->getOpcode() == ISD::ExternalSymbol) {
4829 // Lower direct calls to global addresses and external symbols. Setting
4830 // ForCall to true here has the effect of removing WrapperRIP when possible
4831 // to allow direct calls to be selected without first materializing the
4832 // address into a register.
4833 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4834 } else if (Subtarget.isTarget64BitILP32() &&
4835 Callee.getValueType() == MVT::i32) {
4836 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4837 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4838 }
4839
4840 // Returns a chain & a flag for retval copy to use.
4841 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4842 SmallVector<SDValue, 8> Ops;
4843
4844 if (!IsSibcall && isTailCall && !IsMustTail) {
4845 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InFlag, dl);
4846 InFlag = Chain.getValue(1);
4847 }
4848
4849 Ops.push_back(Chain);
4850 Ops.push_back(Callee);
4851
4852 if (isTailCall)
4853 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4854
4855 // Add argument registers to the end of the list so that they are known live
4856 // into the call.
4857 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4858 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4859 RegsToPass[i].second.getValueType()));
4860
4861 // Add a register mask operand representing the call-preserved registers.
4862 const uint32_t *Mask = [&]() {
4863 auto AdaptedCC = CallConv;
4864 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4865 // use X86_INTR calling convention because it has the same CSR mask
4866 // (same preserved registers).
4867 if (HasNCSR)
4868 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4869    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4870 // to use the CSR_NoRegs_RegMask.
4871 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4872 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4873 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4874 }();
4875  assert(Mask && "Missing call preserved mask for calling convention");
4876
4877 // If this is an invoke in a 32-bit function using a funclet-based
4878 // personality, assume the function clobbers all registers. If an exception
4879 // is thrown, the runtime will not restore CSRs.
4880 // FIXME: Model this more precisely so that we can register allocate across
4881 // the normal edge and spill and fill across the exceptional edge.
4882 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4883 const Function &CallerFn = MF.getFunction();
4884 EHPersonality Pers =
4885 CallerFn.hasPersonalityFn()
4886 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4887 : EHPersonality::Unknown;
4888 if (isFuncletEHPersonality(Pers))
4889 Mask = RegInfo->getNoPreservedMask();
4890 }
4891
4892 // Define a new register mask from the existing mask.
4893 uint32_t *RegMask = nullptr;
4894
4895 // In some calling conventions we need to remove the used physical registers
4896 // from the reg mask.
4897 if (shouldDisableCalleeSavedRegisterCC(CallConv) || HasNCSR) {
4898 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4899
4900 // Allocate a new Reg Mask and copy Mask.
4901 RegMask = MF.allocateRegMask();
4902 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4903 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4904
4905 // Make sure all sub registers of the argument registers are reset
4906 // in the RegMask.
4907 for (auto const &RegPair : RegsToPass)
4908 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4909 SubRegs.isValid(); ++SubRegs)
4910 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4911
4912 // Create the RegMask Operand according to our updated mask.
4913 Ops.push_back(DAG.getRegisterMask(RegMask));
4914 } else {
4915 // Create the RegMask Operand according to the static mask.
4916 Ops.push_back(DAG.getRegisterMask(Mask));
4917 }
4918
4919 if (InFlag.getNode())
4920 Ops.push_back(InFlag);
4921
4922 if (isTailCall) {
4923 // We used to do:
4924 //// If this is the first return lowered for this function, add the regs
4925 //// to the liveout set for the function.
4926 // This isn't right, although it's probably harmless on x86; liveouts
4927 // should be computed from returns not tail calls. Consider a void
4928 // function making a tail call to a function returning int.
4929 MF.getFrameInfo().setHasTailCall();
4930 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4931
4932 if (IsCFICall)
4933 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4934
4935 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4936 return Ret;
4937 }
4938
4939 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4940 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4941 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4942 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4943 // expanded to the call, directly followed by a special marker sequence and
4944 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4945    assert(!isTailCall &&
4946           "tail calls cannot be marked with clang.arc.attachedcall");
4947    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4948
4949 // Add a target global address for the retainRV/claimRV runtime function
4950 // just before the call target.
4951 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4952 auto PtrVT = getPointerTy(DAG.getDataLayout());
4953 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4954 Ops.insert(Ops.begin() + 1, GA);
4955 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4956 } else {
4957 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4958 }
4959
4960 if (IsCFICall)
4961 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4962
4963 InFlag = Chain.getValue(1);
4964 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4965 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4966
4967 // Save heapallocsite metadata.
4968 if (CLI.CB)
4969 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4970 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4971
4972 // Create the CALLSEQ_END node.
4973 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
4974 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4975 DAG.getTarget().Options.GuaranteedTailCallOpt))
4976 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4977 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
4978 // If this call passes a struct-return pointer, the callee
4979 // pops that struct pointer.
4980 NumBytesForCalleeToPop = 4;
4981
4982 // Returns a flag for retval copy to use.
4983 if (!IsSibcall) {
4984 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
4985 InFlag, dl);
4986 InFlag = Chain.getValue(1);
4987 }
4988
4989 // Handle result values, copying them out of physregs into vregs that we
4990 // return.
4991 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4992 InVals, RegMask);
4993}
4994
4995//===----------------------------------------------------------------------===//
4996// Fast Calling Convention (tail call) implementation
4997//===----------------------------------------------------------------------===//
4998
4999 // Like stdcall, the callee cleans up the arguments, except that ECX is
5000 // reserved for storing the address of the tail-called function. Only 2
5001 // registers are free for argument passing (inreg). Tail call optimization is
5002 // performed provided:
5003// * tailcallopt is enabled
5004// * caller/callee are fastcc
5005 // On the X86_64 architecture with GOT-style position-independent code, only
5006 // local (within-module) calls are supported at the moment.
5007 // To keep the stack aligned according to the platform ABI, the function
5008 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5009 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
5010 // If a tail-called callee has more arguments than the caller, the caller
5011 // needs to make sure that there is room to move the RETADDR to. This is
5012// achieved by reserving an area the size of the argument delta right after the
5013// original RETADDR, but before the saved framepointer or the spilled registers
5014// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5015// stack layout:
5016// arg1
5017// arg2
5018// RETADDR
5019// [ new RETADDR
5020// move area ]
5021// (possible EBP)
5022// ESI
5023// EDI
5024// local1 ..
5025
5026/// Align the stack size to e.g. 16n + 12, so that a 16-byte alignment
5027/// requirement is met once the return-address slot is accounted for.
5028unsigned
5029X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5030 SelectionDAG &DAG) const {
5031 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5032 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5033  assert(StackSize % SlotSize == 0 &&
5034         "StackSize must be a multiple of SlotSize");
5035 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5036}
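The computation above keeps the argument area congruent to -SlotSize modulo the stack alignment, so the stack is aligned again once the return address slot is pushed. The following is a minimal standalone sketch of that arithmetic, not part of the LLVM source; the helper names and the 4/8-byte slot sizes are illustrative assumptions.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors GetAlignedArgumentStackSize: the result plus one slot (RETADDR) is a
// multiple of the stack alignment.
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlign) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  return alignTo(StackSize + SlotSize, StackAlign) - SlotSize;
}

int main() {
  // With 4-byte slots and 16-byte alignment, 20 bytes of arguments become 28,
  // i.e. 16n + 12, so the stack is 16-byte aligned once RETADDR is pushed.
  std::printf("%llu\n", (unsigned long long)alignedArgStackSize(20, 4, 16)); // 28
  // With 8-byte slots, 40 bytes stay at 40, since 40 + 8 = 48 is 16-aligned.
  std::printf("%llu\n", (unsigned long long)alignedArgStackSize(40, 8, 16)); // 40
}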
5037
5038/// Return true if the given stack call argument is already available in the
5039/// same position (relatively) of the caller's incoming argument stack.
5040static
5041bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5042 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5043 const X86InstrInfo *TII, const CCValAssign &VA) {
5044 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5045
5046 for (;;) {
5047 // Look through nodes that don't alter the bits of the incoming value.
5048 unsigned Op = Arg.getOpcode();
5049 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5050 Arg = Arg.getOperand(0);
5051 continue;
5052 }
5053 if (Op == ISD::TRUNCATE) {
5054 const SDValue &TruncInput = Arg.getOperand(0);
5055 if (TruncInput.getOpcode() == ISD::AssertZext &&
5056 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5057 Arg.getValueType()) {
5058 Arg = TruncInput.getOperand(0);
5059 continue;
5060 }
5061 }
5062 break;
5063 }
5064
5065  int FI = INT_MAX;
5066 if (Arg.getOpcode() == ISD::CopyFromReg) {
5067 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5068 if (!VR.isVirtual())
5069 return false;
5070 MachineInstr *Def = MRI->getVRegDef(VR);
5071 if (!Def)
5072 return false;
5073 if (!Flags.isByVal()) {
5074 if (!TII->isLoadFromStackSlot(*Def, FI))
5075 return false;
5076 } else {
5077 unsigned Opcode = Def->getOpcode();
5078 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5079 Opcode == X86::LEA64_32r) &&
5080 Def->getOperand(1).isFI()) {
5081 FI = Def->getOperand(1).getIndex();
5082 Bytes = Flags.getByValSize();
5083 } else
5084 return false;
5085 }
5086 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5087 if (Flags.isByVal())
5088 // ByVal argument is passed in as a pointer but it's now being
5089 // dereferenced. e.g.
5090 // define @foo(%struct.X* %A) {
5091 // tail call @bar(%struct.X* byval %A)
5092 // }
5093 return false;
5094 SDValue Ptr = Ld->getBasePtr();
5095 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5096 if (!FINode)
5097 return false;
5098 FI = FINode->getIndex();
5099 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5100 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5101 FI = FINode->getIndex();
5102 Bytes = Flags.getByValSize();
5103 } else
5104 return false;
5105
5106  assert(FI != INT_MAX);
5107 if (!MFI.isFixedObjectIndex(FI))
5108 return false;
5109
5110 if (Offset != MFI.getObjectOffset(FI))
5111 return false;
5112
5113 // If this is not byval, check that the argument stack object is immutable.
5114 // inalloca and argument copy elision can create mutable argument stack
5115 // objects. Byval objects can be mutated, but a byval call intends to pass the
5116 // mutated memory.
5117 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5118 return false;
5119
5120 if (VA.getLocVT().getFixedSizeInBits() >
5121 Arg.getValueSizeInBits().getFixedValue()) {
5122 // If the argument location is wider than the argument type, check that any
5123 // extension flags match.
5124 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5125 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5126 return false;
5127 }
5128 }
5129
5130 return Bytes == MFI.getObjectSize(FI);
5131}
5132
5133/// Check whether the call is eligible for tail call optimization. Targets
5134/// that want to do tail call optimization should implement this function.
5135bool X86TargetLowering::IsEligibleForTailCallOptimization(
5136 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5137 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5138 const SmallVectorImpl<SDValue> &OutVals,
5139 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5140 if (!mayTailCallThisCC(CalleeCC))
5141 return false;
5142
5143 // If -tailcallopt is specified, make fastcc functions tail-callable.
5144 MachineFunction &MF = DAG.getMachineFunction();
5145 const Function &CallerF = MF.getFunction();
5146
5147 // If the function return type is x86_fp80 and the callee return type is not,
5148 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5149 // perform a tailcall optimization here.
5150 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5151 return false;
5152
5153 CallingConv::ID CallerCC = CallerF.getCallingConv();
5154 bool CCMatch = CallerCC == CalleeCC;
5155 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5156 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5157 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5158 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5159
5160 // Win64 functions have extra shadow space for argument homing. Don't do the
5161 // sibcall if the caller and callee have mismatched expectations for this
5162 // space.
5163 if (IsCalleeWin64 != IsCallerWin64)
5164 return false;
5165
5166 if (IsGuaranteeTCO) {
5167 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5168 return true;
5169 return false;
5170 }
5171
5172 // Look for obvious safe cases to perform tail call optimization that do not
5173 // require ABI changes. This is what gcc calls sibcall.
5174
5175 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5176 // emit a special epilogue.
5177 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5178 if (RegInfo->hasStackRealignment(MF))
5179 return false;
5180
5181 // Also avoid sibcall optimization if we're an sret return fn and the callee
5182 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5183 // insufficient.
5184 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5185 // For a compatible tail call the callee must return our sret pointer. So it
5186 // needs to be (a) an sret function itself and (b) we pass our sret as its
5187 // sret. Condition #b is harder to determine.
5188 return false;
5189 } else if (IsCalleePopSRet)
5190 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5191 // expect that.
5192 return false;
5193
5194 // Do not sibcall optimize vararg calls unless all arguments are passed via
5195 // registers.
5196 LLVMContext &C = *DAG.getContext();
5197 if (isVarArg && !Outs.empty()) {
5198 // Optimizing for varargs on Win64 is unlikely to be safe without
5199 // additional testing.
5200 if (IsCalleeWin64 || IsCallerWin64)
5201 return false;
5202
5203 SmallVector<CCValAssign, 16> ArgLocs;
5204 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5205
5206 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5207 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5208 if (!ArgLocs[i].isRegLoc())
5209 return false;
5210 }
5211
5212 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5213 // stack. Therefore, if it's not used by the call it is not safe to optimize
5214 // this into a sibcall.
5215 bool Unused = false;
5216 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5217 if (!Ins[i].Used) {
5218 Unused = true;
5219 break;
5220 }
5221 }
5222 if (Unused) {
5223 SmallVector<CCValAssign, 16> RVLocs;
5224 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5225 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5226 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5227 CCValAssign &VA = RVLocs[i];
5228 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5229 return false;
5230 }
5231 }
5232
5233 // Check that the call results are passed in the same way.
5234 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5235 RetCC_X86, RetCC_X86))
5236 return false;
5237 // The callee has to preserve all registers the caller needs to preserve.
5238 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5239 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5240 if (!CCMatch) {
5241 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5242 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5243 return false;
5244 }
5245
5246 unsigned StackArgsSize = 0;
5247
5248 // If the callee takes no arguments then go on to check the results of the
5249 // call.
5250 if (!Outs.empty()) {
5251 // Check if stack adjustment is needed. For now, do not do this if any
5252 // argument is passed on the stack.
5253 SmallVector<CCValAssign, 16> ArgLocs;
5254 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5255
5256 // Allocate shadow area for Win64
5257 if (IsCalleeWin64)
5258 CCInfo.AllocateStack(32, Align(8));
5259
5260 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5261 StackArgsSize = CCInfo.getNextStackOffset();
5262
5263 if (CCInfo.getNextStackOffset()) {
5264 // Check if the arguments are already laid out in the right way as
5265 // the caller's fixed stack objects.
5266 MachineFrameInfo &MFI = MF.getFrameInfo();
5267 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5268 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5269 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5270 CCValAssign &VA = ArgLocs[i];
5271 SDValue Arg = OutVals[i];
5272 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5273 if (VA.getLocInfo() == CCValAssign::Indirect)
5274 return false;
5275 if (!VA.isRegLoc()) {
5276 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5277 MFI, MRI, TII, VA))
5278 return false;
5279 }
5280 }
5281 }
5282
5283 bool PositionIndependent = isPositionIndependent();
5284 // If the tailcall address may be in a register, then make sure it's
5285 // possible to register allocate for it. In 32-bit, the call address can
5286 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5287 // callee-saved registers are restored. These happen to be the same
5288 // registers used to pass 'inreg' arguments so watch out for those.
5289 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5290 !isa<ExternalSymbolSDNode>(Callee)) ||
5291 PositionIndependent)) {
5292 unsigned NumInRegs = 0;
5293 // In PIC we need an extra register to formulate the address computation
5294 // for the callee.
5295 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5296
5297 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5298 CCValAssign &VA = ArgLocs[i];
5299 if (!VA.isRegLoc())
5300 continue;
5301 Register Reg = VA.getLocReg();
5302 switch (Reg) {
5303 default: break;
5304 case X86::EAX: case X86::EDX: case X86::ECX:
5305 if (++NumInRegs == MaxInRegs)
5306 return false;
5307 break;
5308 }
5309 }
5310 }
5311
5312 const MachineRegisterInfo &MRI = MF.getRegInfo();
5313 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5314 return false;
5315 }
5316
5317 bool CalleeWillPop =
5318 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5319 MF.getTarget().Options.GuaranteedTailCallOpt);
5320
5321 if (unsigned BytesToPop =
5322 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5323 // If we have bytes to pop, the callee must pop them.
5324 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5325 if (!CalleePopMatches)
5326 return false;
5327 } else if (CalleeWillPop && StackArgsSize > 0) {
5328 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5329 return false;
5330 }
5331
5332 return true;
5333}
5334
5335FastISel *
5336X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5337 const TargetLibraryInfo *libInfo) const {
5338 return X86::createFastISel(funcInfo, libInfo);
5339}
5340
5341//===----------------------------------------------------------------------===//
5342// Other Lowering Hooks
5343//===----------------------------------------------------------------------===//
5344
5345bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5346 bool AssumeSingleUse) {
5347 if (!AssumeSingleUse && !Op.hasOneUse())
5348 return false;
5349 if (!ISD::isNormalLoad(Op.getNode()))
5350 return false;
5351
5352 // If this is an unaligned vector, make sure the target supports folding it.
5353 auto *Ld = cast<LoadSDNode>(Op.getNode());
5354 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5355 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5356 return false;
5357
5358 // TODO: If this is a non-temporal load and the target has an instruction
5359 // for it, it should not be folded. See "useNonTemporalLoad()".
5360
5361 return true;
5362}
5363
5364bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5365 const X86Subtarget &Subtarget,
5366 bool AssumeSingleUse) {
5367  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5368 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5369 return false;
5370
5371  // We cannot replace a wide volatile load with a broadcast-from-memory,
5372 // because that would narrow the load, which isn't legal for volatiles.
5373 auto *Ld = cast<LoadSDNode>(Op.getNode());
5374 return !Ld->isVolatile() ||
5375 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5376}
5377
5378bool X86::mayFoldIntoStore(SDValue Op) {
5379 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5380}
5381
5382bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5383 if (Op.hasOneUse()) {
5384 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5385 return (ISD::ZERO_EXTEND == Opcode);
5386 }
5387 return false;
5388}
5389
5390static bool isTargetShuffle(unsigned Opcode) {
5391 switch(Opcode) {
5392 default: return false;
5393 case X86ISD::BLENDI:
5394 case X86ISD::PSHUFB:
5395 case X86ISD::PSHUFD:
5396 case X86ISD::PSHUFHW:
5397 case X86ISD::PSHUFLW:
5398 case X86ISD::SHUFP:
5399 case X86ISD::INSERTPS:
5400 case X86ISD::EXTRQI:
5401 case X86ISD::INSERTQI:
5402 case X86ISD::VALIGN:
5403 case X86ISD::PALIGNR:
5404 case X86ISD::VSHLDQ:
5405 case X86ISD::VSRLDQ:
5406 case X86ISD::MOVLHPS:
5407 case X86ISD::MOVHLPS:
5408 case X86ISD::MOVSHDUP:
5409 case X86ISD::MOVSLDUP:
5410 case X86ISD::MOVDDUP:
5411 case X86ISD::MOVSS:
5412 case X86ISD::MOVSD:
5413 case X86ISD::MOVSH:
5414 case X86ISD::UNPCKL:
5415 case X86ISD::UNPCKH:
5416 case X86ISD::VBROADCAST:
5417 case X86ISD::VPERMILPI:
5418 case X86ISD::VPERMILPV:
5419 case X86ISD::VPERM2X128:
5420 case X86ISD::SHUF128:
5421 case X86ISD::VPERMIL2:
5422 case X86ISD::VPERMI:
5423 case X86ISD::VPPERM:
5424 case X86ISD::VPERMV:
5425 case X86ISD::VPERMV3:
5426 case X86ISD::VZEXT_MOVL:
5427 return true;
5428 }
5429}
5430
5431static bool isTargetShuffleVariableMask(unsigned Opcode) {
5432 switch (Opcode) {
5433 default: return false;
5434 // Target Shuffles.
5435 case X86ISD::PSHUFB:
5436 case X86ISD::VPERMILPV:
5437 case X86ISD::VPERMIL2:
5438 case X86ISD::VPPERM:
5439 case X86ISD::VPERMV:
5440 case X86ISD::VPERMV3:
5441 return true;
5442 // 'Faux' Target Shuffles.
5443 case ISD::OR:
5444 case ISD::AND:
5445 case X86ISD::ANDNP:
5446 return true;
5447 }
5448}
5449
5450SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5451 MachineFunction &MF = DAG.getMachineFunction();
5452 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5453 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5454 int ReturnAddrIndex = FuncInfo->getRAIndex();
5455
5456 if (ReturnAddrIndex == 0) {
5457 // Set up a frame object for the return address.
5458 unsigned SlotSize = RegInfo->getSlotSize();
5459 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5460 -(int64_t)SlotSize,
5461 false);
5462 FuncInfo->setRAIndex(ReturnAddrIndex);
5463 }
5464
5465 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5466}
5467
5468bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5469 bool hasSymbolicDisplacement) {
5470 // Offset should fit into 32 bit immediate field.
5471 if (!isInt<32>(Offset))
5472 return false;
5473
5474 // If we don't have a symbolic displacement - we don't have any extra
5475 // restrictions.
5476 if (!hasSymbolicDisplacement)
5477 return true;
5478
5479 // FIXME: Some tweaks might be needed for medium code model.
5480 if (M != CodeModel::Small && M != CodeModel::Kernel)
5481 return false;
5482
5483  // For the small code model we assume that the latest object is 16MB before
5484  // the end of the 31-bit boundary. We may also accept pretty large negative
5485  // constants knowing that all objects are in the positive half of the address space.
5486 if (M == CodeModel::Small && Offset < 16*1024*1024)
5487 return true;
5488
5489  // For the kernel code model we know that all objects reside in the negative
5490  // half of the 32-bit address space. We may not accept negative offsets, since
5491  // they may be just off and we may accept pretty large positive ones.
5492 if (M == CodeModel::Kernel && Offset >= 0)
5493 return true;
5494
5495 return false;
5496}
5497
5498/// Determines whether the callee is required to pop its own arguments.
5499/// Callee pop is necessary to support tail calls.
5500bool X86::isCalleePop(CallingConv::ID CallingConv,
5501 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5502 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5503 // can guarantee TCO.
5504 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5505 return true;
5506
5507 switch (CallingConv) {
5508 default:
5509 return false;
5510 case CallingConv::X86_StdCall:
5511 case CallingConv::X86_FastCall:
5512 case CallingConv::X86_ThisCall:
5513 case CallingConv::X86_VectorCall:
5514 return !is64Bit;
5515 }
5516}
5517
5518/// Return true if the condition is a signed comparison operation.
5519static bool isX86CCSigned(unsigned X86CC) {
5520 switch (X86CC) {
5521 default:
5522 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5522)
;
5523 case X86::COND_E:
5524 case X86::COND_NE:
5525 case X86::COND_B:
5526 case X86::COND_A:
5527 case X86::COND_BE:
5528 case X86::COND_AE:
5529 return false;
5530 case X86::COND_G:
5531 case X86::COND_GE:
5532 case X86::COND_L:
5533 case X86::COND_LE:
5534 return true;
5535 }
5536}
5537
5538static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5539 switch (SetCCOpcode) {
5540  default: llvm_unreachable("Invalid integer condition!");
5541 case ISD::SETEQ: return X86::COND_E;
5542 case ISD::SETGT: return X86::COND_G;
5543 case ISD::SETGE: return X86::COND_GE;
5544 case ISD::SETLT: return X86::COND_L;
5545 case ISD::SETLE: return X86::COND_LE;
5546 case ISD::SETNE: return X86::COND_NE;
5547 case ISD::SETULT: return X86::COND_B;
5548 case ISD::SETUGT: return X86::COND_A;
5549 case ISD::SETULE: return X86::COND_BE;
5550 case ISD::SETUGE: return X86::COND_AE;
5551 }
5552}
5553
5554/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5555/// condition code, returning the condition code and the LHS/RHS of the
5556/// comparison to make.
5557static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5558 bool isFP, SDValue &LHS, SDValue &RHS,
5559 SelectionDAG &DAG) {
5560 if (!isFP) {
5561 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5562 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5563 // X > -1 -> X == 0, jump !sign.
5564 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5565 return X86::COND_NS;
5566 }
5567 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5568 // X < 0 -> X == 0, jump on sign.
5569 return X86::COND_S;
5570 }
5571 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5572 // X >= 0 -> X == 0, jump on !sign.
5573 return X86::COND_NS;
5574 }
5575 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5576 // X < 1 -> X <= 0
5577 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5578 return X86::COND_LE;
5579 }
5580 }
5581
5582 return TranslateIntegerX86CC(SetCCOpcode);
5583 }
5584
5585 // First determine if it is required or is profitable to flip the operands.
5586
5587 // If LHS is a foldable load, but RHS is not, flip the condition.
5588 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5589 !ISD::isNON_EXTLoad(RHS.getNode())) {
5590 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5591 std::swap(LHS, RHS);
5592 }
5593
5594 switch (SetCCOpcode) {
5595 default: break;
5596 case ISD::SETOLT:
5597 case ISD::SETOLE:
5598 case ISD::SETUGT:
5599 case ISD::SETUGE:
5600 std::swap(LHS, RHS);
5601 break;
5602 }
5603
5604 // On a floating point condition, the flags are set as follows:
5605 // ZF PF CF op
5606 // 0 | 0 | 0 | X > Y
5607 // 0 | 0 | 1 | X < Y
5608 // 1 | 0 | 0 | X == Y
5609 // 1 | 1 | 1 | unordered
5610 switch (SetCCOpcode) {
5611  default: llvm_unreachable("Condcode should be pre-legalized away");
5612 case ISD::SETUEQ:
5613 case ISD::SETEQ: return X86::COND_E;
5614 case ISD::SETOLT: // flipped
5615 case ISD::SETOGT:
5616 case ISD::SETGT: return X86::COND_A;
5617 case ISD::SETOLE: // flipped
5618 case ISD::SETOGE:
5619 case ISD::SETGE: return X86::COND_AE;
5620 case ISD::SETUGT: // flipped
5621 case ISD::SETULT:
5622 case ISD::SETLT: return X86::COND_B;
5623 case ISD::SETUGE: // flipped
5624 case ISD::SETULE:
5625 case ISD::SETLE: return X86::COND_BE;
5626 case ISD::SETONE:
5627 case ISD::SETNE: return X86::COND_NE;
5628 case ISD::SETUO: return X86::COND_P;
5629 case ISD::SETO: return X86::COND_NP;
5630 case ISD::SETOEQ:
5631 case ISD::SETUNE: return X86::COND_INVALID;
5632 }
5633}
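For reference, the ZF/PF/CF table above can be modeled outside of LLVM. The sketch below is a hypothetical, standalone illustration of why SETOGT maps to COND_A and SETUO maps to COND_P; the fpCompareFlags helper is an assumption made for the example, not an LLVM or x86 API.

#include <cmath>
#include <cstdio>

// Model the flags a UCOMISS-style compare of X against Y would produce.
struct Flags { bool ZF, PF, CF; };

static Flags fpCompareFlags(float X, float Y) {
  if (std::isnan(X) || std::isnan(Y)) return {true, true, true};   // unordered
  if (X > Y)  return {false, false, false};                        // X > Y
  if (X < Y)  return {false, false, true};                         // X < Y
  return {true, false, false};                                     // X == Y
}

int main() {
  // COND_A ("above") is ZF == 0 && CF == 0, which is why SETOGT maps to it.
  Flags F = fpCompareFlags(2.0f, 1.0f);
  std::printf("2.0 > 1.0 taken as COND_A: %d\n", !F.ZF && !F.CF);
  // An unordered compare sets all three flags, so SETUO maps to COND_P.
  Flags U = fpCompareFlags(NAN, 1.0f);
  std::printf("NaN vs 1.0 taken as COND_P: %d\n", U.PF);
}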
5634
5635/// Is there a floating point cmov for the specific X86 condition code?
5636/// Current x86 isa includes the following FP cmov instructions:
5637/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5638static bool hasFPCMov(unsigned X86CC) {
5639 switch (X86CC) {
5640 default:
5641 return false;
5642 case X86::COND_B:
5643 case X86::COND_BE:
5644 case X86::COND_E:
5645 case X86::COND_P:
5646 case X86::COND_A:
5647 case X86::COND_AE:
5648 case X86::COND_NE:
5649 case X86::COND_NP:
5650 return true;
5651 }
5652}
5653
5654static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5655 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5656 VT.is512BitVector();
5657}
5658
5659bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5660 const CallInst &I,
5661 MachineFunction &MF,
5662 unsigned Intrinsic) const {
5663 Info.flags = MachineMemOperand::MONone;
5664 Info.offset = 0;
5665
5666 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5667 if (!IntrData) {
5668 switch (Intrinsic) {
5669 case Intrinsic::x86_aesenc128kl:
5670 case Intrinsic::x86_aesdec128kl:
5671 Info.opc = ISD::INTRINSIC_W_CHAIN;
5672 Info.ptrVal = I.getArgOperand(1);
5673 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5674 Info.align = Align(1);
5675 Info.flags |= MachineMemOperand::MOLoad;
5676 return true;
5677 case Intrinsic::x86_aesenc256kl:
5678 case Intrinsic::x86_aesdec256kl:
5679 Info.opc = ISD::INTRINSIC_W_CHAIN;
5680 Info.ptrVal = I.getArgOperand(1);
5681 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5682 Info.align = Align(1);
5683 Info.flags |= MachineMemOperand::MOLoad;
5684 return true;
5685 case Intrinsic::x86_aesencwide128kl:
5686 case Intrinsic::x86_aesdecwide128kl:
5687 Info.opc = ISD::INTRINSIC_W_CHAIN;
5688 Info.ptrVal = I.getArgOperand(0);
5689 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5690 Info.align = Align(1);
5691 Info.flags |= MachineMemOperand::MOLoad;
5692 return true;
5693 case Intrinsic::x86_aesencwide256kl:
5694 case Intrinsic::x86_aesdecwide256kl:
5695 Info.opc = ISD::INTRINSIC_W_CHAIN;
5696 Info.ptrVal = I.getArgOperand(0);
5697 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5698 Info.align = Align(1);
5699 Info.flags |= MachineMemOperand::MOLoad;
5700 return true;
5701 case Intrinsic::x86_cmpccxadd32:
5702 case Intrinsic::x86_cmpccxadd64:
5703 case Intrinsic::x86_atomic_bts:
5704 case Intrinsic::x86_atomic_btc:
5705 case Intrinsic::x86_atomic_btr: {
5706 Info.opc = ISD::INTRINSIC_W_CHAIN;
5707 Info.ptrVal = I.getArgOperand(0);
5708 unsigned Size = I.getType()->getScalarSizeInBits();
5709 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5710 Info.align = Align(Size);
5711 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5712 MachineMemOperand::MOVolatile;
5713 return true;
5714 }
5715 case Intrinsic::x86_atomic_bts_rm:
5716 case Intrinsic::x86_atomic_btc_rm:
5717 case Intrinsic::x86_atomic_btr_rm: {
5718 Info.opc = ISD::INTRINSIC_W_CHAIN;
5719 Info.ptrVal = I.getArgOperand(0);
5720 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5721 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5722 Info.align = Align(Size);
5723 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5724 MachineMemOperand::MOVolatile;
5725 return true;
5726 }
5727 case Intrinsic::x86_aadd32:
5728 case Intrinsic::x86_aadd64:
5729 case Intrinsic::x86_aand32:
5730 case Intrinsic::x86_aand64:
5731 case Intrinsic::x86_aor32:
5732 case Intrinsic::x86_aor64:
5733 case Intrinsic::x86_axor32:
5734 case Intrinsic::x86_axor64:
5735 case Intrinsic::x86_atomic_add_cc:
5736 case Intrinsic::x86_atomic_sub_cc:
5737 case Intrinsic::x86_atomic_or_cc:
5738 case Intrinsic::x86_atomic_and_cc:
5739 case Intrinsic::x86_atomic_xor_cc: {
5740 Info.opc = ISD::INTRINSIC_W_CHAIN;
5741 Info.ptrVal = I.getArgOperand(0);
5742 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5743 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5744 Info.align = Align(Size);
5745 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5746 MachineMemOperand::MOVolatile;
5747 return true;
5748 }
5749 }
5750 return false;
5751 }
5752
5753 switch (IntrData->Type) {
5754 case TRUNCATE_TO_MEM_VI8:
5755 case TRUNCATE_TO_MEM_VI16:
5756 case TRUNCATE_TO_MEM_VI32: {
5757 Info.opc = ISD::INTRINSIC_VOID;
5758 Info.ptrVal = I.getArgOperand(0);
5759 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5760 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5761 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5762 ScalarVT = MVT::i8;
5763 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5764 ScalarVT = MVT::i16;
5765 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5766 ScalarVT = MVT::i32;
5767
5768 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5769 Info.align = Align(1);
5770 Info.flags |= MachineMemOperand::MOStore;
5771 break;
5772 }
5773 case GATHER:
5774 case GATHER_AVX2: {
5775 Info.opc = ISD::INTRINSIC_W_CHAIN;
5776 Info.ptrVal = nullptr;
5777 MVT DataVT = MVT::getVT(I.getType());
5778 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5779 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5780 IndexVT.getVectorNumElements());
5781 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5782 Info.align = Align(1);
5783 Info.flags |= MachineMemOperand::MOLoad;
5784 break;
5785 }
5786 case SCATTER: {
5787 Info.opc = ISD::INTRINSIC_VOID;
5788 Info.ptrVal = nullptr;
5789 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5790 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5791 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5792 IndexVT.getVectorNumElements());
5793 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5794 Info.align = Align(1);
5795 Info.flags |= MachineMemOperand::MOStore;
5796 break;
5797 }
5798 default:
5799 return false;
5800 }
5801
5802 return true;
5803}
5804
5805/// Returns true if the target can instruction select the
5806/// specified FP immediate natively. If false, the legalizer will
5807/// materialize the FP immediate as a load from a constant pool.
5808bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5809 bool ForCodeSize) const {
5810 for (const APFloat &FPImm : LegalFPImmediates)
5811 if (Imm.bitwiseIsEqual(FPImm))
5812 return true;
5813 return false;
5814}
5815
5816bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5817 ISD::LoadExtType ExtTy,
5818 EVT NewVT) const {
5819  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5820
5821 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5822 // relocation target a movq or addq instruction: don't let the load shrink.
5823 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5824 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5825 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5826 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5827
5828 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5829 // those uses are extracted directly into a store, then the extract + store
5830 // can be store-folded. Therefore, it's probably not worth splitting the load.
5831 EVT VT = Load->getValueType(0);
5832 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5833 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5834 // Skip uses of the chain value. Result 0 of the node is the load value.
5835 if (UI.getUse().getResNo() != 0)
5836 continue;
5837
5838 // If this use is not an extract + store, it's probably worth splitting.
5839 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5840 UI->use_begin()->getOpcode() != ISD::STORE)
5841 return true;
5842 }
5843 // All non-chain uses are extract + store.
5844 return false;
5845 }
5846
5847 return true;
5848}
5849
5850/// Returns true if it is beneficial to convert a load of a constant
5851/// to just the constant itself.
5852bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5853 Type *Ty) const {
5854  assert(Ty->isIntegerTy());
5855
5856 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5857 if (BitSize == 0 || BitSize > 64)
5858 return false;
5859 return true;
5860}
5861
5862bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5863 // If we are using XMM registers in the ABI and the condition of the select is
5864 // a floating-point compare and we have blendv or conditional move, then it is
5865 // cheaper to select instead of doing a cross-register move and creating a
5866 // load that depends on the compare result.
5867 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5868 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5869}
5870
5871bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5872 // TODO: It might be a win to ease or lift this restriction, but the generic
5873 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5874 if (VT.isVector() && Subtarget.hasAVX512())
5875 return false;
5876
5877 return true;
5878}
5879
5880bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5881 SDValue C) const {
5882 // TODO: We handle scalars using custom code, but generic combining could make
5883 // that unnecessary.
5884 APInt MulC;
5885 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5886 return false;
5887
5888  // Find the type this will be legalized to. Otherwise we might prematurely
5889 // convert this to shl+add/sub and then still have to type legalize those ops.
5890 // Another choice would be to defer the decision for illegal types until
5891 // after type legalization. But constant splat vectors of i64 can't make it
5892 // through type legalization on 32-bit targets so we would need to special
5893 // case vXi64.
5894 while (getTypeAction(Context, VT) != TypeLegal)
5895 VT = getTypeToTransformTo(Context, VT);
5896
5897 // If vector multiply is legal, assume that's faster than shl + add/sub.
5898 // Multiply is a complex op with higher latency and lower throughput in
5899 // most implementations, sub-vXi32 vector multiplies are always fast,
5900 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5901 // is always going to be slow.
5902 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5903 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5904 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5905 return false;
5906
5907 // shl+add, shl+sub, shl+add+neg
5908 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5909 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5910}
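As a worked example of the power-of-two test above: 5 decomposes because 5 - 1 = 4 (x * 5 becomes (x << 2) + x), 7 because 7 + 1 = 8 (x * 7 becomes (x << 3) - x), and -3 because 1 - (-3) = 4 (x * -3 becomes x - (x << 2)). Below is a minimal standalone sketch of that check using plain int64_t rather than APInt; it is illustrative only, not LLVM code.

#include <cstdint>
#include <cstdio>

static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

// Mirrors the check above: one of (MulC + 1), (MulC - 1), (1 - MulC),
// -(MulC + 1) must be a power of two for the shl+add/sub decomposition.
static bool decomposable(int64_t MulC) {
  return isPow2(MulC + 1) || isPow2(MulC - 1) || isPow2(1 - MulC) ||
         isPow2(-(MulC + 1));
}

int main() {
  std::printf("5: %d  7: %d  -3: %d  11: %d\n",
              decomposable(5), decomposable(7), decomposable(-3),
              decomposable(11)); // 11 does not decompose this way
}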
5911
5912bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5913 unsigned Index) const {
5914 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5915 return false;
5916
5917 // Mask vectors support all subregister combinations and operations that
5918 // extract half of vector.
5919 if (ResVT.getVectorElementType() == MVT::i1)
5920 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5921 (Index == ResVT.getVectorNumElements()));
5922
5923 return (Index % ResVT.getVectorNumElements()) == 0;
5924}
5925
5926bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5927 unsigned Opc = VecOp.getOpcode();
5928
5929 // Assume target opcodes can't be scalarized.
5930 // TODO - do we have any exceptions?
5931 if (Opc >= ISD::BUILTIN_OP_END)
5932 return false;
5933
5934 // If the vector op is not supported, try to convert to scalar.
5935 EVT VecVT = VecOp.getValueType();
5936 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5937 return true;
5938
5939 // If the vector op is supported, but the scalar op is not, the transform may
5940 // not be worthwhile.
5941 EVT ScalarVT = VecVT.getScalarType();
5942 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5943}
5944
5945bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5946 bool) const {
5947 // TODO: Allow vectors?
5948 if (VT.isVector())
5949 return false;
5950 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5951}
5952
5953bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5954 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5955 return Subtarget.hasBMI() ||
5956 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5957}
5958
5959bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
5960 // Speculate ctlz only if we can directly use LZCNT.
5961 return Subtarget.hasLZCNT();
5962}
5963
5964bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
5965 return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
5966}
5967
5968bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
5969 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
5970 // expensive than a straight movsd. On the other hand, it's important to
5971 // shrink long double fp constant since fldt is very slow.
5972 return !Subtarget.hasSSE2() || VT == MVT::f80;
5973}
5974
5975bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
5976 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
5977 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
5978}
5979
5980bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5981 const SelectionDAG &DAG,
5982 const MachineMemOperand &MMO) const {
5983 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5984 BitcastVT.getVectorElementType() == MVT::i1)
5985 return false;
5986
5987 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5988 return false;
5989
5990 // If both types are legal vectors, it's always ok to convert them.
5991 if (LoadVT.isVector() && BitcastVT.isVector() &&
5992 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5993 return true;
5994
5995 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5996}
5997
5998bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5999 const MachineFunction &MF) const {
6000  // Do not merge to a float value size (128 bits) if no implicit
6001  // float attribute is set.
6002 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6003
6004 if (NoFloat) {
6005 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6006 return (MemVT.getSizeInBits() <= MaxIntSize);
6007 }
6008 // Make sure we don't merge greater than our preferred vector
6009 // width.
6010 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6011 return false;
6012
6013 return true;
6014}
6015
6016bool X86TargetLowering::isCtlzFast() const {
6017 return Subtarget.hasFastLZCNT();
6018}
6019
6020bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6021 const Instruction &AndI) const {
6022 return true;
6023}
6024
6025bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6026 EVT VT = Y.getValueType();
6027
6028 if (VT.isVector())
6029 return false;
6030
6031 if (!Subtarget.hasBMI())
6032 return false;
6033
6034 // There are only 32-bit and 64-bit forms for 'andn'.
6035 if (VT != MVT::i32 && VT != MVT::i64)
6036 return false;
6037
6038 return !isa<ConstantSDNode>(Y);
6039}
6040
6041bool X86TargetLowering::hasAndNot(SDValue Y) const {
6042 EVT VT = Y.getValueType();
6043
6044 if (!VT.isVector())
6045 return hasAndNotCompare(Y);
6046
6047 // Vector.
6048
6049 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6050 return false;
6051
6052 if (VT == MVT::v4i32)
6053 return true;
6054
6055 return Subtarget.hasSSE2();
6056}
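The 'andn' pattern referred to above is simply ~X & Y. A small standalone sketch follows (not LLVM code); with BMI enabled, a compiler is free to select a single ANDN for the helper below.

#include <cstdint>
#include <cstdio>

// The and-not pattern: clear in Y every bit that is set in X.
static uint64_t andNot(uint64_t X, uint64_t Y) { return ~X & Y; }

int main() {
  // 0xffffffff with the 0x0f0f0f0f bits cleared is 0xf0f0f0f0.
  std::printf("%llx\n", (unsigned long long)andNot(0x0f0f0f0f, 0xffffffff));
}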
6057
6058bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6059 return X.getValueType().isScalarInteger(); // 'bt'
6060}
6061
6062bool X86TargetLowering::
6063 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6064 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6065 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6066 SelectionDAG &DAG) const {
6067 // Does baseline recommend not to perform the fold by default?
6068 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6069 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6070 return false;
6071 // For scalars this transform is always beneficial.
6072 if (X.getValueType().isScalarInteger())
6073 return true;
6074 // If all the shift amounts are identical, then transform is beneficial even
6075 // with rudimentary SSE2 shifts.
6076 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6077 return true;
6078  // If we have AVX2 with its powerful shift operations, then it's also good.
6079 if (Subtarget.hasAVX2())
6080 return true;
6081 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6082 return NewShiftOpcode == ISD::SHL;
6083}
6084
6085bool X86TargetLowering::preferScalarizeSplat(unsigned Opc) const {
6086 return Opc != ISD::FP_EXTEND;
6087}
6088
6089bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6090 const SDNode *N, CombineLevel Level) const {
6091  assert(((N->getOpcode() == ISD::SHL &&
6092           N->getOperand(0).getOpcode() == ISD::SRL) ||
6093          (N->getOpcode() == ISD::SRL &&
6094           N->getOperand(0).getOpcode() == ISD::SHL)) &&
6095         "Expected shift-shift mask");
6096 // TODO: Should we always create i64 masks? Or only folded immediates?
6097 EVT VT = N->getValueType(0);
6098 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6099 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6100 // Only fold if the shift values are equal - so it folds to AND.
6101 // TODO - we should fold if either is a non-uniform vector but we don't do
6102 // the fold for non-splats yet.
6103 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6104 }
6105 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6106}
6107
6108bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6109 EVT VT = Y.getValueType();
6110
6111 // For vectors, we don't have a preference, but we probably want a mask.
6112 if (VT.isVector())
6113 return false;
6114
6115 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6116 if (VT == MVT::i64 && !Subtarget.is64Bit())
6117 return false;
6118
6119 return true;
6120}
6121
6122TargetLowering::ShiftLegalizationStrategy
6123X86TargetLowering::preferredShiftLegalizationStrategy(
6124 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6125 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6126 !Subtarget.isOSWindows())
6127 return ShiftLegalizationStrategy::LowerToLibcall;
6128 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6129 ExpansionFactor);
6130}
6131
6132bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6133 // Any legal vector type can be splatted more efficiently than
6134 // loading/spilling from memory.
6135 return isTypeLegal(VT);
6136}
6137
6138MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6139 MVT VT = MVT::getIntegerVT(NumBits);
6140 if (isTypeLegal(VT))
6141 return VT;
6142
6143 // PMOVMSKB can handle this.
6144 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6145 return MVT::v16i8;
6146
6147 // VPMOVMSKB can handle this.
6148 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6149 return MVT::v32i8;
6150
6151 // TODO: Allow 64-bit type for 32-bit target.
6152 // TODO: 512-bit types should be allowed, but make sure that those
6153 // cases are handled in combineVectorSizedSetCCEquality().
6154
6155 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6156}
6157
6158/// Val is the undef sentinel value or equal to the specified value.
6159static bool isUndefOrEqual(int Val, int CmpVal) {
6160 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6161}
6162
6163/// Return true if every element in Mask is the undef sentinel value or equal to
6164/// the specified value.
6165static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6166 return llvm::all_of(Mask, [CmpVal](int M) {
6167 return (M == SM_SentinelUndef) || (M == CmpVal);
6168 });
6169}
6170
6171/// Val is either the undef or zero sentinel value.
6172static bool isUndefOrZero(int Val) {
6173 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6174}
6175
6176/// Return true if every element in Mask, beginning from position Pos and ending
6177/// in Pos+Size is the undef sentinel value.
6178static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6179 return llvm::all_of(Mask.slice(Pos, Size),
6180 [](int M) { return M == SM_SentinelUndef; });
6181}
6182
6183/// Return true if the mask creates a vector whose lower half is undefined.
6184static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6185 unsigned NumElts = Mask.size();
6186 return isUndefInRange(Mask, 0, NumElts / 2);
6187}
6188
6189/// Return true if the mask creates a vector whose upper half is undefined.
6190static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6191 unsigned NumElts = Mask.size();
6192 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6193}
6194
6195/// Return true if Val falls within the specified half-open range [Low, Hi).
6196static bool isInRange(int Val, int Low, int Hi) {
6197 return (Val >= Low && Val < Hi);
6198}
6199
6200/// Return true if the value of any element in Mask falls within the specified
6201/// half-open range [Low, Hi).
6202static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6203 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6204}
6205
6206/// Return true if the value of any element in Mask is the zero sentinel value.
6207static bool isAnyZero(ArrayRef<int> Mask) {
6208 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6209}
6210
6211/// Return true if the value of any element in Mask is the zero or undef
6212/// sentinel values.
6213static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6214 return llvm::any_of(Mask, [](int M) {
6215 return M == SM_SentinelZero || M == SM_SentinelUndef;
6216 });
6217}
6218
6219/// Return true if Val is undef or if its value falls within the
6220/// specified half-open range [Low, Hi).
6221static bool isUndefOrInRange(int Val, int Low, int Hi) {
6222 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6223}
6224
6225/// Return true if every element in Mask is undef or if its value
6226/// falls within the specified half-open range [Low, Hi).
6227static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6228 return llvm::all_of(
6229 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6230}
6231
6232/// Return true if Val is undef, zero or if its value falls within the
6233/// specified half-open range [Low, Hi).
6234static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6235 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6236}
6237
6238/// Return true if every element in Mask is undef, zero or if its value
6239/// falls within the specified half-open range [Low, Hi).
6240static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6241 return llvm::all_of(
6242 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6243}
6244
6245/// Return true if every element in Mask, beginning
6246/// from position Pos and ending in Pos + Size, falls within the specified
6247/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6248static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6249 unsigned Size, int Low, int Step = 1) {
6250 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6251 if (!isUndefOrEqual(Mask[i], Low))
6252 return false;
6253 return true;
6254}
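// Illustrative example (mask values chosen for illustration only): with
// Mask = {4, 5, -1, 7}, Pos = 0, Size = 4, Low = 4 and Step = 1 this returns
// true, since index 2 is SM_SentinelUndef and the rest match the expected
// sequence 4, 5, 6, 7. With Low = 0 and Step = 2 the mask {0, 2, -1, 6} also
// passes, while {4, 6, 5, 7} fails at index 1 (6 != 5).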
6255
6256/// Return true if every element in Mask, beginning
6257/// from position Pos and ending in Pos+Size, falls within the specified
6258/// sequential range (Low, Low+Size], or is undef or is zero.
6259static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6260 unsigned Size, int Low,
6261 int Step = 1) {
6262 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6263 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6264 return false;
6265 return true;
6266}
6267
6268/// Return true if every element in Mask, beginning
6269/// from position Pos and ending in Pos+Size is undef or is zero.
6270static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6271 unsigned Size) {
6272 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6273}
6274
6275/// Helper function to test whether a shuffle mask could be
6276/// simplified by widening the elements being shuffled.
6277///
6278/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6279/// leaves it in an unspecified state.
6280///
6281/// NOTE: This must handle normal vector shuffle masks and *target* vector
6282/// shuffle masks. The latter have the special property of a '-2' representing
6283/// a zero-ed lane of a vector.
6284static bool canWidenShuffleElements(ArrayRef<int> Mask,
6285 SmallVectorImpl<int> &WidenedMask) {
6286 WidenedMask.assign(Mask.size() / 2, 0);
6287 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6288 int M0 = Mask[i];
6289 int M1 = Mask[i + 1];
6290
6291 // If both elements are undef, it's trivial.
6292 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6293 WidenedMask[i / 2] = SM_SentinelUndef;
6294 continue;
6295 }
6296
6297 // Check for an undef mask and a mask value properly aligned to fit with
6298 // a pair of values. If we find such a case, use the non-undef mask's value.
6299 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6300 WidenedMask[i / 2] = M1 / 2;
6301 continue;
6302 }
6303 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6304 WidenedMask[i / 2] = M0 / 2;
6305 continue;
6306 }
6307
6308 // When zeroing, we need to spread the zeroing across both lanes to widen.
6309 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6310 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6311 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6312 WidenedMask[i / 2] = SM_SentinelZero;
6313 continue;
6314 }
6315 return false;
6316 }
6317
6318 // Finally check if the two mask values are adjacent and aligned with
6319 // a pair.
6320 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6321 WidenedMask[i / 2] = M0 / 2;
6322 continue;
6323 }
6324
6325 // Otherwise we can't safely widen the elements used in this shuffle.
6326 return false;
6327 }
6328   assert(WidenedMask.size() == Mask.size() / 2 &&
6329          "Incorrect size of mask after widening the elements!");
6330
6331 return true;
6332}
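// Illustrative examples (mask values chosen for illustration only): the v4
// mask {0, 1, 6, 7} widens to {0, 3}; {-1, 1, 2, 3} widens to {0, 1} because
// the undef element pairs with the odd-aligned '1'; {-2, -1, 0, 1} widens to
// {SM_SentinelZero, 0} since a zero/undef pair becomes a zeroed wide lane;
// {1, 2, 3, 4} cannot be widened because its first pair is not 2-aligned.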
6333
6334static bool canWidenShuffleElements(ArrayRef<int> Mask,
6335 const APInt &Zeroable,
6336 bool V2IsZero,
6337 SmallVectorImpl<int> &WidenedMask) {
6338 // Create an alternative mask with info about zeroable elements.
6339 // Here we do not set undef elements as zeroable.
6340 SmallVector<int, 64> ZeroableMask(Mask);
6341 if (V2IsZero) {
6342     assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6343 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6344 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6345 ZeroableMask[i] = SM_SentinelZero;
6346 }
6347 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6348}
6349
6350static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6351 SmallVector<int, 32> WidenedMask;
6352 return canWidenShuffleElements(Mask, WidenedMask);
6353}
6354
6355// Attempt to narrow/widen shuffle mask until it matches the target number of
6356// elements.
6357static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6358 SmallVectorImpl<int> &ScaledMask) {
6359 unsigned NumSrcElts = Mask.size();
6360   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6361          "Illegal shuffle scale factor");
6362
6363 // Narrowing is guaranteed to work.
6364 if (NumDstElts >= NumSrcElts) {
6365 int Scale = NumDstElts / NumSrcElts;
6366 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6367 return true;
6368 }
6369
6370 // We have to repeat the widening until we reach the target size, but we can
6371 // split out the first widening as it sets up ScaledMask for us.
6372 if (canWidenShuffleElements(Mask, ScaledMask)) {
6373 while (ScaledMask.size() > NumDstElts) {
6374 SmallVector<int, 16> WidenedMask;
6375 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6376 return false;
6377 ScaledMask = std::move(WidenedMask);
6378 }
6379 return true;
6380 }
6381
6382 return false;
6383}
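// Illustrative example (mask values assumed): scaling the 2-element mask
// {0, 1} to NumDstElts = 4 narrows each lane and yields {0, 1, 2, 3};
// scaling the 4-element mask {0, 1, 6, 7} to NumDstElts = 2 widens pairs and
// yields {0, 3}; the 4-element mask {1, 0, 3, 2} cannot be scaled down to 2
// elements because neither pair is contiguous and 2-aligned.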
6384
6385/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6386bool X86::isZeroNode(SDValue Elt) {
6387 return isNullConstant(Elt) || isNullFPConstant(Elt);
6388}
6389
6390// Build a vector of constants.
6391// Use an UNDEF node if MaskElt == -1.
6392// Split 64-bit constants in 32-bit mode.
6393static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6394 const SDLoc &dl, bool IsMask = false) {
6395
6396 SmallVector<SDValue, 32> Ops;
6397 bool Split = false;
6398
6399 MVT ConstVecVT = VT;
6400 unsigned NumElts = VT.getVectorNumElements();
6401 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6402 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6403 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6404 Split = true;
6405 }
6406
6407 MVT EltVT = ConstVecVT.getVectorElementType();
6408 for (unsigned i = 0; i < NumElts; ++i) {
6409 bool IsUndef = Values[i] < 0 && IsMask;
6410 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6411 DAG.getConstant(Values[i], dl, EltVT);
6412 Ops.push_back(OpNode);
6413 if (Split)
6414 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6415 DAG.getConstant(0, dl, EltVT));
6416 }
6417 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6418 if (Split)
6419 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6420 return ConstsNode;
6421}
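// Illustrative example (assuming a 32-bit target where MVT::i64 is not
// legal): building the v2i64 mask constant {3, 1} emits the v4i32
// build_vector {3, 0, 1, 0}, i.e. each 64-bit element becomes its low 32 bits
// followed by a zero high half, and the result is bitcast back to v2i64.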
6422
6423static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6424 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6425   assert(Bits.size() == Undefs.getBitWidth() &&
6426          "Unequal constant and undef arrays");
6427 SmallVector<SDValue, 32> Ops;
6428 bool Split = false;
6429
6430 MVT ConstVecVT = VT;
6431 unsigned NumElts = VT.getVectorNumElements();
6432 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6433 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6434 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6435 Split = true;
6436 }
6437
6438 MVT EltVT = ConstVecVT.getVectorElementType();
6439 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6440 if (Undefs[i]) {
6441 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6442 continue;
6443 }
6444 const APInt &V = Bits[i];
6445     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6446 if (Split) {
6447 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6448 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6449 } else if (EltVT == MVT::f32) {
6450 APFloat FV(APFloat::IEEEsingle(), V);
6451 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6452 } else if (EltVT == MVT::f64) {
6453 APFloat FV(APFloat::IEEEdouble(), V);
6454 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6455 } else {
6456 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6457 }
6458 }
6459
6460 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6461 return DAG.getBitcast(VT, ConstsNode);
6462}
6463
6464/// Returns a vector of specified type with all zero elements.
6465static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6466 SelectionDAG &DAG, const SDLoc &dl) {
6467   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6468           VT.getVectorElementType() == MVT::i1) &&
6469          "Unexpected vector type");
6470
6471 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6472 // type. This ensures they get CSE'd. But if the integer type is not
6473 // available, use a floating-point +0.0 instead.
6474 SDValue Vec;
6475 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6476 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6477 } else if (VT.isFloatingPoint()) {
6478 Vec = DAG.getConstantFP(+0.0, dl, VT);
6479 } else if (VT.getVectorElementType() == MVT::i1) {
6480     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6481            "Unexpected vector type");
6482 Vec = DAG.getConstant(0, dl, VT);
6483 } else {
6484 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6485 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6486 }
6487 return DAG.getBitcast(VT, Vec);
6488}
6489
6490// Helper to determine if the ops are both subvectors extracted from a single
6491// source. If commuting is allowed they don't have to be in order (Lo/Hi).
6492static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6493 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6494 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6495 LHS.getValueType() != RHS.getValueType() ||
6496 LHS.getOperand(0) != RHS.getOperand(0))
6497 return SDValue();
6498
6499 SDValue Src = LHS.getOperand(0);
6500 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6501 return SDValue();
6502
6503 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6504 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6505 RHS.getConstantOperandAPInt(1) == NumElts) ||
6506 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6507 LHS.getConstantOperandAPInt(1) == NumElts))
6508 return Src;
6509
6510 return SDValue();
6511}
6512
6513static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6514 const SDLoc &dl, unsigned vectorWidth) {
6515 EVT VT = Vec.getValueType();
6516 EVT ElVT = VT.getVectorElementType();
6517 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6518 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6519 VT.getVectorNumElements() / Factor);
6520
6521 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6522 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6523   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6524
6525 // This is the index of the first element of the vectorWidth-bit chunk
6526 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6527 IdxVal &= ~(ElemsPerChunk - 1);
6528
6529 // If the input is a buildvector just emit a smaller one.
6530 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6531 return DAG.getBuildVector(ResultVT, dl,
6532 Vec->ops().slice(IdxVal, ElemsPerChunk));
6533
6534 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6535 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6536}
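// Illustrative example (types assumed): extracting with vectorWidth = 128
// from a v8i32 source gives Factor = 2 and ElemsPerChunk = 4, so a request
// for IdxVal = 5 is rounded down to 4 and the upper v4i32 half is returned.
// If the source is itself a BUILD_VECTOR, a smaller 4-element build_vector
// is emitted directly instead of an EXTRACT_SUBVECTOR node.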
6537
6538/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6539/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6540/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6541/// instructions or a simple subregister reference. Idx is an index in the
6542/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6543/// lowering EXTRACT_VECTOR_ELT operations easier.
6544static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6545 SelectionDAG &DAG, const SDLoc &dl) {
6546   assert((Vec.getValueType().is256BitVector() ||
6547           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6548 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6549}
6550
6551/// Generate a DAG to grab 256-bits from a 512-bit vector.
6552static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6553 SelectionDAG &DAG, const SDLoc &dl) {
6554   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6555 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6556}
6557
6558static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6559 SelectionDAG &DAG, const SDLoc &dl,
6560 unsigned vectorWidth) {
6561   assert((vectorWidth == 128 || vectorWidth == 256) &&
6562          "Unsupported vector width");
6563 // Inserting UNDEF is a no-op; just return Result.
6564 if (Vec.isUndef())
6565 return Result;
6566 EVT VT = Vec.getValueType();
6567 EVT ElVT = VT.getVectorElementType();
6568 EVT ResultVT = Result.getValueType();
6569
6570 // Insert the relevant vectorWidth bits.
6571 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6572   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6573
6574 // This is the index of the first element of the vectorWidth-bit chunk
6575 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6576 IdxVal &= ~(ElemsPerChunk - 1);
6577
6578 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6579 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6580}
6581
6582/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6583/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6584/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6585/// simple superregister reference. Idx is an index in the 128 bits
6586/// we want. It need not be aligned to a 128-bit boundary. That makes
6587/// lowering INSERT_VECTOR_ELT operations easier.
6588static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6589 SelectionDAG &DAG, const SDLoc &dl) {
6590   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6591 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6592}
6593
6594/// Widen a vector to a larger size with the same scalar type, with the new
6595/// elements either zero or undef.
6596static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6597 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6598 const SDLoc &dl) {
6599   assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6600          Vec.getValueType().getScalarType() == VT.getScalarType() &&
6601          "Unsupported vector widening type");
6602 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6603 : DAG.getUNDEF(VT);
6604 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6605 DAG.getIntPtrConstant(0, dl));
6606}
6607
6608/// Widen a vector to a larger size with the same scalar type, with the new
6609/// elements either zero or undef.
6610static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6611 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6612 const SDLoc &dl, unsigned WideSizeInBits) {
6613   assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6614          (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6615          "Unsupported vector widening type");
6616 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6617 MVT SVT = Vec.getSimpleValueType().getScalarType();
6618 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6619 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6620}
6621
6622// Helper function to collect subvector ops that are concatenated together,
6623// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6624// The subvectors in Ops are guaranteed to be the same type.
6625static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6626 SelectionDAG &DAG) {
6627   assert(Ops.empty() && "Expected an empty ops vector");
6628
6629 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6630 Ops.append(N->op_begin(), N->op_end());
6631 return true;
6632 }
6633
6634 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6635 SDValue Src = N->getOperand(0);
6636 SDValue Sub = N->getOperand(1);
6637 const APInt &Idx = N->getConstantOperandAPInt(2);
6638 EVT VT = Src.getValueType();
6639 EVT SubVT = Sub.getValueType();
6640
6641 // TODO - Handle more general insert_subvector chains.
6642 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6643 // insert_subvector(undef, x, lo)
6644 if (Idx == 0 && Src.isUndef()) {
6645 Ops.push_back(Sub);
6646 Ops.push_back(DAG.getUNDEF(SubVT));
6647 return true;
6648 }
6649 if (Idx == (VT.getVectorNumElements() / 2)) {
6650 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6651 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6652 Src.getOperand(1).getValueType() == SubVT &&
6653 isNullConstant(Src.getOperand(2))) {
6654 Ops.push_back(Src.getOperand(1));
6655 Ops.push_back(Sub);
6656 return true;
6657 }
6658 // insert_subvector(x, extract_subvector(x, lo), hi)
6659 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6660 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6661 Ops.append(2, Sub);
6662 return true;
6663 }
6664 // insert_subvector(undef, x, hi)
6665 if (Src.isUndef()) {
6666 Ops.push_back(DAG.getUNDEF(SubVT));
6667 Ops.push_back(Sub);
6668 return true;
6669 }
6670 }
6671 }
6672 }
6673
6674 return false;
6675}
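// Illustrative examples of the patterns matched above (operands assumed):
//   concat_vectors(a, b)                                     -> Ops = {a, b}
//   insert_subvector(undef, x, 0)                            -> Ops = {x, undef}
//   insert_subvector(insert_subvector(undef, x, 0), y, N/2)  -> Ops = {x, y}
//   insert_subvector(x, extract_subvector(x, 0), N/2)        -> Ops holds the
//                                                               low half of x twice
// where N is the number of elements of the wide vector type.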
6676
6677static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6678 const SDLoc &dl) {
6679 EVT VT = Op.getValueType();
6680 unsigned NumElems = VT.getVectorNumElements();
6681 unsigned SizeInBits = VT.getSizeInBits();
6682   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6683          "Can't split odd sized vector");
6684
6685 // If this is a splat value (with no-undefs) then use the lower subvector,
6686 // which should be a free extraction.
6687 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6688 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6689 return std::make_pair(Lo, Lo);
6690
6691 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6692 return std::make_pair(Lo, Hi);
6693}
6694
6695/// Break an operation into 2 half sized ops and then concatenate the results.
6696static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6697 unsigned NumOps = Op.getNumOperands();
6698 EVT VT = Op.getValueType();
6699 SDLoc dl(Op);
6700
6701 // Extract the LHS Lo/Hi vectors
6702 SmallVector<SDValue> LoOps(NumOps, SDValue());
6703 SmallVector<SDValue> HiOps(NumOps, SDValue());
6704 for (unsigned I = 0; I != NumOps; ++I) {
6705 SDValue SrcOp = Op.getOperand(I);
6706 if (!SrcOp.getValueType().isVector()) {
6707 LoOps[I] = HiOps[I] = SrcOp;
6708 continue;
6709 }
6710 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6711 }
6712
6713 EVT LoVT, HiVT;
6714 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6715 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6716 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6717 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6718}
6719
6720/// Break a unary integer operation into 2 half-sized ops and then
6721/// concatenate the result back.
6722static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6723 // Make sure we only try to split 256/512-bit types to avoid creating
6724 // narrow vectors.
6725 EVT VT = Op.getValueType();
6726 (void)VT;
6727   assert((Op.getOperand(0).getValueType().is256BitVector() ||
6728           Op.getOperand(0).getValueType().is512BitVector()) &&
6729          (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6730   assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6731              VT.getVectorNumElements() &&
6732          "Unexpected VTs!");
6733 return splitVectorOp(Op, DAG);
6734}
6735
6736/// Break a binary integer operation into 2 half sized ops and then
6737/// concatenate the result back.
6738static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6739 // Assert that all the types match.
6740 EVT VT = Op.getValueType();
6741 (void)VT;
6742   assert(Op.getOperand(0).getValueType() == VT &&
6743          Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6744   assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6745 return splitVectorOp(Op, DAG);
6746}
6747
6748// Helper for splitting operands of an operation into legal target-sized parts
6749// and applying a function to each part.
6750// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6751// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6752// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6753// The argument Builder is a function that will be applied to each split part:
6754// SDValue Builder(SelectionDAG &G, SDLoc DL, ArrayRef<SDValue> Ops)
6755template <typename F>
6756SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6757 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6758 F Builder, bool CheckBWI = true) {
6759   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6760 unsigned NumSubs = 1;
6761 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6762 (!CheckBWI && Subtarget.useAVX512Regs())) {
6763 if (VT.getSizeInBits() > 512) {
6764 NumSubs = VT.getSizeInBits() / 512;
6765       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6766 }
6767 } else if (Subtarget.hasAVX2()) {
6768 if (VT.getSizeInBits() > 256) {
6769 NumSubs = VT.getSizeInBits() / 256;
6770       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6771 }
6772 } else {
6773 if (VT.getSizeInBits() > 128) {
6774 NumSubs = VT.getSizeInBits() / 128;
6775       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6776 }
6777 }
6778
6779 if (NumSubs == 1)
6780 return Builder(DAG, DL, Ops);
6781
6782 SmallVector<SDValue, 4> Subs;
6783 for (unsigned i = 0; i != NumSubs; ++i) {
6784 SmallVector<SDValue, 2> SubOps;
6785 for (SDValue Op : Ops) {
6786 EVT OpVT = Op.getValueType();
6787 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6788 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6789 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6790 }
6791 Subs.push_back(Builder(DAG, DL, SubOps));
6792 }
6793 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6794}
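// Illustrative caller-side sketch (operand names LHS/RHS assumed): splitting
// a wide integer add into the widest legal pieces and letting the helper
// re-concatenate the result.
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops[0], Ops[1]);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS},
//                                  AddBuilder, /*CheckBWI=*/true);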
6795
6796// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6797// targets.
6798static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6799 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6800 const X86Subtarget &Subtarget) {
6801   assert(Subtarget.hasAVX512() && "AVX512 target expected");
6802 MVT SVT = VT.getScalarType();
6803
6804 // If we have a 32/64 splatted constant, splat it to DstTy to
6805 // encourage a foldable broadcast'd operand.
6806 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6807 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6808 // AVX512 broadcasts 32/64-bit operands.
6809 // TODO: Support float once getAVX512Node is used by fp-ops.
6810 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6811 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6812 return SDValue();
6813 // If we're not widening, don't bother if we're not bitcasting.
6814 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6815 return SDValue();
6816 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6817 APInt SplatValue, SplatUndef;
6818 unsigned SplatBitSize;
6819 bool HasAnyUndefs;
6820 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6821 HasAnyUndefs, OpEltSizeInBits) &&
6822 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6823 return DAG.getConstant(SplatValue, DL, DstVT);
6824 }
6825 return SDValue();
6826 };
6827
6828 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6829
6830 MVT DstVT = VT;
6831 if (Widen)
6832 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6833
6834 // Canonicalize src operands.
6835 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6836 for (SDValue &Op : SrcOps) {
6837 MVT OpVT = Op.getSimpleValueType();
6838 // Just pass through scalar operands.
6839 if (!OpVT.isVector())
6840 continue;
6841     assert(OpVT == VT && "Vector type mismatch");
6842
6843 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6844 Op = BroadcastOp;
6845 continue;
6846 }
6847
6848 // Just widen the subvector by inserting into an undef wide vector.
6849 if (Widen)
6850 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6851 }
6852
6853 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6854
6855 // Perform the 512-bit op then extract the bottom subvector.
6856 if (Widen)
6857 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6858 return Res;
6859}
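// Illustrative example (types assumed): on an AVX512F target without VLX, a
// v8i32 node is widened here to v16i32 (each vector operand is inserted into
// an undef 512-bit vector), the 512-bit op is created, and the low 256 bits
// are extracted back out. A splatted 32/64-bit constant operand is instead
// re-emitted as a v16i32 constant so the 512-bit instruction can fold it as
// a broadcast.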
6860
6861/// Insert i1-subvector to i1-vector.
6862static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6863 const X86Subtarget &Subtarget) {
6864
6865 SDLoc dl(Op);
6866 SDValue Vec = Op.getOperand(0);
6867 SDValue SubVec = Op.getOperand(1);
6868 SDValue Idx = Op.getOperand(2);
6869 unsigned IdxVal = Op.getConstantOperandVal(2);
6870
6871 // Inserting undef is a nop. We can just return the original vector.
6872 if (SubVec.isUndef())
6873 return Vec;
6874
6875 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6876 return Op;
6877
6878 MVT OpVT = Op.getSimpleValueType();
6879 unsigned NumElems = OpVT.getVectorNumElements();
6880 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6881
6882 // Extend to natively supported kshift.
6883 MVT WideOpVT = OpVT;
6884 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6885 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6886
6887 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6888 // if necessary.
6889 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6890 // May need to promote to a legal type.
6891 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6892 DAG.getConstant(0, dl, WideOpVT),
6893 SubVec, Idx);
6894 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6895 }
6896
6897 MVT SubVecVT = SubVec.getSimpleValueType();
6898 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6899   assert(IdxVal + SubVecNumElems <= NumElems &&
6900          IdxVal % SubVecVT.getSizeInBits() == 0 &&
6901          "Unexpected index value in INSERT_SUBVECTOR");
6902
6903 SDValue Undef = DAG.getUNDEF(WideOpVT);
6904
6905 if (IdxVal == 0) {
6906 // Zero lower bits of the Vec
6907 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6908 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6909 ZeroIdx);
6910 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6911 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6912 // Merge them together, SubVec should be zero extended.
6913 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6914 DAG.getConstant(0, dl, WideOpVT),
6915 SubVec, ZeroIdx);
6916 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6918 }
6919
6920 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6921 Undef, SubVec, ZeroIdx);
6922
6923 if (Vec.isUndef()) {
6924     assert(IdxVal != 0 && "Unexpected index");
6925 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6926 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6928 }
6929
6930 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6931     assert(IdxVal != 0 && "Unexpected index");
6932 // If upper elements of Vec are known undef, then just shift into place.
6933 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6934 [](SDValue V) { return V.isUndef(); })) {
6935 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6936 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6937 } else {
6938 NumElems = WideOpVT.getVectorNumElements();
6939 unsigned ShiftLeft = NumElems - SubVecNumElems;
6940 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6941 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6942 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6943 if (ShiftRight != 0)
6944 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6945 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6946 }
6947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6948 }
6949
6950 // Simple case when we put subvector in the upper part
6951 if (IdxVal + SubVecNumElems == NumElems) {
6952 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6953 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6954 if (SubVecNumElems * 2 == NumElems) {
6955 // Special case, use legal zero extending insert_subvector. This allows
6956 // isel to optimize when bits are known zero.
6957 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6958 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6959 DAG.getConstant(0, dl, WideOpVT),
6960 Vec, ZeroIdx);
6961 } else {
6962 // Otherwise use explicit shifts to zero the bits.
6963 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6964 Undef, Vec, ZeroIdx);
6965 NumElems = WideOpVT.getVectorNumElements();
6966 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6967 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6968 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6969 }
6970 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6971 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6972 }
6973
6974 // Inserting into the middle is more complicated.
6975
6976 NumElems = WideOpVT.getVectorNumElements();
6977
6978 // Widen the vector if needed.
6979 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6980
6981 unsigned ShiftLeft = NumElems - SubVecNumElems;
6982 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6983
6984 // Do an optimization for the most frequently used types.
6985 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6986 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6987 Mask0.flipAllBits();
6988 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6989 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6990 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6991 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6992 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6993 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6994 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6995 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6996
6997 // Reduce to original width if needed.
6998 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6999 }
7000
7001 // Clear the upper bits of the subvector and move it to its insert position.
7002 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7003 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7004 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7005 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7006
7007 // Isolate the bits below the insertion point.
7008 unsigned LowShift = NumElems - IdxVal;
7009 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7010 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7011 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7012 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7013
7014 // Isolate the bits after the last inserted bit.
7015 unsigned HighShift = IdxVal + SubVecNumElems;
7016 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7017 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7018 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7019 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7020
7021 // Now OR all 3 pieces together.
7022 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7023 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7024
7025 // Reduce to original width if needed.
7026 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7027}
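// Worked example of the mask-and-OR fast path above (concrete sizes assumed):
// inserting a v2i1 subvector into a v8i1 vector at IdxVal = 2 on a DQI target
// keeps WideOpVT = v8i1 and gives ShiftLeft = 6, ShiftRight = 4. Vec is ANDed
// with the inverted mask 0b11110011 to clear bits 2-3, SubVec is shifted left
// by 6 and then right by 4 so its two bits land zero-extended in positions
// 2-3, and the two values are ORed back together.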
7028
7029static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7030 const SDLoc &dl) {
7031   assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7032 EVT SubVT = V1.getValueType();
7033 EVT SubSVT = SubVT.getScalarType();
7034 unsigned SubNumElts = SubVT.getVectorNumElements();
7035 unsigned SubVectorWidth = SubVT.getSizeInBits();
7036 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7037 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7038 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7039}
7040
7041/// Returns a vector of specified type with all bits set.
7042/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7043/// Then bitcast to their original type, ensuring they get CSE'd.
7044static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7045   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7046          "Expected a 128/256/512-bit vector type");
7047
7048 APInt Ones = APInt::getAllOnes(32);
7049 unsigned NumElts = VT.getSizeInBits() / 32;
7050 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7051 return DAG.getBitcast(VT, Vec);
7052}
7053
7054static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7055 SDValue In, SelectionDAG &DAG) {
7056 EVT InVT = In.getValueType();
7057   assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7058   assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7059           ISD::ZERO_EXTEND == Opcode) &&
7060          "Unknown extension opcode");
7061
7062 // For 256-bit vectors, we only need the lower (128-bit) input half.
7063 // For 512-bit vectors, we only need the lower input half or quarter.
7064 if (InVT.getSizeInBits() > 128) {
7065   assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7066          "Expected VTs to be the same size!");
7067 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7068 In = extractSubVector(In, 0, DAG, DL,
7069 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7070 InVT = In.getValueType();
7071 }
7072
7073 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7074 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7075
7076 return DAG.getNode(Opcode, DL, VT, In);
7077}
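// Worked example: zero-extending a 256-bit v32i8 input to v8i32 first
// extracts the low 128 bits (v16i8); since only 8 of those 16 elements feed
// the result, the opcode is switched to ISD::ZERO_EXTEND_VECTOR_INREG via
// getOpcode_EXTEND_VECTOR_INREG above.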
7078
7079// Match (xor X, -1) -> X.
7080// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7081// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7082static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7083 V = peekThroughBitcasts(V);
7084 if (V.getOpcode() == ISD::XOR &&
7085 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7086 isAllOnesConstant(V.getOperand(1))))
7087 return V.getOperand(0);
7088 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7089 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7090 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7091 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7092 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7093 Not, V.getOperand(1));
7094 }
7095 }
7096 SmallVector<SDValue, 2> CatOps;
7097 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7098 for (SDValue &CatOp : CatOps) {
7099 SDValue NotCat = IsNOT(CatOp, DAG);
7100 if (!NotCat) return SDValue();
7101 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7102 }
7103 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7104 }
7105 return SDValue();
7106}
7107
7108void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7109 bool Lo, bool Unary) {
7110 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7111        "Illegal vector type to unpack");
7112 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7113 int NumElts = VT.getVectorNumElements();
7114 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7115 for (int i = 0; i < NumElts; ++i) {
7116 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7117 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7118 Pos += (Unary ? 0 : NumElts * (i % 2));
7119 Pos += (Lo ? 0 : NumEltsInLane / 2);
7120 Mask.push_back(Pos);
7121 }
7122}
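// Worked example: for VT = v8i16, Lo = true, Unary = false this produces
// <0,8,1,9,2,10,3,11> (the PUNPCKLWD pattern); with Lo = false it produces
// <4,12,5,13,6,14,7,15> (PUNPCKHWD).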
7123
7124/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7125/// imposed by AVX and specific to the unary pattern. Example:
7126/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7127/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7128void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7129 bool Lo) {
7130 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7131 int NumElts = VT.getVectorNumElements();
7132 for (int i = 0; i < NumElts; ++i) {
7133 int Pos = i / 2;
7134 Pos += (Lo ? 0 : NumElts / 2);
7135 Mask.push_back(Pos);
7136 }
7137}
7138
7139// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7140static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7141 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7142 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7143 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7144 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7145 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7146 int M = Mask[I];
7147 if (M < 0)
7148 continue;
7149 SDValue V = (M < NumElts) ? V1 : V2;
7150 if (V.isUndef())
7151 continue;
7152 Ops[I] = V.getOperand(M % NumElts);
7153 }
7154 return DAG.getBuildVector(VT, dl, Ops);
7155 }
7156
7157 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7158}
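// Worked example: shuffling build_vector<1,2,3,4> and build_vector<5,6,7,8>
// with mask <0,5,2,7> constant-folds straight to build_vector<1,6,3,8>
// instead of emitting a VECTOR_SHUFFLE node.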
7159
7160/// Returns a vector_shuffle node for an unpackl operation.
7161static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7162 SDValue V1, SDValue V2) {
7163 SmallVector<int, 8> Mask;
7164 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7165 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7166}
7167
7168/// Returns a vector_shuffle node for an unpackh operation.
7169static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7170 SDValue V1, SDValue V2) {
7171 SmallVector<int, 8> Mask;
7172 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7173 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7174}
7175
7176/// Returns a node that packs the LHS + RHS nodes together at half width.
7177/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7178/// TODO: Add subvector splitting if/when we have a need for it.
7179static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7180 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7181 bool PackHiHalf = false) {
7182 MVT OpVT = LHS.getSimpleValueType();
7183 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7184 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7185 assert(OpVT == RHS.getSimpleValueType() &&
7186        VT.getSizeInBits() == OpVT.getSizeInBits() &&
7187        (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7188        "Unexpected PACK operand types");
7189 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7190        "Unexpected PACK result type");
7191
7192 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7193 if (EltSizeInBits == 32) {
7194 SmallVector<int> PackMask;
7195 int Offset = PackHiHalf ? 1 : 0;
7196 int NumElts = VT.getVectorNumElements();
7197 for (int I = 0; I != NumElts; I += 4) {
7198 PackMask.push_back(I + Offset);
7199 PackMask.push_back(I + Offset + 2);
7200 PackMask.push_back(I + Offset + NumElts);
7201 PackMask.push_back(I + Offset + NumElts + 2);
7202 }
7203 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7204 DAG.getBitcast(VT, RHS), PackMask);
7205 }
7206
7207 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7208 if (!PackHiHalf) {
7209 if (UsePackUS &&
7210 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7211 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7212 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7213
7214 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7215 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7216 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7217 }
7218
7219 // Fallback to sign/zero extending the requested half and pack.
7220 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7221 if (UsePackUS) {
7222 if (PackHiHalf) {
7223 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7224 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7225 } else {
7226 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7227 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7228 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7229 };
7230 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7231 };
7232
7233 if (!PackHiHalf) {
7234 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7235 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7236 }
7237 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7238 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7239 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7240}
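// Worked example of the EltSizeInBits == 32 path above: packing the low i32
// halves of two v4i64 operands into v8i32 (PackHiHalf = false) uses the
// shuffle mask <0,2,8,10,4,6,12,14>, i.e. the even i32 elements of LHS and
// RHS interleaved per 128-bit lane; PackHiHalf = true shifts every index up
// by one.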
7241
7242/// Return a vector_shuffle of the specified vector and a zero or undef vector.
7243/// This produces a shuffle where the low element of V2 is swizzled into the
7244/// zero/undef vector, landing at element Idx.
7245/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7246static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7247 bool IsZero,
7248 const X86Subtarget &Subtarget,
7249 SelectionDAG &DAG) {
7250 MVT VT = V2.getSimpleValueType();
7251 SDValue V1 = IsZero
7252 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7253 int NumElems = VT.getVectorNumElements();
7254 SmallVector<int, 16> MaskVec(NumElems);
7255 for (int i = 0; i != NumElems; ++i)
7256 // If this is the insertion idx, put the low elt of V2 here.
7257 MaskVec[i] = (i == Idx) ? NumElems : i;
7258 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7259}
7260
7261static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7262 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7263 Ptr.getOpcode() == X86ISD::WrapperRIP)
7264 Ptr = Ptr.getOperand(0);
7265
7266 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7267 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7268 return nullptr;
7269
7270 return CNode->getConstVal();
7271}
7272
7273static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7274 if (!Load || !ISD::isNormalLoad(Load))
7275 return nullptr;
7276 return getTargetConstantFromBasePtr(Load->getBasePtr());
7277}
7278
7279static const Constant *getTargetConstantFromNode(SDValue Op) {
7280 Op = peekThroughBitcasts(Op);
7281 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7282}
7283
7284const Constant *
7285X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7286 assert(LD && "Unexpected null LoadSDNode");
7287 return getTargetConstantFromNode(LD);
7288}
7289
7290// Extract raw constant bits from constant pools.
7291static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7292 APInt &UndefElts,
7293 SmallVectorImpl<APInt> &EltBits,
7294 bool AllowWholeUndefs = true,
7295 bool AllowPartialUndefs = true) {
7296 assert(EltBits.empty() && "Expected an empty EltBits vector");
7297
7298 Op = peekThroughBitcasts(Op);
7299
7300 EVT VT = Op.getValueType();
7301 unsigned SizeInBits = VT.getSizeInBits();
7302 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7303 unsigned NumElts = SizeInBits / EltSizeInBits;
7304
7305 // Bitcast a source array of element bits to the target size.
7306 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7307 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7308 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7309   assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7310          "Constant bit sizes don't match");
7311
7312 // Don't split if we don't allow undef bits.
7313 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7314 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7315 return false;
7316
7317 // If we're already the right size, don't bother bitcasting.
7318 if (NumSrcElts == NumElts) {
7319 UndefElts = UndefSrcElts;
7320 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7321 return true;
7322 }
7323
7324 // Extract all the undef/constant element data and pack into single bitsets.
7325 APInt UndefBits(SizeInBits, 0);
7326 APInt MaskBits(SizeInBits, 0);
7327
7328 for (unsigned i = 0; i != NumSrcElts; ++i) {
7329 unsigned BitOffset = i * SrcEltSizeInBits;
7330 if (UndefSrcElts[i])
7331 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7332 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7333 }
7334
7335 // Split the undef/constant single bitset data into the target elements.
7336 UndefElts = APInt(NumElts, 0);
7337 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7338
7339 for (unsigned i = 0; i != NumElts; ++i) {
7340 unsigned BitOffset = i * EltSizeInBits;
7341 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7342
7343 // Only treat an element as UNDEF if all bits are UNDEF.
7344 if (UndefEltBits.isAllOnes()) {
7345 if (!AllowWholeUndefs)
7346 return false;
7347 UndefElts.setBit(i);
7348 continue;
7349 }
7350
7351 // If only some bits are UNDEF then treat them as zero (or bail if not
7352 // supported).
7353 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7354 return false;
7355
7356 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7357 }
7358 return true;
7359 };
7360
7361 // Collect constant bits and insert into mask/undef bit masks.
7362 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7363 unsigned UndefBitIndex) {
7364 if (!Cst)
7365 return false;
7366 if (isa<UndefValue>(Cst)) {
7367 Undefs.setBit(UndefBitIndex);
7368 return true;
7369 }
7370 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7371 Mask = CInt->getValue();
7372 return true;
7373 }
7374 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7375 Mask = CFP->getValueAPF().bitcastToAPInt();
7376 return true;
7377 }
7378 return false;
7379 };
7380
7381 // Handle UNDEFs.
7382 if (Op.isUndef()) {
7383 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7384 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7385 return CastBitData(UndefSrcElts, SrcEltBits);
7386 }
7387
7388 // Extract scalar constant bits.
7389 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7390 APInt UndefSrcElts = APInt::getZero(1);
7391 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7392 return CastBitData(UndefSrcElts, SrcEltBits);
7393 }
7394 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7395 APInt UndefSrcElts = APInt::getZero(1);
7396 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7397 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7398 return CastBitData(UndefSrcElts, SrcEltBits);
7399 }
7400
7401 // Extract constant bits from build vector.
7402 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7403 BitVector Undefs;
7404 SmallVector<APInt> SrcEltBits;
7405 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7406 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7407 APInt UndefSrcElts = APInt::getNullValue(SrcEltBits.size());
7408 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7409 if (Undefs[I])
7410 UndefSrcElts.setBit(I);
7411 return CastBitData(UndefSrcElts, SrcEltBits);
7412 }
7413 }
7414
7415 // Extract constant bits from constant pool vector.
7416 if (auto *Cst = getTargetConstantFromNode(Op)) {
7417 Type *CstTy = Cst->getType();
7418 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7419 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7420 return false;
7421
7422 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7423 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7424
7425 APInt UndefSrcElts(NumSrcElts, 0);
7426 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7427 for (unsigned i = 0; i != NumSrcElts; ++i)
7428 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7429 UndefSrcElts, i))
7430 return false;
7431
7432 return CastBitData(UndefSrcElts, SrcEltBits);
7433 }
7434
7435 // Extract constant bits from a broadcasted constant pool scalar.
7436 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7437 EltSizeInBits <= VT.getScalarSizeInBits()) {
7438 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7439 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7440 return false;
7441
7442 SDValue Ptr = MemIntr->getBasePtr();
7443 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7444 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7445 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7446
7447 APInt UndefSrcElts(NumSrcElts, 0);
7448 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7449 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7450 if (UndefSrcElts[0])
7451 UndefSrcElts.setBits(0, NumSrcElts);
7452 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7453 return CastBitData(UndefSrcElts, SrcEltBits);
7454 }
7455 }
7456 }
7457
7458 // Extract constant bits from a subvector broadcast.
7459 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7460 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7461 SDValue Ptr = MemIntr->getBasePtr();
7462 // The source constant may be larger than the subvector broadcast, so make
7463 // sure we extract the correct subvector constants.
7464 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7465 Type *CstTy = Cst->getType();
7466 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7467 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7468 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7469 (SizeInBits % SubVecSizeInBits) != 0)
7470 return false;
7471 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7472 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7473 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7474 APInt UndefSubElts(NumSubElts, 0);
7475 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7476 APInt(CstEltSizeInBits, 0));
7477 for (unsigned i = 0; i != NumSubElts; ++i) {
7478 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7479 UndefSubElts, i))
7480 return false;
7481 for (unsigned j = 1; j != NumSubVecs; ++j)
7482 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7483 }
7484 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7485 UndefSubElts);
7486 return CastBitData(UndefSubElts, SubEltBits);
7487 }
7488 }
7489
7490 // Extract a rematerialized scalar constant insertion.
7491 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7492 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7493 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7494 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7495 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7496
7497 APInt UndefSrcElts(NumSrcElts, 0);
7498 SmallVector<APInt, 64> SrcEltBits;
7499 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7500 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7501 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7502 return CastBitData(UndefSrcElts, SrcEltBits);
7503 }
7504
7505 // Insert constant bits from a base and sub vector sources.
7506 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7507 // If this bitcasts to larger elements we might lose track of undefs, so to
7508 // be safe don't allow any.
7509 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7510 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7511
7512 APInt UndefSrcElts, UndefSubElts;
7513 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7514 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7515 UndefSubElts, EltSubBits,
7516 AllowWholeUndefs && AllowUndefs,
7517 AllowPartialUndefs && AllowUndefs) &&
7518 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7519 UndefSrcElts, EltSrcBits,
7520 AllowWholeUndefs && AllowUndefs,
7521 AllowPartialUndefs && AllowUndefs)) {
7522 unsigned BaseIdx = Op.getConstantOperandVal(2);
7523 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7524 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7525 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7526 return CastBitData(UndefSrcElts, EltSrcBits);
7527 }
7528 }
7529
7530 // Extract constant bits from a subvector's source.
7531 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7532 // TODO - support extract_subvector through bitcasts.
7533 if (EltSizeInBits != VT.getScalarSizeInBits())
7534 return false;
7535
7536 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7537 UndefElts, EltBits, AllowWholeUndefs,
7538 AllowPartialUndefs)) {
7539 EVT SrcVT = Op.getOperand(0).getValueType();
7540 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7541 unsigned NumSubElts = VT.getVectorNumElements();
7542 unsigned BaseIdx = Op.getConstantOperandVal(1);
7543 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7544 if ((BaseIdx + NumSubElts) != NumSrcElts)
7545 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7546 if (BaseIdx != 0)
7547 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7548 return true;
7549 }
7550 }
7551
7552 // Extract constant bits from shuffle node sources.
7553 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7554 // TODO - support shuffle through bitcasts.
7555 if (EltSizeInBits != VT.getScalarSizeInBits())
7556 return false;
7557
7558 ArrayRef<int> Mask = SVN->getMask();
7559 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7560 llvm::any_of(Mask, [](int M) { return M < 0; }))
7561 return false;
7562
7563 APInt UndefElts0, UndefElts1;
7564 SmallVector<APInt, 32> EltBits0, EltBits1;
7565 if (isAnyInRange(Mask, 0, NumElts) &&
7566 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7567 UndefElts0, EltBits0, AllowWholeUndefs,
7568 AllowPartialUndefs))
7569 return false;
7570 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7571 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7572 UndefElts1, EltBits1, AllowWholeUndefs,
7573 AllowPartialUndefs))
7574 return false;
7575
7576 UndefElts = APInt::getZero(NumElts);
7577 for (int i = 0; i != (int)NumElts; ++i) {
7578 int M = Mask[i];
7579 if (M < 0) {
7580 UndefElts.setBit(i);
7581 EltBits.push_back(APInt::getZero(EltSizeInBits));
7582 } else if (M < (int)NumElts) {
7583 if (UndefElts0[M])
7584 UndefElts.setBit(i);
7585 EltBits.push_back(EltBits0[M]);
7586 } else {
7587 if (UndefElts1[M - NumElts])
7588 UndefElts.setBit(i);
7589 EltBits.push_back(EltBits1[M - NumElts]);
7590 }
7591 }
7592 return true;
7593 }
7594
7595 return false;
7596}
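// Worked example: querying a v4i32 constant for 64-bit elements repacks the
// four i32 values into two i64 results, while querying it for 8-bit elements
// splits each i32 into four bytes. A target element that is only partially
// undef is treated as zero, unless AllowPartialUndefs is false, in which case
// the whole query fails.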
7597
7598namespace llvm {
7599namespace X86 {
7600bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7601 APInt UndefElts;
7602 SmallVector<APInt, 16> EltBits;
7603 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7604 UndefElts, EltBits, true,
7605 AllowPartialUndefs)) {
7606 int SplatIndex = -1;
7607 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7608 if (UndefElts[i])
7609 continue;
7610 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7611 SplatIndex = -1;
7612 break;
7613 }
7614 SplatIndex = i;
7615 }
7616 if (0 <= SplatIndex) {
7617 SplatVal = EltBits[SplatIndex];
7618 return true;
7619 }
7620 }
7621
7622 return false;
7623}
7624} // namespace X86
7625} // namespace llvm
7626
7627static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7628 unsigned MaskEltSizeInBits,
7629 SmallVectorImpl<uint64_t> &RawMask,
7630 APInt &UndefElts) {
7631 // Extract the raw target constant bits.
7632 SmallVector<APInt, 64> EltBits;
7633 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7634 EltBits, /* AllowWholeUndefs */ true,
7635 /* AllowPartialUndefs */ false))
7636 return false;
7637
7638 // Insert the extracted elements into the mask.
7639 for (const APInt &Elt : EltBits)
7640 RawMask.push_back(Elt.getZExtValue());
7641
7642 return true;
7643}
7644
7645/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7646/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7647/// Note: This ignores saturation, so inputs must be checked first.
7648static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7649 bool Unary, unsigned NumStages = 1) {
7650 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7651 unsigned NumElts = VT.getVectorNumElements();
7652 unsigned NumLanes = VT.getSizeInBits() / 128;
7653 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7654 unsigned Offset = Unary ? 0 : NumElts;
7655 unsigned Repetitions = 1u << (NumStages - 1);
7656 unsigned Increment = 1u << NumStages;
7657 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7658
7659 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7660 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7661 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7662 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7663 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7664 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7665 }
7666 }
7667}
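// Worked example: for VT = v16i8 with Unary = false and NumStages = 1 this
// yields <0,2,4,...,14,16,18,...,30> - the low byte of every i16 element of
// both inputs, the PACKSSWB/PACKUSWB pattern with saturation ignored.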
7668
7669// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7670static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7671 APInt &DemandedLHS, APInt &DemandedRHS) {
7672 int NumLanes = VT.getSizeInBits() / 128;
7673 int NumElts = DemandedElts.getBitWidth();
7674 int NumInnerElts = NumElts / 2;
7675 int NumEltsPerLane = NumElts / NumLanes;
7676 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7677
7678 DemandedLHS = APInt::getZero(NumInnerElts);
7679 DemandedRHS = APInt::getZero(NumInnerElts);
7680
7681 // Map DemandedElts to the packed operands.
7682 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7683 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7684 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7685 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7686 if (DemandedElts[OuterIdx])
7687 DemandedLHS.setBit(InnerIdx);
7688 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7689 DemandedRHS.setBit(InnerIdx);
7690 }
7691 }
7692}
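// Worked example: for a v16i8 PACK node, result elements 0-7 come from
// elements 0-7 of the LHS and result elements 8-15 from elements 0-7 of the
// RHS, so demanding only result element 10 sets bit 2 of DemandedRHS.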
7693
7694// Split the demanded elts of a HADD/HSUB node between its operands.
7695static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7696 APInt &DemandedLHS, APInt &DemandedRHS) {
7697 int NumLanes = VT.getSizeInBits() / 128;
7698 int NumElts = DemandedElts.getBitWidth();
7699 int NumEltsPerLane = NumElts / NumLanes;
7700 int HalfEltsPerLane = NumEltsPerLane / 2;
7701
7702 DemandedLHS = APInt::getZero(NumElts);
7703 DemandedRHS = APInt::getZero(NumElts);
7704
7705 // Map DemandedElts to the horizontal operands.
7706 for (int Idx = 0; Idx != NumElts; ++Idx) {
7707 if (!DemandedElts[Idx])
7708 continue;
7709 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7710 int LocalIdx = Idx % NumEltsPerLane;
7711 if (LocalIdx < HalfEltsPerLane) {
7712 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7713 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7714 } else {
7715 LocalIdx -= HalfEltsPerLane;
7716 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7717 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7718 }
7719 }
7720}
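// Worked example: for a v8i32 HADD, result element 1 (lower half of lane 0)
// depends on LHS elements 2-3, while result element 6 (upper half of lane 1)
// depends on RHS elements 4-5.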
7721
7722/// Calculates the shuffle mask corresponding to the target-specific opcode.
7723/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7724/// operands in \p Ops, and returns true.
7725/// Sets \p IsUnary to true if only one source is used. Note that this will set
7726/// IsUnary for shuffles which use a single input multiple times, and in those
7727/// cases it will adjust the mask to only have indices within that single input.
7728/// It is an error to call this with non-empty Mask/Ops vectors.
7729static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7730 SmallVectorImpl<SDValue> &Ops,
7731 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7732 unsigned NumElems = VT.getVectorNumElements();
7733 unsigned MaskEltSize = VT.getScalarSizeInBits();
7734 SmallVector<uint64_t, 32> RawMask;
7735 APInt RawUndefs;
7736 uint64_t ImmN;
7737
7738 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7739 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7740
7741 IsUnary = false;
7742 bool IsFakeUnary = false;
7743 switch (N->getOpcode()) {
7744 case X86ISD::BLENDI:
7745   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7746   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7747 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7748 DecodeBLENDMask(NumElems, ImmN, Mask);
7749 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7750 break;
7751 case X86ISD::SHUFP:
7752   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7753   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7754 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7755 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7756 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7757 break;
7758 case X86ISD::INSERTPS:
7759   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7760   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7761 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7762 DecodeINSERTPSMask(ImmN, Mask);
7763 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7764 break;
7765 case X86ISD::EXTRQI:
7766   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7767 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7768 isa<ConstantSDNode>(N->getOperand(2))) {
7769 int BitLen = N->getConstantOperandVal(1);
7770 int BitIdx = N->getConstantOperandVal(2);
7771 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7772 IsUnary = true;
7773 }
7774 break;
7775 case X86ISD::INSERTQI:
7776   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7777   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7778 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7779 isa<ConstantSDNode>(N->getOperand(3))) {
7780 int BitLen = N->getConstantOperandVal(2);
7781 int BitIdx = N->getConstantOperandVal(3);
7782 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7783 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7784 }
7785 break;
7786 case X86ISD::UNPCKH:
7787   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7788   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7789 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7790 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7791 break;
7792 case X86ISD::UNPCKL:
7793   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7794   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7795 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7796 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7797 break;
7798 case X86ISD::MOVHLPS:
7799   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7801 DecodeMOVHLPSMask(NumElems, Mask);
7802 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7803 break;
7804 case X86ISD::MOVLHPS:
7805   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7806   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7807 DecodeMOVLHPSMask(NumElems, Mask);
7808 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7809 break;
7810 case X86ISD::VALIGN:
7811   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7812          "Only 32-bit and 64-bit elements are supported!");
7813   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7814   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7815 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7816 DecodeVALIGNMask(NumElems, ImmN, Mask);
7817 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7818 Ops.push_back(N->getOperand(1));
7819 Ops.push_back(N->getOperand(0));
7820 break;
7821 case X86ISD::PALIGNR:
7822   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7823   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7824   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7825 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7826 DecodePALIGNRMask(NumElems, ImmN, Mask);
7827 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7828 Ops.push_back(N->getOperand(1));
7829 Ops.push_back(N->getOperand(0));
7830 break;
7831 case X86ISD::VSHLDQ:
7832   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7833   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7834 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7835 DecodePSLLDQMask(NumElems, ImmN, Mask);
7836 IsUnary = true;
7837 break;
7838 case X86ISD::VSRLDQ:
7839   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7840   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7841 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7842 DecodePSRLDQMask(NumElems, ImmN, Mask);
7843 IsUnary = true;
7844 break;
7845 case X86ISD::PSHUFD:
7846 case X86ISD::VPERMILPI:
7847   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7848 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7849 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7850 IsUnary = true;
7851 break;
7852 case X86ISD::PSHUFHW:
7853   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7854 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7855 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7856 IsUnary = true;
7857 break;
7858 case X86ISD::PSHUFLW:
7859   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7860 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7861 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7862 IsUnary = true;
7863 break;
7864 case X86ISD::VZEXT_MOVL:
7865   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7866 DecodeZeroMoveLowMask(NumElems, Mask);
7867 IsUnary = true;
7868 break;
7869 case X86ISD::VBROADCAST:
7870   // We only decode broadcasts of same-sized vectors; peeking through to
7871   // extracted subvectors is likely to cause hasOneUse issues with
7872 // SimplifyDemandedBits etc.
7873 if (N->getOperand(0).getValueType() == VT) {
7874 DecodeVectorBroadcast(NumElems, Mask);
7875 IsUnary = true;
7876 break;
7877 }
7878 return false;
7879 case X86ISD::VPERMILPV: {
7880   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7881 IsUnary = true;
7882 SDValue MaskNode = N->getOperand(1);
7883 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7884 RawUndefs)) {
7885 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7886 break;
7887 }
7888 return false;
7889 }
7890 case X86ISD::PSHUFB: {
7891   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7892   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7893   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7894 IsUnary = true;
7895 SDValue MaskNode = N->getOperand(1);
7896 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7897 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7898 break;
7899 }
7900 return false;
7901 }
7902 case X86ISD::VPERMI:
7903   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7904 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7905 DecodeVPERMMask(NumElems, ImmN, Mask);
7906 IsUnary = true;
7907 break;
7908 case X86ISD::MOVSS:
7909 case X86ISD::MOVSD:
7910 case X86ISD::MOVSH:
7911   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7912   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7913 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7914 break;
7915 case X86ISD::VPERM2X128:
7916   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7917   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7918 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7919 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7920 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7921 break;
7922 case X86ISD::SHUF128:
7923   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7924   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7925 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7926 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7927 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7928 break;
7929 case X86ISD::MOVSLDUP:
7930   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7931 DecodeMOVSLDUPMask(NumElems, Mask);
7932 IsUnary = true;
7933 break;
7934 case X86ISD::MOVSHDUP:
7935   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7936 DecodeMOVSHDUPMask(NumElems, Mask);
7937 IsUnary = true;
7938 break;
7939 case X86ISD::MOVDDUP:
7940   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7941 DecodeMOVDDUPMask(NumElems, Mask);
7942 IsUnary = true;
7943 break;
7944 case X86ISD::VPERMIL2: {
7945   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7946   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7947 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7948 SDValue MaskNode = N->getOperand(2);
7949 SDValue CtrlNode = N->getOperand(3);
7950 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7951 unsigned CtrlImm = CtrlOp->getZExtValue();
7952 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7953 RawUndefs)) {
7954 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7955 Mask);
7956 break;
7957 }
7958 }
7959 return false;
7960 }
7961 case X86ISD::VPPERM: {
7962 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7963 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7964 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7965 SDValue MaskNode = N->getOperand(2);
7966 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7967 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7968 break;
7969 }
7970 return false;
7971 }
7972 case X86ISD::VPERMV: {
7973 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7974 IsUnary = true;
7975 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7976 Ops.push_back(N->getOperand(1));
7977 SDValue MaskNode = N->getOperand(0);
7978 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7979 RawUndefs)) {
7980 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7981 break;
7982 }
7983 return false;
7984 }
7985 case X86ISD::VPERMV3: {
7986 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7987 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7988 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7989 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7990 Ops.push_back(N->getOperand(0));
7991 Ops.push_back(N->getOperand(2));
7992 SDValue MaskNode = N->getOperand(1);
7993 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7994 RawUndefs)) {
7995 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7996 break;
7997 }
7998 return false;
7999 }
8000 default: llvm_unreachable("unknown target shuffle node");
8001 }
8002
8003 // Empty mask indicates the decode failed.
8004 if (Mask.empty())
8005 return false;
8006
8007 // Check if we're getting a shuffle mask with zero'd elements.
8008 if (!AllowSentinelZero && isAnyZero(Mask))
8009 return false;
8010
8011 // If we have a fake unary shuffle, the shuffle mask is spread across two
8012 // inputs that are actually the same node. Re-map the mask to always point
8013 // into the first input.
8014 if (IsFakeUnary)
8015 for (int &M : Mask)
8016 if (M >= (int)Mask.size())
8017 M -= Mask.size();
8018
8019 // If we didn't already add operands in the opcode-specific code, default to
8020 // adding 1 or 2 operands starting at 0.
8021 if (Ops.empty()) {
8022 Ops.push_back(N->getOperand(0));
8023 if (!IsUnary || IsFakeUnary)
8024 Ops.push_back(N->getOperand(1));
8025 }
8026
8027 return true;
8028}
8029
8030// Wrapper for getTargetShuffleMask that ignores the IsUnary result.
8031static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8032 SmallVectorImpl<SDValue> &Ops,
8033 SmallVectorImpl<int> &Mask) {
8034 bool IsUnary;
8035 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8036}
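// A standalone sketch of the fake-unary remapping performed in
// getTargetShuffleMask above: when both shuffle inputs are the same node,
// mask indices that point at the "second" input (>= Mask.size()) are folded
// back into the first. Plain std::vector stands in for LLVM's SmallVector and
// the mask values are made up for illustration.
#include <cassert>
#include <vector>

static void remapFakeUnaryMask(std::vector<int> &Mask) {
  int Size = static_cast<int>(Mask.size());
  for (int &M : Mask)
    if (M >= Size)
      M -= Size;
}

int main() {
  // A 4-element shuffle of (V, V) with mask <0, 5, 2, 7> becomes <0, 1, 2, 3>.
  std::vector<int> Mask = {0, 5, 2, 7};
  remapFakeUnaryMask(Mask);
  assert((Mask == std::vector<int>{0, 1, 2, 3}));
  return 0;
}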
8037
8038/// Compute whether each element of a shuffle is zeroable.
8039///
8040/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8041/// Either it is an undef element in the shuffle mask, the element of the input
8042/// referenced is undef, or the element of the input referenced is known to be
8043/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8044/// as many lanes with this technique as possible to simplify the remaining
8045/// shuffle.
8046static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8047 SDValue V1, SDValue V2,
8048 APInt &KnownUndef, APInt &KnownZero) {
8049 int Size = Mask.size();
8050 KnownUndef = KnownZero = APInt::getZero(Size);
8051
8052 V1 = peekThroughBitcasts(V1);
8053 V2 = peekThroughBitcasts(V2);
8054
8055 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8056 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8057
8058 int VectorSizeInBits = V1.getValueSizeInBits();
8059 int ScalarSizeInBits = VectorSizeInBits / Size;
8060 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8061
8062 for (int i = 0; i < Size; ++i) {
8063 int M = Mask[i];
8064 // Handle the easy cases.
8065 if (M < 0) {
8066 KnownUndef.setBit(i);
8067 continue;
8068 }
8069 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8070 KnownZero.setBit(i);
8071 continue;
8072 }
8073
8074 // Determine shuffle input and normalize the mask.
8075 SDValue V = M < Size ? V1 : V2;
8076 M %= Size;
8077
8078 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8079 if (V.getOpcode() != ISD::BUILD_VECTOR)
8080 continue;
8081
8082 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8083 // the (larger) source element must be UNDEF/ZERO.
8084 if ((Size % V.getNumOperands()) == 0) {
8085 int Scale = Size / V->getNumOperands();
8086 SDValue Op = V.getOperand(M / Scale);
8087 if (Op.isUndef())
8088 KnownUndef.setBit(i);
8089 if (X86::isZeroNode(Op))
8090 KnownZero.setBit(i);
8091 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8092 APInt Val = Cst->getAPIntValue();
8093 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8094 if (Val == 0)
8095 KnownZero.setBit(i);
8096 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8097 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8098 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8099 if (Val == 0)
8100 KnownZero.setBit(i);
8101 }
8102 continue;
8103 }
8104
8105 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8106 // elements must be UNDEF or ZERO.
8107 if ((V.getNumOperands() % Size) == 0) {
8108 int Scale = V->getNumOperands() / Size;
8109 bool AllUndef = true;
8110 bool AllZero = true;
8111 for (int j = 0; j < Scale; ++j) {
8112 SDValue Op = V.getOperand((M * Scale) + j);
8113 AllUndef &= Op.isUndef();
8114 AllZero &= X86::isZeroNode(Op);
8115 }
8116 if (AllUndef)
8117 KnownUndef.setBit(i);
8118 if (AllZero)
8119 KnownZero.setBit(i);
8120 continue;
8121 }
8122 }
8123}
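// A standalone sketch of the simplest case handled by
// computeZeroableShuffleElements above: both inputs are constant vectors with
// the same element count as the mask. std::optional models undef elements;
// the bitcast and element-scaling paths of the real code are omitted, and the
// values below are made up for illustration.
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

int main() {
  using Elt = std::optional<uint64_t>;
  std::vector<Elt> V1 = {7, 0, std::nullopt, 3}; // element 1 zero, element 2 undef
  std::vector<Elt> V2 = {0, 0, 0, 0};            // all-zero vector
  std::vector<int> Mask = {1, 2, 5, -1};         // -1 == undef mask element

  int Size = static_cast<int>(Mask.size());
  std::vector<bool> KnownUndef(Size), KnownZero(Size);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) { KnownUndef[i] = true; continue; } // undef mask element
    const Elt &E = (M < Size) ? V1[M] : V2[M - Size];
    if (!E)
      KnownUndef[i] = true;      // referenced input element is undef
    else if (*E == 0)
      KnownZero[i] = true;       // referenced input element is known zero
  }
  assert((KnownZero == std::vector<bool>{true, false, true, false}));
  assert((KnownUndef == std::vector<bool>{false, true, false, true}));
  return 0;
}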
8124
8125/// Decode a target shuffle mask and inputs and see if any values are
8126/// known to be undef or zero from their inputs.
8127/// Returns true if the target shuffle mask was decoded.
8128/// FIXME: Merge this with computeZeroableShuffleElements?
8129static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8130 SmallVectorImpl<SDValue> &Ops,
8131 APInt &KnownUndef, APInt &KnownZero) {
8132 bool IsUnary;
8133 if (!isTargetShuffle(N.getOpcode()))
8134 return false;
8135
8136 MVT VT = N.getSimpleValueType();
8137 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8138 return false;
8139
8140 int Size = Mask.size();
8141 SDValue V1 = Ops[0];
8142 SDValue V2 = IsUnary ? V1 : Ops[1];
8143 KnownUndef = KnownZero = APInt::getZero(Size);
8144
8145 V1 = peekThroughBitcasts(V1);
8146 V2 = peekThroughBitcasts(V2);
8147
8148 assert((VT.getSizeInBits() % Size) == 0 &&
8149        "Illegal split of shuffle value type");
8150 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8151
8152 // Extract known constant input data.
8153 APInt UndefSrcElts[2];
8154 SmallVector<APInt, 32> SrcEltBits[2];
8155 bool IsSrcConstant[2] = {
8156 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8157 SrcEltBits[0], true, false),
8158 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8159 SrcEltBits[1], true, false)};
8160
8161 for (int i = 0; i < Size; ++i) {
8162 int M = Mask[i];
8163
8164 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8165 if (M < 0) {
8166 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8167 if (SM_SentinelUndef == M)
8168 KnownUndef.setBit(i);
8169 if (SM_SentinelZero == M)
8170 KnownZero.setBit(i);
8171 continue;
8172 }
8173
8174 // Determine shuffle input and normalize the mask.
8175 unsigned SrcIdx = M / Size;
8176 SDValue V = M < Size ? V1 : V2;
8177 M %= Size;
8178
8179 // We are referencing an UNDEF input.
8180 if (V.isUndef()) {
8181 KnownUndef.setBit(i);
8182 continue;
8183 }
8184
8185 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8186 // TODO: We currently only set UNDEF for integer types - floats use the same
8187 // registers as vectors and many of the scalar folded loads rely on the
8188 // SCALAR_TO_VECTOR pattern.
8189 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8190 (Size % V.getValueType().getVectorNumElements()) == 0) {
8191 int Scale = Size / V.getValueType().getVectorNumElements();
8192 int Idx = M / Scale;
8193 if (Idx != 0 && !VT.isFloatingPoint())
8194 KnownUndef.setBit(i);
8195 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8196 KnownZero.setBit(i);
8197 continue;
8198 }
8199
8200 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8201 // base vectors.
8202 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8203 SDValue Vec = V.getOperand(0);
8204 int NumVecElts = Vec.getValueType().getVectorNumElements();
8205 if (Vec.isUndef() && Size == NumVecElts) {
8206 int Idx = V.getConstantOperandVal(2);
8207 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8208 if (M < Idx || (Idx + NumSubElts) <= M)
8209 KnownUndef.setBit(i);
8210 }
8211 continue;
8212 }
8213
8214 // Attempt to extract from the source's constant bits.
8215 if (IsSrcConstant[SrcIdx]) {
8216 if (UndefSrcElts[SrcIdx][M])
8217 KnownUndef.setBit(i);
8218 else if (SrcEltBits[SrcIdx][M] == 0)
8219 KnownZero.setBit(i);
8220 }
8221 }
8222
8223 assert(VT.getVectorNumElements() == (unsigned)Size &&
8224        "Different mask size from vector size!");
8225 return true;
8226}
8227
8228// Replace target shuffle mask elements with known undef/zero sentinels.
8229static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8230 const APInt &KnownUndef,
8231 const APInt &KnownZero,
8232 bool ResolveKnownZeros= true) {
8233 unsigned NumElts = Mask.size();
8234 assert(KnownUndef.getBitWidth() == NumElts &&
8235        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8236
8237 for (unsigned i = 0; i != NumElts; ++i) {
8238 if (KnownUndef[i])
8239 Mask[i] = SM_SentinelUndef;
8240 else if (ResolveKnownZeros && KnownZero[i])
8241 Mask[i] = SM_SentinelZero;
8242 }
8243}
8244
8245// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8246static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8247 APInt &KnownUndef,
8248 APInt &KnownZero) {
8249 unsigned NumElts = Mask.size();
8250 KnownUndef = KnownZero = APInt::getZero(NumElts);
8251
8252 for (unsigned i = 0; i != NumElts; ++i) {
8253 int M = Mask[i];
8254 if (SM_SentinelUndef == M)
8255 KnownUndef.setBit(i);
8256 if (SM_SentinelZero == M)
8257 KnownZero.setBit(i);
8258 }
8259}
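// The two helpers above are inverses of each other: sentinel values inside the
// mask and the KnownUndef/KnownZero bit masks carry the same information. A
// standalone round-trip sketch; the sentinel constants are assumed to mirror
// SM_SentinelUndef == -1 and SM_SentinelZero == -2 from the X86 shuffle
// decode helpers.
#include <cassert>
#include <vector>

constexpr int SentinelUndef = -1; // assumed value of SM_SentinelUndef
constexpr int SentinelZero = -2;  // assumed value of SM_SentinelZero

int main() {
  std::vector<int> Mask = {0, SentinelUndef, 2, SentinelZero};

  // resolveZeroablesFromTargetShuffle: mask sentinels -> bit masks.
  std::vector<bool> Undef(Mask.size()), Zero(Mask.size());
  for (size_t i = 0; i != Mask.size(); ++i) {
    Undef[i] = (Mask[i] == SentinelUndef);
    Zero[i] = (Mask[i] == SentinelZero);
  }

  // resolveTargetShuffleFromZeroables: bit masks -> mask sentinels.
  std::vector<int> Rebuilt = {0, 1, 2, 3};
  for (size_t i = 0; i != Rebuilt.size(); ++i) {
    if (Undef[i])
      Rebuilt[i] = SentinelUndef;
    else if (Zero[i])
      Rebuilt[i] = SentinelZero;
  }
  assert(Rebuilt == Mask);
  return 0;
}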
8260
8261// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8262static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8263 SDValue Cond, bool IsBLENDV = false) {
8264 EVT CondVT = Cond.getValueType();
8265 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8266 unsigned NumElts = CondVT.getVectorNumElements();
8267
8268 APInt UndefElts;
8269 SmallVector<APInt, 32> EltBits;
8270 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8271 true, false))
8272 return false;
8273
8274 Mask.resize(NumElts, SM_SentinelUndef);
8275
8276 for (int i = 0; i != (int)NumElts; ++i) {
8277 Mask[i] = i;
8278 // Arbitrarily choose from the 2nd operand if the select condition element
8279 // is undef.
8280 // TODO: Can we do better by matching patterns such as even/odd?
8281 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8282 (IsBLENDV && EltBits[i].isNonNegative()))
8283 Mask[i] += NumElts;
8284 }
8285
8286 return true;
8287}
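// Worked example for createShuffleMaskFromVSELECT above (non-BLENDV case): a
// constant v4i32 condition <-1, 0, -1, 0> turns the select into the blend mask
// <0, 5, 2, 7> -- true lanes read the first value operand (indices 0..3) and
// zero-condition lanes read the second (indices 4..7). A standalone model with
// made-up values:
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint32_t> Cond = {0xFFFFFFFFu, 0u, 0xFFFFFFFFu, 0u};
  int NumElts = static_cast<int>(Cond.size());
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = (Cond[i] != 0) ? i : i + NumElts; // zero condition -> 2nd operand
  assert((Mask == std::vector<int>{0, 5, 2, 7}));
  return 0;
}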
8288
8289// Forward declaration (for getFauxShuffleMask recursive check).
8290static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8291 SmallVectorImpl<SDValue> &Inputs,
8292 SmallVectorImpl<int> &Mask,
8293 const SelectionDAG &DAG, unsigned Depth,
8294 bool ResolveKnownElts);
8295
8296// Attempt to decode ops that could be represented as a shuffle mask.
8297// The decoded shuffle mask may contain a different number of elements than the
8298// destination value type has.
8299// TODO: Merge into getTargetShuffleInputs()
8300static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8301 SmallVectorImpl<int> &Mask,
8302 SmallVectorImpl<SDValue> &Ops,
8303 const SelectionDAG &DAG, unsigned Depth,
8304 bool ResolveKnownElts) {
8305 Mask.clear();
8306 Ops.clear();
8307
8308 MVT VT = N.getSimpleValueType();
8309 unsigned NumElts = VT.getVectorNumElements();
8310 unsigned NumSizeInBits = VT.getSizeInBits();
8311 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8312 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8313 return false;
8314 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8315 unsigned NumSizeInBytes = NumSizeInBits / 8;
8316 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8317
8318 unsigned Opcode = N.getOpcode();
8319 switch (Opcode) {
8320 case ISD::VECTOR_SHUFFLE: {
8321 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8322 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8323 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8324 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8325 Ops.push_back(N.getOperand(0));
8326 Ops.push_back(N.getOperand(1));
8327 return true;
8328 }
8329 return false;
8330 }
8331 case ISD::AND:
8332 case X86ISD::ANDNP: {
8333 // Attempt to decode as a per-byte mask.
8334 APInt UndefElts;
8335 SmallVector<APInt, 32> EltBits;
8336 SDValue N0 = N.getOperand(0);
8337 SDValue N1 = N.getOperand(1);
8338 bool IsAndN = (X86ISD::ANDNP == Opcode);
8339 uint64_t ZeroMask = IsAndN ? 255 : 0;
8340 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8341 return false;
8342 // We can't assume an undef src element gives an undef dst - the other src
8343 // might be zero.
8344 if (!UndefElts.isZero())
8345 return false;
8346 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8347 const APInt &ByteBits = EltBits[i];
8348 if (ByteBits != 0 && ByteBits != 255)
8349 return false;
8350 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8351 }
8352 Ops.push_back(IsAndN ? N1 : N0);
8353 return true;
8354 }
8355 case ISD::OR: {
8356 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8357 // is a valid shuffle index.
8358 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8359 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8360 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8361 return false;
8362
8363 SmallVector<int, 64> SrcMask0, SrcMask1;
8364 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8365 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8366 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8367 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8368 Depth + 1, true) ||
8369 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8370 Depth + 1, true))
8371 return false;
8372
8373 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8374 SmallVector<int, 64> Mask0, Mask1;
8375 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8376 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8377 for (int i = 0; i != (int)MaskSize; ++i) {
8378 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8379 // loops converting between OR and BLEND shuffles due to
8380 // canWidenShuffleElements merging away undef elements, meaning we
8381 // fail to recognise the OR as the undef element isn't known zero.
8382 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8383 Mask.push_back(SM_SentinelZero);
8384 else if (Mask1[i] == SM_SentinelZero)
8385 Mask.push_back(i);
8386 else if (Mask0[i] == SM_SentinelZero)
8387 Mask.push_back(i + MaskSize);
8388 else
8389 return false;
8390 }
8391 Ops.push_back(N0);
8392 Ops.push_back(N1);
8393 return true;
8394 }
8395 case ISD::INSERT_SUBVECTOR: {
8396 SDValue Src = N.getOperand(0);
8397 SDValue Sub = N.getOperand(1);
8398 EVT SubVT = Sub.getValueType();
8399 unsigned NumSubElts = SubVT.getVectorNumElements();
8400 if (!N->isOnlyUserOf(Sub.getNode()))
8401 return false;
8402 uint64_t InsertIdx = N.getConstantOperandVal(2);
8403 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8404 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8405 Sub.getOperand(0).getValueType() == VT) {
8406 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8407 for (int i = 0; i != (int)NumElts; ++i)
8408 Mask.push_back(i);
8409 for (int i = 0; i != (int)NumSubElts; ++i)
8410 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8411 Ops.push_back(Src);
8412 Ops.push_back(Sub.getOperand(0));
8413 return true;
8414 }
8415 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8416 SmallVector<int, 64> SubMask;
8417 SmallVector<SDValue, 2> SubInputs;
8418 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8419 EVT SubSrcVT = SubSrc.getValueType();
8420 if (!SubSrcVT.isVector())
8421 return false;
8422
8423 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8424 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8425 Depth + 1, ResolveKnownElts))
8426 return false;
8427
8428 // Subvector shuffle inputs must not be larger than the subvector.
8429 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8430 return SubVT.getFixedSizeInBits() <
8431 SubInput.getValueSizeInBits().getFixedValue();
8432 }))
8433 return false;
8434
8435 if (SubMask.size() != NumSubElts) {
8436 assert(((SubMask.size() % NumSubElts) == 0 ||
8437        (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8438 if ((NumSubElts % SubMask.size()) == 0) {
8439 int Scale = NumSubElts / SubMask.size();
8440 SmallVector<int,64> ScaledSubMask;
8441 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8442 SubMask = ScaledSubMask;
8443 } else {
8444 int Scale = SubMask.size() / NumSubElts;
8445 NumSubElts = SubMask.size();
8446 NumElts *= Scale;
8447 InsertIdx *= Scale;
8448 }
8449 }
8450 Ops.push_back(Src);
8451 Ops.append(SubInputs.begin(), SubInputs.end());
8452 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8453 Mask.append(NumElts, SM_SentinelZero);
8454 else
8455 for (int i = 0; i != (int)NumElts; ++i)
8456 Mask.push_back(i);
8457 for (int i = 0; i != (int)NumSubElts; ++i) {
8458 int M = SubMask[i];
8459 if (0 <= M) {
8460 int InputIdx = M / NumSubElts;
8461 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8462 }
8463 Mask[i + InsertIdx] = M;
8464 }
8465 return true;
8466 }
8467 case X86ISD::PINSRB:
8468 case X86ISD::PINSRW:
8469 case ISD::SCALAR_TO_VECTOR:
8470 case ISD::INSERT_VECTOR_ELT: {
8471 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8472 // vector, for matching src/dst vector types.
8473 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8474
8475 unsigned DstIdx = 0;
8476 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8477 // Check we have an in-range constant insertion index.
8478 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8479 N.getConstantOperandAPInt(2).uge(NumElts))
8480 return false;
8481 DstIdx = N.getConstantOperandVal(2);
8482
8483 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8484 if (X86::isZeroNode(Scl)) {
8485 Ops.push_back(N.getOperand(0));
8486 for (unsigned i = 0; i != NumElts; ++i)
8487 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8488 return true;
8489 }
8490 }
8491
8492 // Peek through trunc/aext/zext.
8493 // TODO: aext shouldn't require SM_SentinelZero padding.
8494 // TODO: handle shift of scalars.
8495 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8496 while (Scl.getOpcode() == ISD::TRUNCATE ||
8497 Scl.getOpcode() == ISD::ANY_EXTEND ||
8498 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8499 Scl = Scl.getOperand(0);
8500 MinBitsPerElt =
8501 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8502 }
8503 if ((MinBitsPerElt % 8) != 0)
8504 return false;
8505
8506 // Attempt to find the source vector the scalar was extracted from.
8507 SDValue SrcExtract;
8508 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8509 Scl.getOpcode() == X86ISD::PEXTRW ||
8510 Scl.getOpcode() == X86ISD::PEXTRB) &&
8511 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8512 SrcExtract = Scl;
8513 }
8514 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8515 return false;
8516
8517 SDValue SrcVec = SrcExtract.getOperand(0);
8518 EVT SrcVT = SrcVec.getValueType();
8519 if (!SrcVT.getScalarType().isByteSized())
8520 return false;
8521 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8522 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8523 unsigned DstByte = DstIdx * NumBytesPerElt;
8524 MinBitsPerElt =
8525 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8526
8527 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8528 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8529 Ops.push_back(SrcVec);
8530 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8531 } else {
8532 Ops.push_back(SrcVec);
8533 Ops.push_back(N.getOperand(0));
8534 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8535 Mask.push_back(NumSizeInBytes + i);
8536 }
8537
8538 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8539 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8540 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8541 Mask[DstByte + i] = SrcByte + i;
8542 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8543 Mask[DstByte + i] = SM_SentinelZero;
8544 return true;
8545 }
8546 case X86ISD::PACKSS:
8547 case X86ISD::PACKUS: {
8548 SDValue N0 = N.getOperand(0);
8549 SDValue N1 = N.getOperand(1);
8550 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8551        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8552        "Unexpected input value type");
8553
8554 APInt EltsLHS, EltsRHS;
8555 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8556
8557 // If we know input saturation won't happen (or we don't care about particular
8558 // lanes), we can treat this as a truncation shuffle.
8559 bool Offset0 = false, Offset1 = false;
8560 if (Opcode == X86ISD::PACKSS) {
8561 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8562 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8563 (!(N1.isUndef() || EltsRHS.isZero()) &&
8564 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8565 return false;
8566 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8567 // PACKSS then it was likely being used for sign-extension for a
8568 // truncation, so just peek through and adjust the mask accordingly.
8569 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8570 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8571 Offset0 = true;
8572 N0 = N0.getOperand(0);
8573 }
8574 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8575 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8576 Offset1 = true;
8577 N1 = N1.getOperand(0);
8578 }
8579 } else {
8580 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8581 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8582 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8583 (!(N1.isUndef() || EltsRHS.isZero()) &&
8584 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8585 return false;
8586 }
8587
8588 bool IsUnary = (N0 == N1);
8589
8590 Ops.push_back(N0);
8591 if (!IsUnary)
8592 Ops.push_back(N1);
8593
8594 createPackShuffleMask(VT, Mask, IsUnary);
8595
8596 if (Offset0 || Offset1) {
8597 for (int &M : Mask)
8598 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8599 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8600 ++M;
8601 }
8602 return true;
8603 }
8604 case ISD::VSELECT:
8605 case X86ISD::BLENDV: {
8606 SDValue Cond = N.getOperand(0);
8607 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8608 Ops.push_back(N.getOperand(1));
8609 Ops.push_back(N.getOperand(2));
8610 return true;
8611 }
8612 return false;
8613 }
8614 case X86ISD::VTRUNC: {
8615 SDValue Src = N.getOperand(0);
8616 EVT SrcVT = Src.getValueType();
8617 // Truncated source must be a simple vector.
8618 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8619 (SrcVT.getScalarSizeInBits() % 8) != 0)
8620 return false;
8621 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8622 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8623 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8624 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8625 for (unsigned i = 0; i != NumSrcElts; ++i)
8626 Mask.push_back(i * Scale);
8627 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8628 Ops.push_back(Src);
8629 return true;
8630 }
8631 case X86ISD::VSHLI:
8632 case X86ISD::VSRLI: {
8633 uint64_t ShiftVal = N.getConstantOperandVal(1);
8634 // Out of range bit shifts are guaranteed to be zero.
8635 if (NumBitsPerElt <= ShiftVal) {
8636 Mask.append(NumElts, SM_SentinelZero);
8637 return true;
8638 }
8639
8640 // We can only decode 'whole byte' bit shifts as shuffles.
8641 if ((ShiftVal % 8) != 0)
8642 break;
8643
8644 uint64_t ByteShift = ShiftVal / 8;
8645 Ops.push_back(N.getOperand(0));
8646
8647 // Clear mask to all zeros and insert the shifted byte indices.
8648 Mask.append(NumSizeInBytes, SM_SentinelZero);
8649
8650 if (X86ISD::VSHLI == Opcode) {
8651 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8652 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8653 Mask[i + j] = i + j - ByteShift;
8654 } else {
8655 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8656 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8657 Mask[i + j - ByteShift] = i + j;
8658 }
8659 return true;
8660 }
8661 case X86ISD::VROTLI:
8662 case X86ISD::VROTRI: {
8663 // We can only decode 'whole byte' bit rotates as shuffles.
8664 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8665 if ((RotateVal % 8) != 0)
8666 return false;
8667 Ops.push_back(N.getOperand(0));
8668 int Offset = RotateVal / 8;
8669 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8670 for (int i = 0; i != (int)NumElts; ++i) {
8671 int BaseIdx = i * NumBytesPerElt;
8672 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8673 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8674 }
8675 }
8676 return true;
8677 }
8678 case X86ISD::VBROADCAST: {
8679 SDValue Src = N.getOperand(0);
8680 if (!Src.getSimpleValueType().isVector()) {
8681 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8682 !isNullConstant(Src.getOperand(1)) ||
8683 Src.getOperand(0).getValueType().getScalarType() !=
8684 VT.getScalarType())
8685 return false;
8686 Src = Src.getOperand(0);
8687 }
8688 Ops.push_back(Src);
8689 Mask.append(NumElts, 0);
8690 return true;
8691 }
8692 case ISD::ZERO_EXTEND:
8693 case ISD::ANY_EXTEND:
8694 case ISD::ZERO_EXTEND_VECTOR_INREG:
8695 case ISD::ANY_EXTEND_VECTOR_INREG: {
8696 SDValue Src = N.getOperand(0);
8697 EVT SrcVT = Src.getValueType();
8698
8699 // Extended source must be a simple vector.
8700 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8701 (SrcVT.getScalarSizeInBits() % 8) != 0)
8702 return false;
8703
8704 bool IsAnyExtend =
8705 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8706 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8707 IsAnyExtend, Mask);
8708 Ops.push_back(Src);
8709 return true;
8710 }
8711 }
8712
8713 return false;
8714}
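// Worked example for the X86ISD::VSHLI case of getFauxShuffleMask above: a
// whole-byte left shift of each element can be written as a byte shuffle. For
// a 128-bit vector with 8-byte elements shifted left by 16 bits
// (ByteShift == 2), each element keeps its bytes moved up by two and its low
// two bytes zeroed. Standalone model of the loop, with -2 standing in for the
// assumed SM_SentinelZero value:
#include <cassert>
#include <vector>

int main() {
  const int Zero = -2;                  // assumed SM_SentinelZero
  const unsigned NumSizeInBytes = 16, NumBytesPerElt = 8, ByteShift = 2;

  std::vector<int> Mask(NumSizeInBytes, Zero);
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
      Mask[i + j] = static_cast<int>(i + j - ByteShift);

  assert((Mask == std::vector<int>{-2, -2, 0, 1, 2,  3,  4,  5,
                                   -2, -2, 8, 9, 10, 11, 12, 13}));
  return 0;
}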
8715
8716/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8717static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8718 SmallVectorImpl<int> &Mask) {
8719 int MaskWidth = Mask.size();
8720 SmallVector<SDValue, 16> UsedInputs;
8721 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8722 int lo = UsedInputs.size() * MaskWidth;
8723 int hi = lo + MaskWidth;
8724
8725 // Strip UNDEF input usage.
8726 if (Inputs[i].isUndef())
8727 for (int &M : Mask)
8728 if ((lo <= M) && (M < hi))
8729 M = SM_SentinelUndef;
8730
8731 // Check for unused inputs.
8732 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8733 for (int &M : Mask)
8734 if (lo <= M)
8735 M -= MaskWidth;
8736 continue;
8737 }
8738
8739 // Check for repeated inputs.
8740 bool IsRepeat = false;
8741 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8742 if (UsedInputs[j] != Inputs[i])
8743 continue;
8744 for (int &M : Mask)
8745 if (lo <= M)
8746 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8747 IsRepeat = true;
8748 break;
8749 }
8750 if (IsRepeat)
8751 continue;
8752
8753 UsedInputs.push_back(Inputs[i]);
8754 }
8755 Inputs = UsedInputs;
8756}
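// Worked example for the repeated-input path of
// resolveTargetShuffleInputsAndMask above: with the same input appearing twice
// and a 4-wide mask <0, 5, 2, 7>, the duplicate's index range [4, 8) is
// remapped onto the first occurrence and the input list shrinks to one entry.
// Standalone model using integer tags in place of SDValues; the undef-strip
// and unused-input checks are omitted for brevity.
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Inputs = {42, 42};   // the same input node, twice
  std::vector<int> Mask = {0, 5, 2, 7};

  const int MaskWidth = static_cast<int>(Mask.size());
  std::vector<int> Used;
  for (int In : Inputs) {
    const int lo = static_cast<int>(Used.size()) * MaskWidth;
    const int hi = lo + MaskWidth;
    bool IsRepeat = false;
    for (int j = 0, e = static_cast<int>(Used.size()); j != e; ++j) {
      if (Used[j] != In)
        continue;
      // Remap the duplicate's range onto input j and shift later ranges down.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + j * MaskWidth) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (!IsRepeat)
      Used.push_back(In);
  }
  Inputs = Used;

  assert(Inputs.size() == 1 && (Mask == std::vector<int>{0, 1, 2, 3}));
  return 0;
}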
8757
8758/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8759/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8760/// Returns true if the target shuffle mask was decoded.
8761static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8762 SmallVectorImpl<SDValue> &Inputs,
8763 SmallVectorImpl<int> &Mask,
8764 APInt &KnownUndef, APInt &KnownZero,
8765 const SelectionDAG &DAG, unsigned Depth,
8766 bool ResolveKnownElts) {
8767 if (Depth >= SelectionDAG::MaxRecursionDepth)
8768 return false; // Limit search depth.
8769
8770 EVT VT = Op.getValueType();
8771 if (!VT.isSimple() || !VT.isVector())
8772 return false;
8773
8774 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8775 if (ResolveKnownElts)
8776 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8777 return true;
8778 }
8779 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8780 ResolveKnownElts)) {
8781 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8782 return true;
8783 }
8784 return false;
8785}
8786
8787static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8788 SmallVectorImpl<SDValue> &Inputs,
8789 SmallVectorImpl<int> &Mask,
8790 const SelectionDAG &DAG, unsigned Depth,
8791 bool ResolveKnownElts) {
8792 APInt KnownUndef, KnownZero;
8793 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8794 KnownZero, DAG, Depth, ResolveKnownElts);
8795}
8796
8797static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8798 SmallVectorImpl<int> &Mask,
8799 const SelectionDAG &DAG, unsigned Depth = 0,
8800 bool ResolveKnownElts = true) {
8801 EVT VT = Op.getValueType();
8802 if (!VT.isSimple() || !VT.isVector())
8803 return false;
8804
8805 unsigned NumElts = Op.getValueType().getVectorNumElements();
8806 APInt DemandedElts = APInt::getAllOnes(NumElts);
8807 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8808 ResolveKnownElts);
8809}
8810
8811// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8812static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8813 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8814 SelectionDAG &DAG) {
8815 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8816        Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8817        "Unknown broadcast load type");
8818
8819 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8820 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8821 return SDValue();
8822
8823 SDValue Ptr =
8824 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8825 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8826 SDValue Ops[] = {Mem->getChain(), Ptr};
8827 SDValue BcstLd = DAG.getMemIntrinsicNode(
8828 Opcode, DL, Tys, Ops, MemVT,
8829 DAG.getMachineFunction().getMachineMemOperand(
8830 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8831 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8832 return BcstLd;
8833}
8834
8835/// Returns the scalar element that will make up the i'th
8836/// element of the result of the vector shuffle.
8837static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8838 SelectionDAG &DAG, unsigned Depth) {
8839 if (Depth >= SelectionDAG::MaxRecursionDepth)
8840 return SDValue(); // Limit search depth.
8841
8842 EVT VT = Op.getValueType();
8843 unsigned Opcode = Op.getOpcode();
8844 unsigned NumElems = VT.getVectorNumElements();
8845
8846 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8847 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8848 int Elt = SV->getMaskElt(Index);
8849
8850 if (Elt < 0)
8851 return DAG.getUNDEF(VT.getVectorElementType());
8852
8853 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8854 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8855 }
8856
8857 // Recurse into target specific vector shuffles to find scalars.
8858 if (isTargetShuffle(Opcode)) {
8859 MVT ShufVT = VT.getSimpleVT();
8860 MVT ShufSVT = ShufVT.getVectorElementType();
8861 int NumElems = (int)ShufVT.getVectorNumElements();
8862 SmallVector<int, 16> ShuffleMask;
8863 SmallVector<SDValue, 16> ShuffleOps;
8864 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8865 ShuffleMask))
8866 return SDValue();
8867
8868 int Elt = ShuffleMask[Index];
8869 if (Elt == SM_SentinelZero)
8870 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8871 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8872 if (Elt == SM_SentinelUndef)
8873 return DAG.getUNDEF(ShufSVT);
8874
8875 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8876 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8877 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8878 }
8879
8880 // Recurse into insert_subvector base/sub vector to find scalars.
8881 if (Opcode == ISD::INSERT_SUBVECTOR) {
8882 SDValue Vec = Op.getOperand(0);
8883 SDValue Sub = Op.getOperand(1);
8884 uint64_t SubIdx = Op.getConstantOperandVal(2);
8885 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8886
8887 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8888 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8889 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8890 }
8891
8892 // Recurse into concat_vectors sub vector to find scalars.
8893 if (Opcode == ISD::CONCAT_VECTORS) {
8894 EVT SubVT = Op.getOperand(0).getValueType();
8895 unsigned NumSubElts = SubVT.getVectorNumElements();
8896 uint64_t SubIdx = Index / NumSubElts;
8897 uint64_t SubElt = Index % NumSubElts;
8898 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8899 }
8900
8901 // Recurse into extract_subvector src vector to find scalars.
8902 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8903 SDValue Src = Op.getOperand(0);
8904 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8905 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8906 }
8907
8908 // We only peek through bitcasts of the same vector width.
8909 if (Opcode == ISD::BITCAST) {
8910 SDValue Src = Op.getOperand(0);
8911 EVT SrcVT = Src.getValueType();
8912 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8913 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8914 return SDValue();
8915 }
8916
8917 // Actual nodes that may contain scalar elements
8918
8919 // For insert_vector_elt - either return the index matching scalar or recurse
8920 // into the base vector.
8921 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8922 isa<ConstantSDNode>(Op.getOperand(2))) {
8923 if (Op.getConstantOperandAPInt(2) == Index)
8924 return Op.getOperand(1);
8925 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8926 }
8927
8928 if (Opcode == ISD::SCALAR_TO_VECTOR)
8929 return (Index == 0) ? Op.getOperand(0)
8930 : DAG.getUNDEF(VT.getVectorElementType());
8931
8932 if (Opcode == ISD::BUILD_VECTOR)
8933 return Op.getOperand(Index);
8934
8935 return SDValue();
8936}
8937
8938// Use PINSRB/PINSRW/PINSRD to create a build vector.
8939static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8940 unsigned NumNonZero, unsigned NumZero,
8941 SelectionDAG &DAG,
8942 const X86Subtarget &Subtarget) {
8943 MVT VT = Op.getSimpleValueType();
8944 unsigned NumElts = VT.getVectorNumElements();
8945 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8946        ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8947        "Illegal vector insertion");
8948
8949 SDLoc dl(Op);
8950 SDValue V;
8951 bool First = true;
8952
8953 for (unsigned i = 0; i < NumElts; ++i) {
8954 bool IsNonZero = NonZeroMask[i];
8955 if (!IsNonZero)
8956 continue;
8957
8958 // If the build vector contains zeros or our first insertion is not the
8959 // first index, then insert into a zero vector to break any register
8960 // dependency; otherwise use SCALAR_TO_VECTOR.
8961 if (First) {
8962 First = false;
8963 if (NumZero || 0 != i)
8964 V = getZeroVector(VT, Subtarget, DAG, dl);
8965 else {
8966 assert(0 == i && "Expected insertion into zero-index")(static_cast <bool> (0 == i && "Expected insertion into zero-index"
) ? void (0) : __assert_fail ("0 == i && \"Expected insertion into zero-index\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 8966, __extension__
__PRETTY_FUNCTION__))
;
8967 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8968 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8969 V = DAG.getBitcast(VT, V);
8970 continue;
8971 }
8972 }
8973 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8974 DAG.getIntPtrConstant(i, dl));
8975 }
8976
8977 return V;
8978}
8979
8980/// Custom lower build_vector of v16i8.
8981static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8982 unsigned NumNonZero, unsigned NumZero,
8983 SelectionDAG &DAG,
8984 const X86Subtarget &Subtarget) {
8985 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8986 return SDValue();
8987
8988 // SSE4.1 - use PINSRB to insert each byte directly.
8989 if (Subtarget.hasSSE41())
8990 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8991 Subtarget);
8992
8993 SDLoc dl(Op);
8994 SDValue V;
8995
8996 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8997 for (unsigned i = 0; i < 16; i += 2) {
8998 bool ThisIsNonZero = NonZeroMask[i];
8999 bool NextIsNonZero = NonZeroMask[i + 1];
9000 if (!ThisIsNonZero && !NextIsNonZero)
9001 continue;
9002
9003 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9004 SDValue Elt;
9005 if (ThisIsNonZero) {
9006 if (NumZero || NextIsNonZero)
9007 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9008 else
9009 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9010 }
9011
9012 if (NextIsNonZero) {
9013 SDValue NextElt = Op.getOperand(i + 1);
9014 if (i == 0 && NumZero)
9015 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9016 else
9017 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9018 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9019 DAG.getConstant(8, dl, MVT::i8));
9020 if (ThisIsNonZero)
9021 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9022 else
9023 Elt = NextElt;
9024 }
9025
9026 // If our first insertion is not the first index or zeros are needed, then
9027 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9028 // elements undefined).
9029 if (!V) {
9030 if (i != 0 || NumZero)
9031 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9032 else {
9033 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9034 V = DAG.getBitcast(MVT::v8i16, V);
9035 continue;
9036 }
9037 }
9038 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9039 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9040 DAG.getIntPtrConstant(i / 2, dl));
9041 }
9042
9043 return DAG.getBitcast(MVT::v16i8, V);
9044}
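// The pre-SSE4.1 path above packs two adjacent bytes into one 16-bit lane so
// a single PINSRW can insert both: the odd byte is shifted left by 8 and OR'd
// with the even byte. With bytes 0x12 (even index) and 0x34 (odd index), the
// inserted 16-bit value is 0x3412. A trivial standalone check with made-up
// values:
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t Lo = 0x12, Hi = 0x34;
  const uint16_t Packed = static_cast<uint16_t>((uint32_t(Hi) << 8) | Lo);
  assert(Packed == 0x3412);
  return 0;
}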
9045
9046/// Custom lower build_vector of v8i16.
9047static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9048 unsigned NumNonZero, unsigned NumZero,
9049 SelectionDAG &DAG,
9050 const X86Subtarget &Subtarget) {
9051 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9052 return SDValue();
9053
9054 // Use PINSRW to insert each byte directly.
9055 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9056 Subtarget);
9057}
9058
9059/// Custom lower build_vector of v4i32 or v4f32.
9060static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9061 const X86Subtarget &Subtarget) {
9062 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9063 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9064 // Because we're creating a less complicated build vector here, we may enable
9065 // further folding of the MOVDDUP via shuffle transforms.
9066 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9067 Op.getOperand(0) == Op.getOperand(2) &&
9068 Op.getOperand(1) == Op.getOperand(3) &&
9069 Op.getOperand(0) != Op.getOperand(1)) {
9070 SDLoc DL(Op);
9071 MVT VT = Op.getSimpleValueType();
9072 MVT EltVT = VT.getVectorElementType();
9073 // Create a new build vector with the first 2 elements followed by undef
9074 // padding, bitcast to v2f64, duplicate, and bitcast back.
9075 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9076 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9077 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9078 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9079 return DAG.getBitcast(VT, Dup);
9080 }
9081
9082 // Find all zeroable elements.
9083 std::bitset<4> Zeroable, Undefs;
9084 for (int i = 0; i < 4; ++i) {
9085 SDValue Elt = Op.getOperand(i);
9086 Undefs[i] = Elt.isUndef();
9087 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9088 }
9089 assert(Zeroable.size() - Zeroable.count() > 1 &&
9090        "We expect at least two non-zero elements!");
9091
9092 // We only know how to deal with build_vector nodes where elements are either
9093 // zeroable or extract_vector_elt with constant index.
9094 SDValue FirstNonZero;
9095 unsigned FirstNonZeroIdx;
9096 for (unsigned i = 0; i < 4; ++i) {
9097 if (Zeroable[i])
9098 continue;
9099 SDValue Elt = Op.getOperand(i);
9100 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9101 !isa<ConstantSDNode>(Elt.getOperand(1)))
9102 return SDValue();
9103 // Make sure that this node is extracting from a 128-bit vector.
9104 MVT VT = Elt.getOperand(0).getSimpleValueType();
9105 if (!VT.is128BitVector())
9106 return SDValue();
9107 if (!FirstNonZero.getNode()) {
9108 FirstNonZero = Elt;
9109 FirstNonZeroIdx = i;
9110 }
9111 }
9112
9113 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9114 SDValue V1 = FirstNonZero.getOperand(0);
9115 MVT VT = V1.getSimpleValueType();
9116
9117 // See if this build_vector can be lowered as a blend with zero.
9118 SDValue Elt;
9119 unsigned EltMaskIdx, EltIdx;
9120 int Mask[4];
9121 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9122 if (Zeroable[EltIdx]) {
9123 // The zero vector will be on the right hand side.
9124 Mask[EltIdx] = EltIdx+4;
9125 continue;
9126 }
9127
9128 Elt = Op->getOperand(EltIdx);
9129 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9130 EltMaskIdx = Elt.getConstantOperandVal(1);
9131 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9132 break;
9133 Mask[EltIdx] = EltIdx;
9134 }
9135
9136 if (EltIdx == 4) {
9137 // Let the shuffle legalizer deal with blend operations.
9138 SDValue VZeroOrUndef = (Zeroable == Undefs)
9139 ? DAG.getUNDEF(VT)
9140 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9141 if (V1.getSimpleValueType() != VT)
9142 V1 = DAG.getBitcast(VT, V1);
9143 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9144 }
9145
9146 // See if we can lower this build_vector to a INSERTPS.
9147 if (!Subtarget.hasSSE41())
9148 return SDValue();
9149
9150 SDValue V2 = Elt.getOperand(0);
9151 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9152 V1 = SDValue();
9153
9154 bool CanFold = true;
9155 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9156 if (Zeroable[i])
9157 continue;
9158
9159 SDValue Current = Op->getOperand(i);
9160 SDValue SrcVector = Current->getOperand(0);
9161 if (!V1.getNode())
9162 V1 = SrcVector;
9163 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9164 }
9165
9166 if (!CanFold)
9167 return SDValue();
9168
9169 assert(V1.getNode() && "Expected at least two non-zero elements!");
9170 if (V1.getSimpleValueType() != MVT::v4f32)
9171 V1 = DAG.getBitcast(MVT::v4f32, V1);
9172 if (V2.getSimpleValueType() != MVT::v4f32)
9173 V2 = DAG.getBitcast(MVT::v4f32, V2);
9174
9175 // Ok, we can emit an INSERTPS instruction.
9176 unsigned ZMask = Zeroable.to_ulong();
9177
9178 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9179 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9180 SDLoc DL(Op);
9181 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9182 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9183 return DAG.getBitcast(VT, Result);
9184}
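// The INSERTPS immediate assembled above packs three fields into one byte,
// following the SSE4.1 INSERTPS encoding: bits [7:6] select the source
// element, bits [5:4] the destination element, and bits [3:0] the zero mask.
// For instance, copying source element 2 into destination element 1 while
// zeroing destination elements 0 and 3 yields 0x99. Standalone check with
// made-up indices:
#include <cassert>

int main() {
  const unsigned EltMaskIdx = 2;  // source element (count_s)
  const unsigned EltIdx = 1;      // destination element (count_d)
  const unsigned ZMask = 0x9;     // zero destination elements 0 and 3
  const unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert(InsertPSMask == 0x99);
  return 0;
}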
9185
9186/// Return a vector logical shift node.
9187static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9188 SelectionDAG &DAG, const TargetLowering &TLI,
9189 const SDLoc &dl) {
9190 assert(VT.is128BitVector() && "Unknown type for VShift");
9191 MVT ShVT = MVT::v16i8;
9192 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9193 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9194 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9195 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9196 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9197}
9198
9199static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9200 SelectionDAG &DAG) {
9201
9202 // Check if the scalar load can be widened into a vector load, and if
9203 // the address is "base + cst", see if the cst can be "absorbed" into
9204 // the shuffle mask.
9205 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9206 SDValue Ptr = LD->getBasePtr();
9207 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9208 return SDValue();
9209 EVT PVT = LD->getValueType(0);
9210 if (PVT != MVT::i32 && PVT != MVT::f32)
9211 return SDValue();
9212
9213 int FI = -1;
9214 int64_t Offset = 0;
9215 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9216 FI = FINode->getIndex();
9217 Offset = 0;
9218 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9219 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9220 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9221 Offset = Ptr.getConstantOperandVal(1);
9222 Ptr = Ptr.getOperand(0);
9223 } else {
9224 return SDValue();
9225 }
9226
9227 // FIXME: 256-bit vector instructions don't require a strict alignment,
9228 // improve this code to support it better.
9229 Align RequiredAlign(VT.getSizeInBits() / 8);
9230 SDValue Chain = LD->getChain();
9231 // Make sure the stack object alignment is at least 16 or 32.
9232 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9233 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9234 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9235 if (MFI.isFixedObjectIndex(FI)) {
9236 // Can't change the alignment. FIXME: It's possible to compute
9237 // the exact stack offset and reference FI + adjust offset instead.
9239 // If someone *really* cares about this, that's the way to implement it.
9239 return SDValue();
9240 } else {
9241 MFI.setObjectAlignment(FI, RequiredAlign);
9242 }
9243 }
9244
9245 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9246 // Ptr + (Offset & ~15).
9247 if (Offset < 0)
9248 return SDValue();
9249 if ((Offset % RequiredAlign.value()) & 3)
9250 return SDValue();
9251 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9252 if (StartOffset) {
9253 SDLoc DL(Ptr);
9254 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9255 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9256 }
9257
9258 int EltNo = (Offset - StartOffset) >> 2;
9259 unsigned NumElems = VT.getVectorNumElements();
9260
9261 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9262 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9263 LD->getPointerInfo().getWithOffset(StartOffset));
9264
9265 SmallVector<int, 8> Mask(NumElems, EltNo);
9266
9267 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9268 }
9269
9270 return SDValue();
9271}
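// A rough worked example (hypothetical frame index and offset): splatting an
// i32 loaded from FI+8 into a v4i32 gives RequiredAlign = 16, StartOffset =
// 8 & ~15 = 0 and EltNo = (8 - 0) >> 2 = 2, i.e. a v4i32 load from the base
// pointer followed by a <2,2,2,2> splat shuffle that absorbs the +8 offset
// into the mask.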
9272
9273 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9274static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9275 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9276 auto *BaseLd = cast<LoadSDNode>(Elt);
9277 if (!BaseLd->isSimple())
9278 return false;
9279 Ld = BaseLd;
9280 ByteOffset = 0;
9281 return true;
9282 }
9283
9284 switch (Elt.getOpcode()) {
9285 case ISD::BITCAST:
9286 case ISD::TRUNCATE:
9287 case ISD::SCALAR_TO_VECTOR:
9288 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9289 case ISD::SRL:
9290 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9291 uint64_t Amt = AmtC->getZExtValue();
9292 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9293 ByteOffset += Amt / 8;
9294 return true;
9295 }
9296 }
9297 break;
9298 case ISD::EXTRACT_VECTOR_ELT:
9299 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9300 SDValue Src = Elt.getOperand(0);
9301 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9302 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9303 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9304 findEltLoadSrc(Src, Ld, ByteOffset)) {
9305 uint64_t Idx = IdxC->getZExtValue();
9306 ByteOffset += Idx * (SrcSizeInBits / 8);
9307 return true;
9308 }
9309 }
9310 break;
9311 }
9312
9313 return false;
9314}
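// For illustration (hypothetical node): findEltLoadSrc on
// (i32 (trunc (srl (i64 load X), 32))) looks through the TRUNCATE, adds
// 32 / 8 = 4 at the SRL, and returns the i64 load as Ld with ByteOffset = 4.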
9315
9316/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9317/// elements can be replaced by a single large load which has the same value as
9318/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9319///
9320/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9321static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9322 const SDLoc &DL, SelectionDAG &DAG,
9323 const X86Subtarget &Subtarget,
9324 bool IsAfterLegalize) {
9325 if ((VT.getScalarSizeInBits() % 8) != 0)
9326 return SDValue();
9327
9328 unsigned NumElems = Elts.size();
9329
9330 int LastLoadedElt = -1;
9331 APInt LoadMask = APInt::getZero(NumElems);
9332 APInt ZeroMask = APInt::getZero(NumElems);
9333 APInt UndefMask = APInt::getZero(NumElems);
9334
9335 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9336 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9337
9338 // For each element in the initializer, see if we've found a load, zero or an
9339 // undef.
9340 for (unsigned i = 0; i < NumElems; ++i) {
9341 SDValue Elt = peekThroughBitcasts(Elts[i]);
9342 if (!Elt.getNode())
9343 return SDValue();
9344 if (Elt.isUndef()) {
9345 UndefMask.setBit(i);
9346 continue;
9347 }
9348 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9349 ZeroMask.setBit(i);
9350 continue;
9351 }
9352
9353 // Each loaded element must be the correct fractional portion of the
9354 // requested vector load.
9355 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9356 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9357 return SDValue();
9358
9359 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9360 return SDValue();
9361 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9362 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9363 return SDValue();
9364
9365 LoadMask.setBit(i);
9366 LastLoadedElt = i;
9367 }
9368 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
9369 LoadMask.countPopulation()) == NumElems &&
9370 "Incomplete element masks");
9371
9372 // Handle Special Cases - all undef or undef/zero.
9373 if (UndefMask.countPopulation() == NumElems)
9374 return DAG.getUNDEF(VT);
9375 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
9376 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9377 : DAG.getConstantFP(0.0, DL, VT);
9378
9379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9380 int FirstLoadedElt = LoadMask.countTrailingZeros();
9381 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9382 EVT EltBaseVT = EltBase.getValueType();
9383 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9384 "Register/Memory size mismatch");
9385 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9386 assert(LDBase && "Did not find base load for merging consecutive loads");
9387 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9388 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9389 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9390 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9391 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9392
9393 // TODO: Support offsetting the base load.
9394 if (ByteOffsets[FirstLoadedElt] != 0)
9395 return SDValue();
9396
9397 // Check to see if the element's load is consecutive to the base load
9398 // or offset from a previous (already checked) load.
9399 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9400 LoadSDNode *Ld = Loads[EltIdx];
9401 int64_t ByteOffset = ByteOffsets[EltIdx];
9402 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9403 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9404 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9405 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9406 }
9407 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9408 EltIdx - FirstLoadedElt);
9409 };
9410
9411 // Consecutive loads can contain UNDEFS but not ZERO elements.
9412 // Consecutive loads with UNDEF and ZERO elements require an
9413 // additional shuffle stage to clear the ZERO elements.
9414 bool IsConsecutiveLoad = true;
9415 bool IsConsecutiveLoadWithZeros = true;
9416 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9417 if (LoadMask[i]) {
9418 if (!CheckConsecutiveLoad(LDBase, i)) {
9419 IsConsecutiveLoad = false;
9420 IsConsecutiveLoadWithZeros = false;
9421 break;
9422 }
9423 } else if (ZeroMask[i]) {
9424 IsConsecutiveLoad = false;
9425 }
9426 }
9427
9428 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9429 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9430 assert(LDBase->isSimple() &&
9431 "Cannot merge volatile or atomic loads.");
9432 SDValue NewLd =
9433 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9434 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9435 MMOFlags);
9436 for (auto *LD : Loads)
9437 if (LD)
9438 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9439 return NewLd;
9440 };
9441
9442 // Check if the base load is entirely dereferenceable.
9443 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9444 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9445
9446 // LOAD - all consecutive load/undefs (must start/end with a load or be
9447 // entirely dereferenceable). If we have found an entire vector of loads and
9448 // undefs, then return a large load of the entire vector width starting at the
9449 // base pointer. If the vector contains zeros, then attempt to shuffle those
9450 // elements.
9451 if (FirstLoadedElt == 0 &&
9452 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9453 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9454 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9455 return SDValue();
9456
9457 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9458 // will lower to regular temporal loads and use the cache.
9459 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9460 VT.is256BitVector() && !Subtarget.hasInt256())
9461 return SDValue();
9462
9463 if (NumElems == 1)
9464 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9465
9466 if (!ZeroMask)
9467 return CreateLoad(VT, LDBase);
9468
9469 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9470 // vector and a zero vector to clear out the zero elements.
9471 if (!IsAfterLegalize && VT.isVector()) {
9472 unsigned NumMaskElts = VT.getVectorNumElements();
9473 if ((NumMaskElts % NumElems) == 0) {
9474 unsigned Scale = NumMaskElts / NumElems;
9475 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9476 for (unsigned i = 0; i < NumElems; ++i) {
9477 if (UndefMask[i])
9478 continue;
9479 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9480 for (unsigned j = 0; j != Scale; ++j)
9481 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9482 }
9483 SDValue V = CreateLoad(VT, LDBase);
9484 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9485 : DAG.getConstantFP(0.0, DL, VT);
9486 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9487 }
9488 }
9489 }
9490
9491 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9492 if (VT.is256BitVector() || VT.is512BitVector()) {
9493 unsigned HalfNumElems = NumElems / 2;
9494 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9495 EVT HalfVT =
9496 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9497 SDValue HalfLD =
9498 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9499 DAG, Subtarget, IsAfterLegalize);
9500 if (HalfLD)
9501 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9502 HalfLD, DAG.getIntPtrConstant(0, DL));
9503 }
9504 }
9505
9506 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9507 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9508 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9509 LoadSizeInBits == 64) &&
9510 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9511 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9512 : MVT::getIntegerVT(LoadSizeInBits);
9513 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9514 // Allow v4f32 on SSE1 only targets.
9515 // FIXME: Add more isel patterns so we can just use VT directly.
9516 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9517 VecVT = MVT::v4f32;
9518 if (TLI.isTypeLegal(VecVT)) {
9519 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9520 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9521 SDValue ResNode = DAG.getMemIntrinsicNode(
9522 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9523 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9524 for (auto *LD : Loads)
9525 if (LD)
9526 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9527 return DAG.getBitcast(VT, ResNode);
9528 }
9529 }
9530
9531 // BROADCAST - match the smallest possible repetition pattern, load that
9532 // scalar/subvector element and then broadcast to the entire vector.
9533 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9534 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9535 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9536 unsigned RepeatSize = SubElems * BaseSizeInBits;
9537 unsigned ScalarSize = std::min(RepeatSize, 64u);
9538 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9539 continue;
9540
9541 // Don't attempt a 1:N subvector broadcast - it should be caught by
9542 // combineConcatVectorOps, else it will cause infinite loops.
9543 if (RepeatSize > ScalarSize && SubElems == 1)
9544 continue;
9545
9546 bool Match = true;
9547 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9548 for (unsigned i = 0; i != NumElems && Match; ++i) {
9549 if (!LoadMask[i])
9550 continue;
9551 SDValue Elt = peekThroughBitcasts(Elts[i]);
9552 if (RepeatedLoads[i % SubElems].isUndef())
9553 RepeatedLoads[i % SubElems] = Elt;
9554 else
9555 Match &= (RepeatedLoads[i % SubElems] == Elt);
9556 }
9557
9558 // We must have loads at both ends of the repetition.
9559 Match &= !RepeatedLoads.front().isUndef();
9560 Match &= !RepeatedLoads.back().isUndef();
9561 if (!Match)
9562 continue;
9563
9564 EVT RepeatVT =
9565 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9566 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9567 : EVT::getFloatingPointVT(ScalarSize);
9568 if (RepeatSize > ScalarSize)
9569 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9570 RepeatSize / ScalarSize);
9571 EVT BroadcastVT =
9572 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9573 VT.getSizeInBits() / ScalarSize);
9574 if (TLI.isTypeLegal(BroadcastVT)) {
9575 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9576 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9577 SDValue Broadcast = RepeatLoad;
9578 if (RepeatSize > ScalarSize) {
9579 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9580 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9581 } else {
9582 if (!Subtarget.hasAVX2() &&
9583 !X86::mayFoldLoadIntoBroadcastFromMem(
9584 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9585 Subtarget,
9586 /*AssumeSingleUse=*/true))
9587 return SDValue();
9588 Broadcast =
9589 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9590 }
9591 return DAG.getBitcast(VT, Broadcast);
9592 }
9593 }
9594 }
9595 }
9596
9597 return SDValue();
9598}
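// A rough worked example (hypothetical loads): for a v4i32 build_vector of
// (load i32 *a), (load i32 *a+4), zero, zero where the full 16-byte load is not
// provably dereferenceable, the function falls through to the VZEXT_LOAD path:
// LoadSizeInBits = 2 * 32 = 64, so it emits an X86ISD::VZEXT_LOAD of i64 as
// v2i64 and bitcasts the result back to v4i32.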
9599
9600 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9601// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9602// are consecutive, non-overlapping, and in the right order.
9603static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9604 SelectionDAG &DAG,
9605 const X86Subtarget &Subtarget,
9606 bool IsAfterLegalize) {
9607 SmallVector<SDValue, 64> Elts;
9608 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9609 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9610 Elts.push_back(Elt);
9611 continue;
9612 }
9613 return SDValue();
9614 }
9615 assert(Elts.size() == VT.getVectorNumElements());
9616 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9617 IsAfterLegalize);
9618}
9619
9620static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9621 unsigned SplatBitSize, LLVMContext &C) {
9622 unsigned ScalarSize = VT.getScalarSizeInBits();
9623 unsigned NumElm = SplatBitSize / ScalarSize;
9624
9625 SmallVector<Constant *, 32> ConstantVec;
9626 for (unsigned i = 0; i < NumElm; i++) {
9627 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9628 Constant *Const;
9629 if (VT.isFloatingPoint()) {
9630 if (ScalarSize == 16) {
9631 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9632 } else if (ScalarSize == 32) {
9633 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9634 } else {
9635 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9636 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9637 }
9638 } else
9639 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9640 ConstantVec.push_back(Const);
9641 }
9642 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9643}
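// A rough worked example (hypothetical splat): for a v8i32 type with
// SplatValue = 0x0000000100000002 and SplatBitSize = 64, ScalarSize = 32 and
// NumElm = 2, so the low bits are extracted first and the result is the
// two-element constant <i32 2, i32 1> that later gets broadcast.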
9644
9645static bool isFoldableUseOfShuffle(SDNode *N) {
9646 for (auto *U : N->uses()) {
9647 unsigned Opc = U->getOpcode();
9648 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9649 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9650 return false;
9651 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9652 return false;
9653 if (isTargetShuffle(Opc))
9654 return true;
9655 if (Opc == ISD::BITCAST) // Ignore bitcasts
9656 return isFoldableUseOfShuffle(U);
9657 if (N->hasOneUse()) {
9658 // TODO: There may be some general way to know if an SDNode can
9659 // be folded. We now only know whether an MI is foldable.
9660 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9661 return false;
9662 return true;
9663 }
9664 }
9665 return false;
9666}
9667
9668/// Attempt to use the vbroadcast instruction to generate a splat value
9669/// from a splat BUILD_VECTOR which uses:
9670/// a. A single scalar load, or a constant.
9671/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9672///
9673/// The VBROADCAST node is returned when a pattern is found,
9674/// or SDValue() otherwise.
9675static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9676 const X86Subtarget &Subtarget,
9677 SelectionDAG &DAG) {
9678 // VBROADCAST requires AVX.
9679 // TODO: Splats could be generated for non-AVX CPUs using SSE
9680 // instructions, but there's less potential gain for only 128-bit vectors.
9681 if (!Subtarget.hasAVX())
9682 return SDValue();
9683
9684 MVT VT = BVOp->getSimpleValueType(0);
9685 unsigned NumElts = VT.getVectorNumElements();
9686 SDLoc dl(BVOp);
9687
9688 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9689 "Unsupported vector type for broadcast.");
9690
9691 // See if the build vector is a repeating sequence of scalars (inc. splat).
9692 SDValue Ld;
9693 BitVector UndefElements;
9694 SmallVector<SDValue, 16> Sequence;
9695 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9696 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9697 if (Sequence.size() == 1)
9698 Ld = Sequence[0];
9699 }
9700
9701 // Attempt to use VBROADCASTM
9702 // From this pattern:
9703 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9704 // b. t1 = (build_vector t0 t0)
9705 //
9706 // Create (VBROADCASTM v2i1 X)
9707 if (!Sequence.empty() && Subtarget.hasCDI()) {
9708 // If not a splat, are the upper sequence values zeroable?
9709 unsigned SeqLen = Sequence.size();
9710 bool UpperZeroOrUndef =
9711 SeqLen == 1 ||
9712 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9713 return !V || V.isUndef() || isNullConstant(V);
9714 });
9715 SDValue Op0 = Sequence[0];
9716 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9717 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9718 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9719 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9720 ? Op0.getOperand(0)
9721 : Op0.getOperand(0).getOperand(0);
9722 MVT MaskVT = BOperand.getSimpleValueType();
9723 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9724 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9725 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9726 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9727 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9728 unsigned Scale = 512 / VT.getSizeInBits();
9729 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9730 }
9731 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9732 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9733 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9734 return DAG.getBitcast(VT, Bcst);
9735 }
9736 }
9737 }
9738
9739 unsigned NumUndefElts = UndefElements.count();
9740 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9741 APInt SplatValue, Undef;
9742 unsigned SplatBitSize;
9743 bool HasUndef;
9744 // Check if this is a repeated constant pattern suitable for broadcasting.
9745 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9746 SplatBitSize > VT.getScalarSizeInBits() &&
9747 SplatBitSize < VT.getSizeInBits()) {
9748 // Avoid replacing with broadcast when it's a use of a shuffle
9749 // instruction to preserve the present custom lowering of shuffles.
9750 if (isFoldableUseOfShuffle(BVOp))
9751 return SDValue();
9752 // replace BUILD_VECTOR with broadcast of the repeated constants.
9753 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9754 LLVMContext *Ctx = DAG.getContext();
9755 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9756 if (Subtarget.hasAVX()) {
9757 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9758 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9759 // Splatted value can fit in one INTEGER constant in constant pool.
9760 // Load the constant and broadcast it.
9761 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9762 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9763 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9764 SDValue CP = DAG.getConstantPool(C, PVT);
9765 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9766
9767 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9768 SDVTList Tys =
9769 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9770 SDValue Ops[] = {DAG.getEntryNode(), CP};
9771 MachinePointerInfo MPI =
9772 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9773 SDValue Brdcst = DAG.getMemIntrinsicNode(
9774 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9775 MachineMemOperand::MOLoad);
9776 return DAG.getBitcast(VT, Brdcst);
9777 }
9778 if (SplatBitSize > 64) {
9779 // Load the vector of constants and broadcast it.
9780 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9781 *Ctx);
9782 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9783 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9784 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9785 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9786 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9787 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9788 MachinePointerInfo MPI =
9789 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9790 return DAG.getMemIntrinsicNode(
9791 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9792 MachineMemOperand::MOLoad);
9793 }
9794 }
9795 }
9796
9797 // If we are moving a scalar into a vector (Ld must be set and all elements
9798 // but 1 are undef) and that operation is not obviously supported by
9799 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9800 // That's better than general shuffling and may eliminate a load to GPR and
9801 // move from scalar to vector register.
9802 if (!Ld || NumElts - NumUndefElts != 1)
9803 return SDValue();
9804 unsigned ScalarSize = Ld.getValueSizeInBits();
9805 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9806 return SDValue();
9807 }
9808
9809 bool ConstSplatVal =
9810 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9811 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9812
9813 // TODO: Handle broadcasts of non-constant sequences.
9814
9815 // Make sure that all of the users of a non-constant load are from the
9816 // BUILD_VECTOR node.
9817 // FIXME: Is the use count needed for non-constant, non-load case?
9818 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9819 return SDValue();
9820
9821 unsigned ScalarSize = Ld.getValueSizeInBits();
9822 bool IsGE256 = (VT.getSizeInBits() >= 256);
9823
9824 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9825 // instruction to save 8 or more bytes of constant pool data.
9826 // TODO: If multiple splats are generated to load the same constant,
9827 // it may be detrimental to overall size. There needs to be a way to detect
9828 // that condition to know if this is truly a size win.
9829 bool OptForSize = DAG.shouldOptForSize();
9830
9831 // Handle broadcasting a single constant scalar from the constant pool
9832 // into a vector.
9833 // On Sandybridge (no AVX2), it is still better to load a constant vector
9834 // from the constant pool and not to broadcast it from a scalar.
9835 // But override that restriction when optimizing for size.
9836 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9837 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9838 EVT CVT = Ld.getValueType();
9839 assert(!CVT.isVector() && "Must not broadcast a vector type");
9840
9841 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9842 // For size optimization, also splat v2f64 and v2i64, and for size opt
9843 // with AVX2, also splat i8 and i16.
9844 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9845 if (ScalarSize == 32 ||
9846 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9847 CVT == MVT::f16 ||
9848 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9849 const Constant *C = nullptr;
9850 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9851 C = CI->getConstantIntValue();
9852 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9853 C = CF->getConstantFPValue();
9854
9855 assert(C && "Invalid constant type");
9856
9857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9858 SDValue CP =
9859 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9860 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9861
9862 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9863 SDValue Ops[] = {DAG.getEntryNode(), CP};
9864 MachinePointerInfo MPI =
9865 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9866 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9867 MPI, Alignment, MachineMemOperand::MOLoad);
9868 }
9869 }
9870
9871 // Handle AVX2 in-register broadcasts.
9872 if (!IsLoad && Subtarget.hasInt256() &&
9873 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9874 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9875
9876 // The scalar source must be a normal load.
9877 if (!IsLoad)
9878 return SDValue();
9879
9880 // Make sure the non-chain result is only used by this build vector.
9881 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9882 return SDValue();
9883
9884 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9885 (Subtarget.hasVLX() && ScalarSize == 64)) {
9886 auto *LN = cast<LoadSDNode>(Ld);
9887 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9888 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9889 SDValue BCast =
9890 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9891 LN->getMemoryVT(), LN->getMemOperand());
9892 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9893 return BCast;
9894 }
9895
9896 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9897 // match double, since there is no vbroadcastsd xmm.
9898 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9899 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9900 auto *LN = cast<LoadSDNode>(Ld);
9901 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9902 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9903 SDValue BCast =
9904 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9905 LN->getMemoryVT(), LN->getMemOperand());
9906 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9907 return BCast;
9908 }
9909
9910 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9911 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9912
9913 // Unsupported broadcast.
9914 return SDValue();
9915}
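// For illustration (hypothetical constant): with AVX2, a v8i32 build_vector
// splatting the constant 42 takes the ConstSplatVal path above; the scalar
// i32 42 is placed in the constant pool and the node is lowered to an
// X86ISD::VBROADCAST_LOAD of i32, saving constant pool space compared with a
// full 32-byte constant vector.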
9916
9917/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9918/// underlying vector and index.
9919///
9920/// Modifies \p ExtractedFromVec to the real vector and returns the real
9921/// index.
9922static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9923 SDValue ExtIdx) {
9924 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9925 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9926 return Idx;
9927
9928 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9929 // lowered this:
9930 // (extract_vector_elt (v8f32 %1), Constant<6>)
9931 // to:
9932 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9933 // (extract_subvector (v8f32 %0), Constant<4>),
9934 // undef)
9935 // Constant<0>)
9936 // In this case the vector is the extract_subvector expression and the index
9937 // is 2, as specified by the shuffle.
9938 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9939 SDValue ShuffleVec = SVOp->getOperand(0);
9940 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9941 assert(ShuffleVecVT.getVectorElementType() ==
9942 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9943
9944 int ShuffleIdx = SVOp->getMaskElt(Idx);
9945 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9946 ExtractedFromVec = ShuffleVec;
9947 return ShuffleIdx;
9948 }
9949 return Idx;
9950}
9951
9952static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9953 MVT VT = Op.getSimpleValueType();
9954
9955 // Skip if insert_vec_elt is not supported.
9956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9957 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9958 return SDValue();
9959
9960 SDLoc DL(Op);
9961 unsigned NumElems = Op.getNumOperands();
9962
9963 SDValue VecIn1;
9964 SDValue VecIn2;
9965 SmallVector<unsigned, 4> InsertIndices;
9966 SmallVector<int, 8> Mask(NumElems, -1);
9967
9968 for (unsigned i = 0; i != NumElems; ++i) {
9969 unsigned Opc = Op.getOperand(i).getOpcode();
9970
9971 if (Opc == ISD::UNDEF)
9972 continue;
9973
9974 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9975 // Quit if more than 1 element needs inserting.
9976 if (InsertIndices.size() > 1)
9977 return SDValue();
9978
9979 InsertIndices.push_back(i);
9980 continue;
9981 }
9982
9983 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9984 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9985
9986 // Quit if non-constant index.
9987 if (!isa<ConstantSDNode>(ExtIdx))
9988 return SDValue();
9989 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9990
9991 // Quit if extracted from vector of different type.
9992 if (ExtractedFromVec.getValueType() != VT)
9993 return SDValue();
9994
9995 if (!VecIn1.getNode())
9996 VecIn1 = ExtractedFromVec;
9997 else if (VecIn1 != ExtractedFromVec) {
9998 if (!VecIn2.getNode())
9999 VecIn2 = ExtractedFromVec;
10000 else if (VecIn2 != ExtractedFromVec)
10001 // Quit if more than 2 vectors to shuffle
10002 return SDValue();
10003 }
10004
10005 if (ExtractedFromVec == VecIn1)
10006 Mask[i] = Idx;
10007 else if (ExtractedFromVec == VecIn2)
10008 Mask[i] = Idx + NumElems;
10009 }
10010
10011 if (!VecIn1.getNode())
10012 return SDValue();
10013
10014 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10015 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10016
10017 for (unsigned Idx : InsertIndices)
10018 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10019 DAG.getIntPtrConstant(Idx, DL));
10020
10021 return NV;
10022}
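// For illustration (hypothetical operands): a v4i32 build_vector of
// (extract_elt %A, 0), (extract_elt %A, 1), (extract_elt %A, 2), %s becomes a
// vector_shuffle<0,1,2,u> of %A followed by a single INSERT_VECTOR_ELT of %s
// at index 3.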
10023
10024// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10025static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10026 const X86Subtarget &Subtarget) {
10027 MVT VT = Op.getSimpleValueType();
10028 MVT IVT = VT.changeVectorElementTypeToInteger();
10029 SmallVector<SDValue, 16> NewOps;
10030 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10031 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10032 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10033 return DAG.getBitcast(VT, Res);
10034}
10035
10036// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10037static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10038 const X86Subtarget &Subtarget) {
10039
10040 MVT VT = Op.getSimpleValueType();
10041 assert((VT.getVectorElementType() == MVT::i1) &&
10042 "Unexpected type in LowerBUILD_VECTORvXi1!");
10043
10044 SDLoc dl(Op);
10045 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10046 ISD::isBuildVectorAllOnes(Op.getNode()))
10047 return Op;
10048
10049 uint64_t Immediate = 0;
10050 SmallVector<unsigned, 16> NonConstIdx;
10051 bool IsSplat = true;
10052 bool HasConstElts = false;
10053 int SplatIdx = -1;
10054 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10055 SDValue In = Op.getOperand(idx);
10056 if (In.isUndef())
10057 continue;
10058 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10059 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10060 HasConstElts = true;
10061 } else {
10062 NonConstIdx.push_back(idx);
10063 }
10064 if (SplatIdx < 0)
10065 SplatIdx = idx;
10066 else if (In != Op.getOperand(SplatIdx))
10067 IsSplat = false;
10068 }
10069
10070 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10071 if (IsSplat) {
10072 // The build_vector allows the scalar element to be larger than the vector
10073 // element type. We need to mask it to use as a condition unless we know
10074 // the upper bits are zero.
10075 // FIXME: Use computeKnownBits instead of checking specific opcode?
10076 SDValue Cond = Op.getOperand(SplatIdx);
10077 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10078 if (Cond.getOpcode() != ISD::SETCC)
10079 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10080 DAG.getConstant(1, dl, MVT::i8));
10081
10082 // Perform the select in the scalar domain so we can use cmov.
10083 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10084 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10085 DAG.getAllOnesConstant(dl, MVT::i32),
10086 DAG.getConstant(0, dl, MVT::i32));
10087 Select = DAG.getBitcast(MVT::v32i1, Select);
10088 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10089 } else {
10090 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10091 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10092 DAG.getAllOnesConstant(dl, ImmVT),
10093 DAG.getConstant(0, dl, ImmVT));
10094 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10095 Select = DAG.getBitcast(VecVT, Select);
10096 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10097 DAG.getIntPtrConstant(0, dl));
10098 }
10099 }
10100
10101 // insert elements one by one
10102 SDValue DstVec;
10103 if (HasConstElts) {
10104 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10105 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10106 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10107 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10108 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10109 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10110 } else {
10111 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10112 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10113 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10114 DstVec = DAG.getBitcast(VecVT, Imm);
10115 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10116 DAG.getIntPtrConstant(0, dl));
10117 }
10118 } else
10119 DstVec = DAG.getUNDEF(VT);
10120
10121 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10122 unsigned InsertIdx = NonConstIdx[i];
10123 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10124 Op.getOperand(InsertIdx),
10125 DAG.getIntPtrConstant(InsertIdx, dl));
10126 }
10127 return DstVec;
10128}
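// A rough worked example (hypothetical constants): an all-constant v8i1
// build_vector of <1,0,1,1,0,0,0,0> yields Immediate = 0b00001101 = 0xD, which
// is materialized as an i8 constant and bitcast to v8i1 with no per-element
// inserts.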
10129
10130 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10131 switch (Opcode) {
10132 case X86ISD::PACKSS:
10133 case X86ISD::PACKUS:
10134 case X86ISD::FHADD:
10135 case X86ISD::FHSUB:
10136 case X86ISD::HADD:
10137 case X86ISD::HSUB:
10138 return true;
10139 }
10140 return false;
10141}
10142
10143/// This is a helper function of LowerToHorizontalOp().
10144/// This function checks that the build_vector \p N in input implements a
10145/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10146/// may not match the layout of an x86 256-bit horizontal instruction.
10147/// In other words, if this returns true, then some extraction/insertion will
10148/// be required to produce a valid horizontal instruction.
10149///
10150/// Parameter \p Opcode defines the kind of horizontal operation to match.
10151/// For example, if \p Opcode is equal to ISD::ADD, then this function
10152/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10153/// is equal to ISD::SUB, then this function checks if this is a horizontal
10154/// arithmetic sub.
10155///
10156/// This function only analyzes elements of \p N whose indices are
10157/// in range [BaseIdx, LastIdx).
10158///
10159/// TODO: This function was originally used to match both real and fake partial
10160/// horizontal operations, but the index-matching logic is incorrect for that.
10161/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10162/// code because it is only used for partial h-op matching now?
10163static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10164 SelectionDAG &DAG,
10165 unsigned BaseIdx, unsigned LastIdx,
10166 SDValue &V0, SDValue &V1) {
10167 EVT VT = N->getValueType(0);
10168 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10169 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10170 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10171 "Invalid Vector in input!");
10172
10173 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10174 bool CanFold = true;
10175 unsigned ExpectedVExtractIdx = BaseIdx;
10176 unsigned NumElts = LastIdx - BaseIdx;
10177 V0 = DAG.getUNDEF(VT);
10178 V1 = DAG.getUNDEF(VT);
10179
10180 // Check if N implements a horizontal binop.
10181 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10182 SDValue Op = N->getOperand(i + BaseIdx);
10183
10184 // Skip UNDEFs.
10185 if (Op->isUndef()) {
10186 // Update the expected vector extract index.
10187 if (i * 2 == NumElts)
10188 ExpectedVExtractIdx = BaseIdx;
10189 ExpectedVExtractIdx += 2;
10190 continue;
10191 }
10192
10193 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10194
10195 if (!CanFold)
10196 break;
10197
10198 SDValue Op0 = Op.getOperand(0);
10199 SDValue Op1 = Op.getOperand(1);
10200
10201 // Try to match the following pattern:
10202 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10203 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10204 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10205 Op0.getOperand(0) == Op1.getOperand(0) &&
10206 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10207 isa<ConstantSDNode>(Op1.getOperand(1)));
10208 if (!CanFold)
10209 break;
10210
10211 unsigned I0 = Op0.getConstantOperandVal(1);
10212 unsigned I1 = Op1.getConstantOperandVal(1);
10213
10214 if (i * 2 < NumElts) {
10215 if (V0.isUndef()) {
10216 V0 = Op0.getOperand(0);
10217 if (V0.getValueType() != VT)
10218 return false;
10219 }
10220 } else {
10221 if (V1.isUndef()) {
10222 V1 = Op0.getOperand(0);
10223 if (V1.getValueType() != VT)
10224 return false;
10225 }
10226 if (i * 2 == NumElts)
10227 ExpectedVExtractIdx = BaseIdx;
10228 }
10229
10230 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10231 if (I0 == ExpectedVExtractIdx)
10232 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10233 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10234 // Try to match the following dag sequence:
10235 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10236 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10237 } else
10238 CanFold = false;
10239
10240 ExpectedVExtractIdx += 2;
10241 }
10242
10243 return CanFold;
10244}
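// For illustration (hypothetical input): with BaseIdx = 0 and LastIdx = 4 on a
// v8f32 build_vector, element i is expected to be
// (fadd (extract_elt A, 2*i), (extract_elt A, 2*i+1)), i.e. consecutive
// even/odd lanes of a single source vector, the 128-bit half of an x86 HADDPS.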
10245
10246/// Emit a sequence of two 128-bit horizontal add/sub followed by
10247/// a concat_vector.
10248///
10249/// This is a helper function of LowerToHorizontalOp().
10250/// This function expects two 256-bit vectors called V0 and V1.
10251/// At first, each vector is split into two separate 128-bit vectors.
10252/// Then, the resulting 128-bit vectors are used to implement two
10253/// horizontal binary operations.
10254///
10255/// The kind of horizontal binary operation is defined by \p X86Opcode.
10256///
10257/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
10258/// the two new horizontal binop.
10259/// When Mode is set, the first horizontal binop dag node would take as input
10260/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10261/// horizontal binop dag node would take as input the lower 128-bit of V1
10262/// and the upper 128-bit of V1.
10263/// Example:
10264/// HADD V0_LO, V0_HI
10265/// HADD V1_LO, V1_HI
10266///
10267/// Otherwise, the first horizontal binop dag node takes as input the lower
10268/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10269/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10270/// Example:
10271/// HADD V0_LO, V1_LO
10272/// HADD V0_HI, V1_HI
10273///
10274/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10275/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10276/// the upper 128-bits of the result.
10277static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10278 const SDLoc &DL, SelectionDAG &DAG,
10279 unsigned X86Opcode, bool Mode,
10280 bool isUndefLO, bool isUndefHI) {
10281 MVT VT = V0.getSimpleValueType();
10282 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10283 "Invalid nodes in input!");
10284
10285 unsigned NumElts = VT.getVectorNumElements();
10286 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10287 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10288 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10289 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10290 MVT NewVT = V0_LO.getSimpleValueType();
10291
10292 SDValue LO = DAG.getUNDEF(NewVT);
10293 SDValue HI = DAG.getUNDEF(NewVT);
10294
10295 if (Mode) {
10296 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10297 if (!isUndefLO && !V0->isUndef())
10298 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10299 if (!isUndefHI && !V1->isUndef())
10300 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10301 } else {
10302 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10303 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10304 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10305
10306 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10307 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10308 }
10309
10310 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10311}
10312
10313/// Returns true iff \p BV builds a vector with the result equivalent to
10314/// the result of ADDSUB/SUBADD operation.
10315/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10316/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10317/// \p Opnd0 and \p Opnd1.
10318static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10319 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10320 SDValue &Opnd0, SDValue &Opnd1,
10321 unsigned &NumExtracts,
10322 bool &IsSubAdd) {
10323
10324 MVT VT = BV->getSimpleValueType(0);
10325 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10326 return false;
10327
10328 unsigned NumElts = VT.getVectorNumElements();
10329 SDValue InVec0 = DAG.getUNDEF(VT);
10330 SDValue InVec1 = DAG.getUNDEF(VT);
10331
10332 NumExtracts = 0;
10333
10334 // Odd-numbered elements in the input build vector are obtained from
10335 // adding/subtracting two integer/float elements.
10336 // Even-numbered elements in the input build vector are obtained from
10337 // subtracting/adding two integer/float elements.
10338 unsigned Opc[2] = {0, 0};
10339 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10340 SDValue Op = BV->getOperand(i);
10341
10342 // Skip 'undef' values.
10343 unsigned Opcode = Op.getOpcode();
10344 if (Opcode == ISD::UNDEF)
10345 continue;
10346
10347 // Early exit if we found an unexpected opcode.
10348 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10349 return false;
10350
10351 SDValue Op0 = Op.getOperand(0);
10352 SDValue Op1 = Op.getOperand(1);
10353
10354 // Try to match the following pattern:
10355 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10356 // Early exit if we cannot match that sequence.
10357 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10358 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10359 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10360 Op0.getOperand(1) != Op1.getOperand(1))
10361 return false;
10362
10363 unsigned I0 = Op0.getConstantOperandVal(1);
10364 if (I0 != i)
10365 return false;
10366
10367 // We found a valid add/sub node; make sure it's the same opcode as previous
10368 // elements for this parity.
10369 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10370 return false;
10371 Opc[i % 2] = Opcode;
10372
10373 // Update InVec0 and InVec1.
10374 if (InVec0.isUndef()) {
10375 InVec0 = Op0.getOperand(0);
10376 if (InVec0.getSimpleValueType() != VT)
10377 return false;
10378 }
10379 if (InVec1.isUndef()) {
10380 InVec1 = Op1.getOperand(0);
10381 if (InVec1.getSimpleValueType() != VT)
10382 return false;
10383 }
10384
10385 // Make sure that the operands of each add/sub node always
10386 // come from the same pair of vectors.
10387 if (InVec0 != Op0.getOperand(0)) {
10388 if (Opcode == ISD::FSUB)
10389 return false;
10390
10391 // FADD is commutable. Try to commute the operands
10392 // and then test again.
10393 std::swap(Op0, Op1);
10394 if (InVec0 != Op0.getOperand(0))
10395 return false;
10396 }
10397
10398 if (InVec1 != Op1.getOperand(0))
10399 return false;
10400
10401 // Increment the number of extractions done.
10402 ++NumExtracts;
10403 }
10404
10405 // Ensure we have found an opcode for both parities and that they are
10406 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10407 // inputs are undef.
10408 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10409 InVec0.isUndef() || InVec1.isUndef())
10410 return false;
10411
10412 IsSubAdd = Opc[0] == ISD::FADD;
10413
10414 Opnd0 = InVec0;
10415 Opnd1 = InVec1;
10416 return true;
10417}
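// A rough worked example (hypothetical operands): a v2f64 build_vector of
// (fsub (extract_elt %A, 0), (extract_elt %B, 0)) and
// (fadd (extract_elt %A, 1), (extract_elt %B, 1)) matches with
// Opc = {FSUB, FADD}, so IsSubAdd is false and the caller can form
// X86ISD::ADDSUB %A, %B.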
10418
10419 /// Returns true if it is possible to fold MUL and an idiom that has already been
10420/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10421/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10422/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10423///
10424/// Prior to calling this function it should be known that there is some
10425/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10426/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10427/// before replacement of such SDNode with ADDSUB operation. Thus the number
10428/// of \p Opnd0 uses is expected to be equal to 2.
10429/// For example, this function may be called for the following IR:
10430/// %AB = fmul fast <2 x double> %A, %B
10431/// %Sub = fsub fast <2 x double> %AB, %C
10432/// %Add = fadd fast <2 x double> %AB, %C
10433/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10434/// <2 x i32> <i32 0, i32 3>
10435/// There is a def for %Addsub here, which potentially can be replaced by
10436/// X86ISD::ADDSUB operation:
10437/// %Addsub = X86ISD::ADDSUB %AB, %C
10438/// and such ADDSUB can further be replaced with FMADDSUB:
10439/// %Addsub = FMADDSUB %A, %B, %C.
10440///
10441/// The main reason why this method is called before the replacement of the
10442/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
10443/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10444/// FMADDSUB is.
10445static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10446 SelectionDAG &DAG,
10447 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10448 unsigned ExpectedUses) {
10449 if (Opnd0.getOpcode() != ISD::FMUL ||
10450 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10451 return false;
10452
10453 // FIXME: These checks must match the similar ones in
10454 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10455 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10456 // or MUL + ADDSUB to FMADDSUB.
10457 const TargetOptions &Options = DAG.getTarget().Options;
10458 bool AllowFusion =
10459 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10460 if (!AllowFusion)
10461 return false;
10462
10463 Opnd2 = Opnd1;
10464 Opnd1 = Opnd0.getOperand(1);
10465 Opnd0 = Opnd0.getOperand(0);
10466
10467 return true;
10468}
10469
10470/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
10471/// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB,
10472/// or X86ISD::FMSUBADD node.
10473static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10474 const X86Subtarget &Subtarget,
10475 SelectionDAG &DAG) {
10476 SDValue Opnd0, Opnd1;
10477 unsigned NumExtracts;
10478 bool IsSubAdd;
10479 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10480 IsSubAdd))
10481 return SDValue();
10482
10483 MVT VT = BV->getSimpleValueType(0);
10484 SDLoc DL(BV);
10485
10486 // Try to generate X86ISD::FMADDSUB node here.
10487 SDValue Opnd2;
10488 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10489 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10490 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10491 }
10492
10493 // We only support ADDSUB.
10494 if (IsSubAdd)
10495 return SDValue();
10496
10497 // There are no known X86 targets with 512-bit ADDSUB instructions!
10498 // Convert to blend(fsub,fadd).
10499 if (VT.is512BitVector()) {
10500 SmallVector<int> Mask;
10501 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10502 Mask.push_back(I);
10503 Mask.push_back(I + E + 1);
10504 }
10505 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10506 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10507 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10508 }
10509
10510 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10511}
10512
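// Illustrative sketch, not part of X86ISelLowering.cpp: the blend mask built
// by the 512-bit blend(fsub,fadd) fallback above, checked standalone for
// v8f64 (E = 8). Even result lanes come from the FSUB node and odd lanes from
// the FADD node, so the shuffle mask is {0, 9, 2, 11, 4, 13, 6, 15}. Helper
// names are invented for this example.
#include <cassert>
#include <vector>
namespace {
std::vector<int> addsubBlendMask(int E) {
  std::vector<int> Mask;
  for (int I = 0; I != E; I += 2) {
    Mask.push_back(I);         // lane I     <- FSUB result, element I
    Mask.push_back(I + E + 1); // lane I + 1 <- FADD result, element I + 1
  }
  return Mask;
}
void checkAddsubBlendMask() {
  assert((addsubBlendMask(8) == std::vector<int>{0, 9, 2, 11, 4, 13, 6, 15}));
}
} // namespace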
10513static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10514 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10515 // Initialize outputs to known values.
10516 MVT VT = BV->getSimpleValueType(0);
10517 HOpcode = ISD::DELETED_NODE;
10518 V0 = DAG.getUNDEF(VT);
10519 V1 = DAG.getUNDEF(VT);
10520
10521 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10522 // half of the result is calculated independently from the 128-bit halves of
10523 // the inputs, so that makes the index-checking logic below more complicated.
10524 unsigned NumElts = VT.getVectorNumElements();
10525 unsigned GenericOpcode = ISD::DELETED_NODE;
10526 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10527 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10528 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10529 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10530 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10531 // Ignore undef elements.
10532 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10533 if (Op.isUndef())
10534 continue;
10535
10536 // If there's an opcode mismatch, we're done.
10537 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10538 return false;
10539
10540 // Initialize horizontal opcode.
10541 if (HOpcode == ISD::DELETED_NODE) {
10542 GenericOpcode = Op.getOpcode();
10543 switch (GenericOpcode) {
10544 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10545 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10546 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10547 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10548 default: return false;
10549 }
10550 }
10551
10552 SDValue Op0 = Op.getOperand(0);
10553 SDValue Op1 = Op.getOperand(1);
10554 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10555 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10556 Op0.getOperand(0) != Op1.getOperand(0) ||
10557 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10558 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10559 return false;
10560
10561 // The source vector is chosen based on which 64-bit half of the
10562 // destination vector is being calculated.
10563 if (j < NumEltsIn64Bits) {
10564 if (V0.isUndef())
10565 V0 = Op0.getOperand(0);
10566 } else {
10567 if (V1.isUndef())
10568 V1 = Op0.getOperand(0);
10569 }
10570
10571 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10572 if (SourceVec != Op0.getOperand(0))
10573 return false;
10574
10575 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10576 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10577 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10578 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10579 (j % NumEltsIn64Bits) * 2;
10580 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10581 continue;
10582
10583 // If this is not a commutative op, this does not match.
10584 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10585 return false;
10586
10587 // Addition is commutative, so try swapping the extract indexes.
10588 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10589 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10590 continue;
10591
10592 // Extract indexes do not match horizontal requirement.
10593 return false;
10594 }
10595 }
10596 // We matched. Opcode and operands are returned by reference as arguments.
10597 return true;
10598}
10599
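// Illustrative sketch, not part of X86ISelLowering.cpp: the extract indices
// the matcher above expects for a 256-bit v8f32 build_vector, i.e.
// Num128BitChunks = 2, NumEltsIn128Bits = 4, NumEltsIn64Bits = 2. Each result
// lane must be built from source elements (ExpectedIndex, ExpectedIndex + 1),
// where the first 64-bit half of each 128-bit chunk reads from V0 and the
// second half reads from V1. For v8f32 this enumerates: chunk 0 -> V0(0,1),
// V0(2,3), V1(0,1), V1(2,3); chunk 1 -> V0(4,5), V0(6,7), V1(4,5), V1(6,7).
// Printing and the helper name are just for the demo.
#include <cstdio>
namespace {
void dumpHopIndexPattern() {
  const unsigned Num128BitChunks = 2, NumEltsIn128Bits = 4, NumEltsIn64Bits = 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      std::printf("chunk %u, lane %u: %s elements (%u, %u)\n", i, j,
                  j < NumEltsIn64Bits ? "V0" : "V1", ExpectedIndex,
                  ExpectedIndex + 1);
    }
}
} // namespace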
10600static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10601 SelectionDAG &DAG, unsigned HOpcode,
10602 SDValue V0, SDValue V1) {
10603 // If either input vector is not the same size as the build vector,
10604 // extract/insert the low bits to the correct size.
10605 // This is free (examples: zmm --> xmm, xmm --> ymm).
10606 MVT VT = BV->getSimpleValueType(0);
10607 unsigned Width = VT.getSizeInBits();
10608 if (V0.getValueSizeInBits() > Width)
10609 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10610 else if (V0.getValueSizeInBits() < Width)
10611 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10612
10613 if (V1.getValueSizeInBits() > Width)
10614 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10615 else if (V1.getValueSizeInBits() < Width)
10616 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10617
10618 unsigned NumElts = VT.getVectorNumElements();
10619 APInt DemandedElts = APInt::getAllOnes(NumElts);
10620 for (unsigned i = 0; i != NumElts; ++i)
10621 if (BV->getOperand(i).isUndef())
10622 DemandedElts.clearBit(i);
10623
10624 // If we don't need the upper xmm, then perform as a xmm hop.
10625 unsigned HalfNumElts = NumElts / 2;
10626 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10627 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10628 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10629 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10630 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10631 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10632 }
10633
10634 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10635}
10636
10637/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10638static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10639 const X86Subtarget &Subtarget,
10640 SelectionDAG &DAG) {
10641 // We need at least 2 non-undef elements to make this worthwhile by default.
10642 unsigned NumNonUndefs =
10643 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10644 if (NumNonUndefs < 2)
10645 return SDValue();
10646
10647 // There are 4 sets of horizontal math operations distinguished by type:
10648 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10649 // subtarget feature. Try to match those "native" patterns first.
10650 MVT VT = BV->getSimpleValueType(0);
10651 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10652 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10653 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10654 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10655 unsigned HOpcode;
10656 SDValue V0, V1;
10657 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10658 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10659 }
10660
10661 // Try harder to match 256-bit ops by using extract/concat.
10662 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10663 return SDValue();
10664
10665  // Count the number of UNDEF operands in the input build_vector.
10666 unsigned NumElts = VT.getVectorNumElements();
10667 unsigned Half = NumElts / 2;
10668 unsigned NumUndefsLO = 0;
10669 unsigned NumUndefsHI = 0;
10670 for (unsigned i = 0, e = Half; i != e; ++i)
10671 if (BV->getOperand(i)->isUndef())
10672 NumUndefsLO++;
10673
10674 for (unsigned i = Half, e = NumElts; i != e; ++i)
10675 if (BV->getOperand(i)->isUndef())
10676 NumUndefsHI++;
10677
10678 SDLoc DL(BV);
10679 SDValue InVec0, InVec1;
10680 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10681 SDValue InVec2, InVec3;
10682 unsigned X86Opcode;
10683 bool CanFold = true;
10684
10685 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10686 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10687 InVec3) &&
10688 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10689 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10690 X86Opcode = X86ISD::HADD;
10691 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10692 InVec1) &&
10693 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10694 InVec3) &&
10695 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10696 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10697 X86Opcode = X86ISD::HSUB;
10698 else
10699 CanFold = false;
10700
10701 if (CanFold) {
10702 // Do not try to expand this build_vector into a pair of horizontal
10703 // add/sub if we can emit a pair of scalar add/sub.
10704 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10705 return SDValue();
10706
10707 // Convert this build_vector into a pair of horizontal binops followed by
10708 // a concat vector. We must adjust the outputs from the partial horizontal
10709 // matching calls above to account for undefined vector halves.
10710 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10711 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10712      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10713 bool isUndefLO = NumUndefsLO == Half;
10714 bool isUndefHI = NumUndefsHI == Half;
10715 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10716 isUndefHI);
10717 }
10718 }
10719
10720 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10721 VT == MVT::v16i16) {
10722 unsigned X86Opcode;
10723 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10724 X86Opcode = X86ISD::HADD;
10725 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10726 InVec1))
10727 X86Opcode = X86ISD::HSUB;
10728 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10729 InVec1))
10730 X86Opcode = X86ISD::FHADD;
10731 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10732 InVec1))
10733 X86Opcode = X86ISD::FHSUB;
10734 else
10735 return SDValue();
10736
10737 // Don't try to expand this build_vector into a pair of horizontal add/sub
10738 // if we can simply emit a pair of scalar add/sub.
10739 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10740 return SDValue();
10741
10742 // Convert this build_vector into two horizontal add/sub followed by
10743 // a concat vector.
10744 bool isUndefLO = NumUndefsLO == Half;
10745 bool isUndefHI = NumUndefsHI == Half;
10746 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10747 isUndefLO, isUndefHI);
10748 }
10749
10750 return SDValue();
10751}
10752
10753static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10754 SelectionDAG &DAG);
10755
10756/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10757/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
10758/// just apply the bit operation to the vectors.
10759/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10760/// from this, but enough scalar bit operations are created from the later
10761/// legalization + scalarization stages to need basic support.
10762static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10763 const X86Subtarget &Subtarget,
10764 SelectionDAG &DAG) {
10765 SDLoc DL(Op);
10766 MVT VT = Op->getSimpleValueType(0);
10767 unsigned NumElems = VT.getVectorNumElements();
10768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10769
10770 // Check that all elements have the same opcode.
10771 // TODO: Should we allow UNDEFS and if so how many?
10772 unsigned Opcode = Op->getOperand(0).getOpcode();
10773 for (unsigned i = 1; i < NumElems; ++i)
10774 if (Opcode != Op->getOperand(i).getOpcode())
10775 return SDValue();
10776
10777 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10778 bool IsShift = false;
10779 switch (Opcode) {
10780 default:
10781 return SDValue();
10782 case ISD::SHL:
10783 case ISD::SRL:
10784 case ISD::SRA:
10785 IsShift = true;
10786 break;
10787 case ISD::AND:
10788 case ISD::XOR:
10789 case ISD::OR:
10790 // Don't do this if the buildvector is a splat - we'd replace one
10791 // constant with an entire vector.
10792 if (Op->getSplatValue())
10793 return SDValue();
10794 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10795 return SDValue();
10796 break;
10797 }
10798
10799 SmallVector<SDValue, 4> LHSElts, RHSElts;
10800 for (SDValue Elt : Op->ops()) {
10801 SDValue LHS = Elt.getOperand(0);
10802 SDValue RHS = Elt.getOperand(1);
10803
10804 // We expect the canonicalized RHS operand to be the constant.
10805 if (!isa<ConstantSDNode>(RHS))
10806 return SDValue();
10807
10808 // Extend shift amounts.
10809 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10810 if (!IsShift)
10811 return SDValue();
10812 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10813 }
10814
10815 LHSElts.push_back(LHS);
10816 RHSElts.push_back(RHS);
10817 }
10818
10819 // Limit to shifts by uniform immediates.
10820 // TODO: Only accept vXi8/vXi64 special cases?
10821 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10822 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10823 return SDValue();
10824
10825 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10826 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10827 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10828
10829 if (!IsShift)
10830 return Res;
10831
10832 // Immediately lower the shift to ensure the constant build vector doesn't
10833 // get converted to a constant pool before the shift is lowered.
10834 return LowerShift(Res, Subtarget, DAG);
10835}
10836
10837/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10838/// functionality to do this, so it's all zeros, all ones, or some derivation
10839/// that is cheap to calculate.
10840static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10841 const X86Subtarget &Subtarget) {
10842 SDLoc DL(Op);
10843 MVT VT = Op.getSimpleValueType();
10844
10845 // Vectors containing all zeros can be matched by pxor and xorps.
10846 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10847 return Op;
10848
10849 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10850 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10851 // vpcmpeqd on 256-bit vectors.
10852 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10853 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10854 return Op;
10855
10856 return getOnesVector(VT, DAG, DL);
10857 }
10858
10859 return SDValue();
10860}
10861
10862/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10863/// from a vector of source values and a vector of extraction indices.
10864/// The vectors might be manipulated to match the type of the permute op.
10865static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10866 SDLoc &DL, SelectionDAG &DAG,
10867 const X86Subtarget &Subtarget) {
10868 MVT ShuffleVT = VT;
10869 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10870 unsigned NumElts = VT.getVectorNumElements();
10871 unsigned SizeInBits = VT.getSizeInBits();
10872
10873 // Adjust IndicesVec to match VT size.
10874  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10875         "Illegal variable permute mask size");
10876 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10877 // Narrow/widen the indices vector to the correct size.
10878 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10879 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10880 NumElts * VT.getScalarSizeInBits());
10881 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10882 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10883 SDLoc(IndicesVec), SizeInBits);
10884 // Zero-extend the index elements within the vector.
10885 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10886 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10887 IndicesVT, IndicesVec);
10888 }
10889 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10890
10891  // Handle a SrcVec whose type doesn't match VT.
10892 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10893 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10894 // Handle larger SrcVec by treating it as a larger permute.
10895 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10896 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10897 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10898 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10899 Subtarget, DAG, SDLoc(IndicesVec));
10900 SDValue NewSrcVec =
10901 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10902 if (NewSrcVec)
10903 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10904 return SDValue();
10905 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10906 // Widen smaller SrcVec to match VT.
10907 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10908 } else
10909 return SDValue();
10910 }
10911
10912 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10913    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10914 EVT SrcVT = Idx.getValueType();
10915 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10916 uint64_t IndexScale = 0;
10917 uint64_t IndexOffset = 0;
10918
10919 // If we're scaling a smaller permute op, then we need to repeat the
10920 // indices, scaling and offsetting them as well.
10921 // e.g. v4i32 -> v16i8 (Scale = 4)
10922 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10923 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10924 for (uint64_t i = 0; i != Scale; ++i) {
10925 IndexScale |= Scale << (i * NumDstBits);
10926 IndexOffset |= i << (i * NumDstBits);
10927 }
10928
10929 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10930 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10931 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10932 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10933 return Idx;
10934 };
10935
10936 unsigned Opcode = 0;
10937 switch (VT.SimpleTy) {
10938 default:
10939 break;
10940 case MVT::v16i8:
10941 if (Subtarget.hasSSSE3())
10942 Opcode = X86ISD::PSHUFB;
10943 break;
10944 case MVT::v8i16:
10945 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10946 Opcode = X86ISD::VPERMV;
10947 else if (Subtarget.hasSSSE3()) {
10948 Opcode = X86ISD::PSHUFB;
10949 ShuffleVT = MVT::v16i8;
10950 }
10951 break;
10952 case MVT::v4f32:
10953 case MVT::v4i32:
10954 if (Subtarget.hasAVX()) {
10955 Opcode = X86ISD::VPERMILPV;
10956 ShuffleVT = MVT::v4f32;
10957 } else if (Subtarget.hasSSSE3()) {
10958 Opcode = X86ISD::PSHUFB;
10959 ShuffleVT = MVT::v16i8;
10960 }
10961 break;
10962 case MVT::v2f64:
10963 case MVT::v2i64:
10964 if (Subtarget.hasAVX()) {
10965 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10966 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10967 Opcode = X86ISD::VPERMILPV;
10968 ShuffleVT = MVT::v2f64;
10969 } else if (Subtarget.hasSSE41()) {
10970 // SSE41 can compare v2i64 - select between indices 0 and 1.
10971 return DAG.getSelectCC(
10972 DL, IndicesVec,
10973 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10974 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10975 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10976 ISD::CondCode::SETEQ);
10977 }
10978 break;
10979 case MVT::v32i8:
10980 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10981 Opcode = X86ISD::VPERMV;
10982 else if (Subtarget.hasXOP()) {
10983 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10984 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10985 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10986 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10987 return DAG.getNode(
10988 ISD::CONCAT_VECTORS, DL, VT,
10989 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10990 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10991 } else if (Subtarget.hasAVX()) {
10992 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10993 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10994 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10995 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10996 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10997 ArrayRef<SDValue> Ops) {
10998 // Permute Lo and Hi and then select based on index range.
10999 // This works as SHUFB uses bits[3:0] to permute elements and we don't
11000 // care about the bit[7] as its just an index vector.
11001 SDValue Idx = Ops[2];
11002 EVT VT = Idx.getValueType();
11003 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11004 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11005 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11006 ISD::CondCode::SETGT);
11007 };
11008 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11009 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11010 PSHUFBBuilder);
11011 }
11012 break;
11013 case MVT::v16i16:
11014 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11015 Opcode = X86ISD::VPERMV;
11016 else if (Subtarget.hasAVX()) {
11017 // Scale to v32i8 and perform as v32i8.
11018 IndicesVec = ScaleIndices(IndicesVec, 2);
11019 return DAG.getBitcast(
11020 VT, createVariablePermute(
11021 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11022 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11023 }
11024 break;
11025 case MVT::v8f32:
11026 case MVT::v8i32:
11027 if (Subtarget.hasAVX2())
11028 Opcode = X86ISD::VPERMV;
11029 else if (Subtarget.hasAVX()) {
11030 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11031 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11032 {0, 1, 2, 3, 0, 1, 2, 3});
11033 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11034 {4, 5, 6, 7, 4, 5, 6, 7});
11035 if (Subtarget.hasXOP())
11036 return DAG.getBitcast(
11037 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11038 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11039 // Permute Lo and Hi and then select based on index range.
11040 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11041 SDValue Res = DAG.getSelectCC(
11042 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11043 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11044 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11045 ISD::CondCode::SETGT);
11046 return DAG.getBitcast(VT, Res);
11047 }
11048 break;
11049 case MVT::v4i64:
11050 case MVT::v4f64:
11051 if (Subtarget.hasAVX512()) {
11052 if (!Subtarget.hasVLX()) {
11053 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11054 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11055 SDLoc(SrcVec));
11056 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11057 DAG, SDLoc(IndicesVec));
11058 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11059 DAG, Subtarget);
11060 return extract256BitVector(Res, 0, DAG, DL);
11061 }
11062 Opcode = X86ISD::VPERMV;
11063 } else if (Subtarget.hasAVX()) {
11064 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11065 SDValue LoLo =
11066 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11067 SDValue HiHi =
11068 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11069 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11070 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11071 if (Subtarget.hasXOP())
11072 return DAG.getBitcast(
11073 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11074 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11075 // Permute Lo and Hi and then select based on index range.
11076 // This works as VPERMILPD only uses index bit[1] to permute elements.
11077 SDValue Res = DAG.getSelectCC(
11078 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11079 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11080 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11081 ISD::CondCode::SETGT);
11082 return DAG.getBitcast(VT, Res);
11083 }
11084 break;
11085 case MVT::v64i8:
11086 if (Subtarget.hasVBMI())
11087 Opcode = X86ISD::VPERMV;
11088 break;
11089 case MVT::v32i16:
11090 if (Subtarget.hasBWI())
11091 Opcode = X86ISD::VPERMV;
11092 break;
11093 case MVT::v16f32:
11094 case MVT::v16i32:
11095 case MVT::v8f64:
11096 case MVT::v8i64:
11097 if (Subtarget.hasAVX512())
11098 Opcode = X86ISD::VPERMV;
11099 break;
11100 }
11101 if (!Opcode)
11102 return SDValue();
11103
11104  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11105         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11106         "Illegal variable permute shuffle type");
11107
11108 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11109 if (Scale > 1)
11110 IndicesVec = ScaleIndices(IndicesVec, Scale);
11111
11112 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11113 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11114
11115 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11116 SDValue Res = Opcode == X86ISD::VPERMV
11117 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11118 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11119 return DAG.getBitcast(VT, Res);
11120}
11121
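// Illustrative sketch, not part of X86ISelLowering.cpp: a standalone check of
// the IndexScale/IndexOffset packing used by the ScaleIndices lambda in
// createVariablePermute above, for the v4i32 -> v16i8 case (Scale = 4,
// NumDstBits = 8). The helper name is invented for this example.
#include <cassert>
#include <cstdint>
namespace {
void checkScaleIndicesPacking() {
  const uint64_t Scale = 4, NumDstBits = 8;
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits); // 4 in every byte
    IndexOffset |= i << (i * NumDstBits);    // 0, 1, 2, 3 in successive bytes
  }
  // Matches the comment above: Splat(4<<24|4<<16|4<<8|4) and
  // Splat(3<<24|2<<16|1<<8|0) per 32-bit index element.
  assert(IndexScale == 0x04040404u);
  assert(IndexOffset == 0x03020100u);
}
} // namespace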
11122// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11123// reasoned to be a permutation of a vector by indices in a non-constant vector.
11124// (build_vector (extract_elt V, (extract_elt I, 0)),
11125// (extract_elt V, (extract_elt I, 1)),
11126// ...
11127// ->
11128// (vpermv I, V)
11129//
11130// TODO: Handle undefs
11131// TODO: Utilize pshufb and zero mask blending to support more efficient
11132// construction of vectors with constant-0 elements.
11133static SDValue
11134LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11135 const X86Subtarget &Subtarget) {
11136 SDValue SrcVec, IndicesVec;
11137 // Check for a match of the permute source vector and permute index elements.
11138 // This is done by checking that the i-th build_vector operand is of the form:
11139 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11140 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11141 SDValue Op = V.getOperand(Idx);
11142 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11143 return SDValue();
11144
11145 // If this is the first extract encountered in V, set the source vector,
11146 // otherwise verify the extract is from the previously defined source
11147 // vector.
11148 if (!SrcVec)
11149 SrcVec = Op.getOperand(0);
11150 else if (SrcVec != Op.getOperand(0))
11151 return SDValue();
11152 SDValue ExtractedIndex = Op->getOperand(1);
11153 // Peek through extends.
11154 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11155 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11156 ExtractedIndex = ExtractedIndex.getOperand(0);
11157 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11158 return SDValue();
11159
11160 // If this is the first extract from the index vector candidate, set the
11161 // indices vector, otherwise verify the extract is from the previously
11162 // defined indices vector.
11163 if (!IndicesVec)
11164 IndicesVec = ExtractedIndex.getOperand(0);
11165 else if (IndicesVec != ExtractedIndex.getOperand(0))
11166 return SDValue();
11167
11168 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11169 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11170 return SDValue();
11171 }
11172
11173 SDLoc DL(V);
11174 MVT VT = V.getSimpleValueType();
11175 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11176}
11177
11178SDValue
11179X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11180 SDLoc dl(Op);
11181
11182 MVT VT = Op.getSimpleValueType();
11183 MVT EltVT = VT.getVectorElementType();
11184 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11185 unsigned NumElems = Op.getNumOperands();
11186
11187 // Generate vectors for predicate vectors.
11188 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11189 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11190
11191 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11192 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11193
11194 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11195 return VectorConstant;
11196
11197 unsigned EVTBits = EltVT.getSizeInBits();
11198 APInt UndefMask = APInt::getZero(NumElems);
11199 APInt FrozenUndefMask = APInt::getZero(NumElems);
11200 APInt ZeroMask = APInt::getZero(NumElems);
11201 APInt NonZeroMask = APInt::getZero(NumElems);
11202 bool IsAllConstants = true;
11203 SmallSet<SDValue, 8> Values;
11204 unsigned NumConstants = NumElems;
11205 for (unsigned i = 0; i < NumElems; ++i) {
11206 SDValue Elt = Op.getOperand(i);
11207 if (Elt.isUndef()) {
11208 UndefMask.setBit(i);
11209 continue;
11210 }
11211 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11212 FrozenUndefMask.setBit(i);
11213 continue;
11214 }
11215 Values.insert(Elt);
11216 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
11217 IsAllConstants = false;
11218 NumConstants--;
11219 }
11220 if (X86::isZeroNode(Elt)) {
11221 ZeroMask.setBit(i);
11222 } else {
11223 NonZeroMask.setBit(i);
11224 }
11225 }
11226
11227 // All undef vector. Return an UNDEF.
11228 if (UndefMask.isAllOnes())
11229 return DAG.getUNDEF(VT);
11230
11231 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11232 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11233 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11234 // and blend the FREEZE-UNDEF operands back in.
11235 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11236 if (unsigned NumFrozenUndefElts = FrozenUndefMask.countPopulation();
11237 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11238 SmallVector<int, 16> BlendMask(NumElems, -1);
11239 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11240 for (unsigned i = 0; i < NumElems; ++i) {
11241 if (UndefMask[i]) {
11242 BlendMask[i] = -1;
11243 continue;
11244 }
11245 BlendMask[i] = i;
11246 if (!FrozenUndefMask[i])
11247 Elts[i] = Op.getOperand(i);
11248 else
11249 BlendMask[i] += NumElems;
11250 }
11251 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11252 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11253 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11254 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11255 }
11256
11257 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11258
11259 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11260 // lowering to a smaller build vector and padding with undef/zero.
11261 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11262 !isFoldableUseOfShuffle(BV)) {
11263 unsigned UpperElems = NumElems / 2;
11264 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11265 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
11266 if (NumUpperUndefsOrZeros >= UpperElems) {
11267 if (VT.is512BitVector() &&
11268 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11269 UpperElems = NumElems - (NumElems / 4);
11270 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
11271 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11272 SDValue NewBV =
11273 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11274 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11275 }
11276 }
11277
11278 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11279 return AddSub;
11280 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11281 return HorizontalOp;
11282 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11283 return Broadcast;
11284 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11285 return BitOp;
11286
11287 unsigned NumZero = ZeroMask.countPopulation();
11288 unsigned NumNonZero = NonZeroMask.countPopulation();
11289
11290 // If we are inserting one variable into a vector of non-zero constants, try
11291 // to avoid loading each constant element as a scalar. Load the constants as a
11292 // vector and then insert the variable scalar element. If insertion is not
11293 // supported, fall back to a shuffle to get the scalar blended with the
11294 // constants. Insertion into a zero vector is handled as a special-case
11295 // somewhere below here.
11296 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11297 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11298 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11299 // Create an all-constant vector. The variable element in the old
11300 // build vector is replaced by undef in the constant vector. Save the
11301 // variable scalar element and its index for use in the insertelement.
11302 LLVMContext &Context = *DAG.getContext();
11303 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11304 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11305 SDValue VarElt;
11306 SDValue InsIndex;
11307 for (unsigned i = 0; i != NumElems; ++i) {
11308 SDValue Elt = Op.getOperand(i);
11309 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11310 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11311 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11312 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11313 else if (!Elt.isUndef()) {
11314        assert(!VarElt.getNode() && !InsIndex.getNode() &&
11315               "Expected one variable element in this vector");
11316 VarElt = Elt;
11317 InsIndex = DAG.getVectorIdxConstant(i, dl);
11318 }
11319 }
11320 Constant *CV = ConstantVector::get(ConstVecOps);
11321 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11322
11323    // The constants we just created may not be legal (e.g., floating point). We
11324    // must lower the vector right here because we cannot guarantee that we'll
11325 // legalize it before loading it. This is also why we could not just create
11326 // a new build vector here. If the build vector contains illegal constants,
11327 // it could get split back up into a series of insert elements.
11328 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11329 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11330 MachineFunction &MF = DAG.getMachineFunction();
11331 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11332 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11333 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11334 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11335 if (InsertC < NumEltsInLow128Bits)
11336 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11337
11338 // There's no good way to insert into the high elements of a >128-bit
11339 // vector, so use shuffles to avoid an extract/insert sequence.
11340    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11341    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11342 SmallVector<int, 8> ShuffleMask;
11343 unsigned NumElts = VT.getVectorNumElements();
11344 for (unsigned i = 0; i != NumElts; ++i)
11345 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11346 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11347 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11348 }
11349
11350 // Special case for single non-zero, non-undef, element.
11351 if (NumNonZero == 1) {
11352 unsigned Idx = NonZeroMask.countTrailingZeros();
11353 SDValue Item = Op.getOperand(Idx);
11354
11355 // If we have a constant or non-constant insertion into the low element of
11356 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11357 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11358 // depending on what the source datatype is.
11359 if (Idx == 0) {
11360 if (NumZero == 0)
11361 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11362
11363 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11364 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11365 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11366        assert((VT.is128BitVector() || VT.is256BitVector() ||
11367                VT.is512BitVector()) &&
11368               "Expected an SSE value type!");
11369 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11370 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11371 // zero vector.
11372 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11373 }
11374
11375 // We can't directly insert an i8 or i16 into a vector, so zero extend
11376 // it to i32 first.
11377 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11378 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11379 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11380 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11381 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11382 return DAG.getBitcast(VT, Item);
11383 }
11384 }
11385
11386 // Is it a vector logical left shift?
11387 if (NumElems == 2 && Idx == 1 &&
11388 X86::isZeroNode(Op.getOperand(0)) &&
11389 !X86::isZeroNode(Op.getOperand(1))) {
11390 unsigned NumBits = VT.getSizeInBits();
11391 return getVShift(true, VT,
11392 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11393 VT, Op.getOperand(1)),
11394 NumBits/2, DAG, *this, dl);
11395 }
11396
11397 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11398 return SDValue();
11399
11400 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11401 // is a non-constant being inserted into an element other than the low one,
11402 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11403 // movd/movss) to move this into the low element, then shuffle it into
11404 // place.
11405 if (EVTBits == 32) {
11406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11407 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11408 }
11409 }
11410
11411 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11412 if (Values.size() == 1) {
11413 if (EVTBits == 32) {
11414 // Instead of a shuffle like this:
11415 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11416 // Check if it's possible to issue this instead.
11417      // shuffle (vload ptr), undef, <1, 1, 1, 1>
11418 unsigned Idx = NonZeroMask.countTrailingZeros();
11419 SDValue Item = Op.getOperand(Idx);
11420 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11421 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11422 }
11423 return SDValue();
11424 }
11425
11426 // A vector full of immediates; various special cases are already
11427 // handled, so this is best done with a single constant-pool load.
11428 if (IsAllConstants)
11429 return SDValue();
11430
11431 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11432 return V;
11433
11434 // See if we can use a vector load to get all of the elements.
11435 {
11436 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11437 if (SDValue LD =
11438 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11439 return LD;
11440 }
11441
11442 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11443 // build_vector and broadcast it.
11444 // TODO: We could probably generalize this more.
11445 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11446 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11447 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11448 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11449 // Make sure all the even/odd operands match.
11450 for (unsigned i = 2; i != NumElems; ++i)
11451 if (Ops[i % 2] != Op.getOperand(i))
11452 return false;
11453 return true;
11454 };
11455 if (CanSplat(Op, NumElems, Ops)) {
11456 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11457 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11458 // Create a new build vector and cast to v2i64/v2f64.
11459 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11460 DAG.getBuildVector(NarrowVT, dl, Ops));
11461 // Broadcast from v2i64/v2f64 and cast to final VT.
11462 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11463 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11464 NewBV));
11465 }
11466 }
11467
11468 // For AVX-length vectors, build the individual 128-bit pieces and use
11469 // shuffles to put them in place.
11470 if (VT.getSizeInBits() > 128) {
11471 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11472
11473 // Build both the lower and upper subvector.
11474 SDValue Lower =
11475 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11476 SDValue Upper = DAG.getBuildVector(
11477 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11478
11479 // Recreate the wider vector with the lower and upper part.
11480 return concatSubVectors(Lower, Upper, DAG, dl);
11481 }
11482
11483 // Let legalizer expand 2-wide build_vectors.
11484 if (EVTBits == 64) {
11485 if (NumNonZero == 1) {
11486 // One half is zero or undef.
11487 unsigned Idx = NonZeroMask.countTrailingZeros();
11488 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11489 Op.getOperand(Idx));
11490 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11491 }
11492 return SDValue();
11493 }
11494
11495 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11496 if (EVTBits == 8 && NumElems == 16)
11497 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11498 DAG, Subtarget))
11499 return V;
11500
11501 if (EltVT == MVT::i16 && NumElems == 8)
11502 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11503 DAG, Subtarget))
11504 return V;
11505
11506 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11507 if (EVTBits == 32 && NumElems == 4)
11508 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11509 return V;
11510
11511 // If element VT is == 32 bits, turn it into a number of shuffles.
11512 if (NumElems == 4 && NumZero > 0) {
11513 SmallVector<SDValue, 8> Ops(NumElems);
11514 for (unsigned i = 0; i < 4; ++i) {
11515 bool isZero = !NonZeroMask[i];
11516 if (isZero)
11517 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11518 else
11519 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11520 }
11521
11522 for (unsigned i = 0; i < 2; ++i) {
11523 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11524      default: llvm_unreachable("Unexpected NonZero count");
11525 case 0:
11526 Ops[i] = Ops[i*2]; // Must be a zero vector.
11527 break;
11528 case 1:
11529 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11530 break;
11531 case 2:
11532 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11533 break;
11534 case 3:
11535 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11536 break;
11537 }
11538 }
11539
11540 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11541 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11542 int MaskVec[] = {
11543 Reverse1 ? 1 : 0,
11544 Reverse1 ? 0 : 1,
11545 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11546 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11547 };
11548 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11549 }
11550
11551  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11552
11553  // Check for a build vector built mostly from a shuffle plus a few insertions.
11554 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11555 return Sh;
11556
11557 // For SSE 4.1, use insertps to put the high elements into the low element.
11558 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11559 SDValue Result;
11560 if (!Op.getOperand(0).isUndef())
11561 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11562 else
11563 Result = DAG.getUNDEF(VT);
11564
11565 for (unsigned i = 1; i < NumElems; ++i) {
11566 if (Op.getOperand(i).isUndef()) continue;
11567 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11568 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11569 }
11570 return Result;
11571 }
11572
11573  // Otherwise, expand into a number of unpckl*; start by extending each of
11574 // our (non-undef) elements to the full vector width with the element in the
11575 // bottom slot of the vector (which generates no code for SSE).
11576 SmallVector<SDValue, 8> Ops(NumElems);
11577 for (unsigned i = 0; i < NumElems; ++i) {
11578 if (!Op.getOperand(i).isUndef())
11579 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11580 else
11581 Ops[i] = DAG.getUNDEF(VT);
11582 }
11583
11584 // Next, we iteratively mix elements, e.g. for v4f32:
11585 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11586 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11587 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11588 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11589 // Generate scaled UNPCKL shuffle mask.
11590 SmallVector<int, 16> Mask;
11591 for(unsigned i = 0; i != Scale; ++i)
11592 Mask.push_back(i);
11593 for (unsigned i = 0; i != Scale; ++i)
11594 Mask.push_back(NumElems+i);
11595 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11596
11597 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11598 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11599 }
11600 return Ops[0];
11601}
11602
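// Illustrative sketch, not part of X86ISelLowering.cpp: the scaled UNPCKL
// masks produced by the final loop of LowerBUILD_VECTOR above for a v4f32
// build_vector (NumElems = 4), using -1 for SM_SentinelUndef. Step 1
// (Scale = 1) yields {0, 4, -1, -1}; step 2 (Scale = 2) yields {0, 1, 4, 5},
// which interleaves the two partial unpcklps results into <3, 2, 1, 0>.
// The helper name is invented for this example.
#include <cassert>
#include <vector>
namespace {
void checkUnpcklMaskSchedule() {
  const unsigned NumElems = 4;
  std::vector<std::vector<int>> Steps;
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    std::vector<int> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i);
    Mask.resize(NumElems, -1); // pad with undef sentinels
    Steps.push_back(Mask);
  }
  assert((Steps == std::vector<std::vector<int>>{{0, 4, -1, -1}, {0, 1, 4, 5}}));
}
} // namespace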
11603// 256-bit AVX can use the vinsertf128 instruction
11604// to create 256-bit vectors from two other 128-bit ones.
11605// TODO: Detect subvector broadcast here instead of DAG combine?
11606static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11607 const X86Subtarget &Subtarget) {
11608 SDLoc dl(Op);
11609 MVT ResVT = Op.getSimpleValueType();
11610
11611  assert((ResVT.is256BitVector() ||
11612          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11613
11614 unsigned NumOperands = Op.getNumOperands();
11615 unsigned NumFreezeUndef = 0;
11616 unsigned NumZero = 0;
11617 unsigned NumNonZero = 0;
11618 unsigned NonZeros = 0;
11619 for (unsigned i = 0; i != NumOperands; ++i) {
11620 SDValue SubVec = Op.getOperand(i);
11621 if (SubVec.isUndef())
11622 continue;
11623 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11624 ++NumFreezeUndef;
11625 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11626 ++NumZero;
11627 else {
11628      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11629 NonZeros |= 1 << i;
11630 ++NumNonZero;
11631 }
11632 }
11633
11634 // If we have more than 2 non-zeros, build each half separately.
11635 if (NumNonZero > 2) {
11636 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11637 ArrayRef<SDUse> Ops = Op->ops();
11638 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11639 Ops.slice(0, NumOperands/2));
11640 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11641 Ops.slice(NumOperands/2));
11642 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11643 }
11644
11645 // Otherwise, build it up through insert_subvectors.
11646 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11647 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11648 : DAG.getUNDEF(ResVT));
11649
11650 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11651 unsigned NumSubElems = SubVT.getVectorNumElements();
11652 for (unsigned i = 0; i != NumOperands; ++i) {
11653 if ((NonZeros & (1 << i)) == 0)
11654 continue;
11655
11656 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11657 Op.getOperand(i),
11658 DAG.getIntPtrConstant(i * NumSubElems, dl));
11659 }
11660
11661 return Vec;
11662}
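To make the NonZeros bookkeeping above concrete, here is a standalone sketch in plain C++, not LLVM code; the SubVecInfo struct and the concrete values are illustrative only. It classifies the concatenated operands and computes the element offset i * NumSubElems used for each insert_subvector.

#include <cassert>
#include <vector>

struct SubVecInfo { bool IsUndef, IsZero; };

int main() {
  // v16i32 = concat_vectors(v4i32 x 4): operand 0 is zero, operand 2 is undef,
  // operands 1 and 3 are non-zero, so the insert_subvector path is taken.
  std::vector<SubVecInfo> Ops = {{false, true}, {false, false},
                                 {true, false}, {false, false}};
  const unsigned NumSubElems = 4;
  unsigned NonZeros = 0, NumNonZero = 0, NumZero = 0;
  for (unsigned i = 0; i != Ops.size(); ++i) {
    if (Ops[i].IsUndef)
      continue;
    if (Ops[i].IsZero)
      ++NumZero;
    else {
      NonZeros |= 1u << i;
      ++NumNonZero;
    }
  }
  assert(NumNonZero == 2 && NumZero == 1);   // not more than 2 non-zeros
  // Operand 1 is inserted at element 4, operand 3 at element 12.
  assert((NonZeros & (1u << 1)) && 1 * NumSubElems == 4);
  assert((NonZeros & (1u << 3)) && 3 * NumSubElems == 12);
}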
11663
11664// Lower a CONCAT_VECTORS of vXi1 vectors, e.g. a type promotion (by
11665// concatenating i1 zeros) of the result of a node that already zeros all
11666// upper bits of a k-register.
11667// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11668static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11669 const X86Subtarget &Subtarget,
11670 SelectionDAG & DAG) {
11671 SDLoc dl(Op);
11672 MVT ResVT = Op.getSimpleValueType();
11673 unsigned NumOperands = Op.getNumOperands();
11674
11675 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11676 "Unexpected number of operands in CONCAT_VECTORS");
11677
11678 uint64_t Zeros = 0;
11679 uint64_t NonZeros = 0;
11680 for (unsigned i = 0; i != NumOperands; ++i) {
11681 SDValue SubVec = Op.getOperand(i);
11682 if (SubVec.isUndef())
11683 continue;
11684 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11685 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11686 Zeros |= (uint64_t)1 << i;
11687 else
11688 NonZeros |= (uint64_t)1 << i;
11689 }
11690
11691 unsigned NumElems = ResVT.getVectorNumElements();
11692
11693 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11694 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11695 // insert_subvector would give us two kshifts.
11696 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11697 Log2_64(NonZeros) != NumOperands - 1) {
11698 MVT ShiftVT = ResVT;
11699 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11700 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11701 unsigned Idx = Log2_64(NonZeros);
11702 SDValue SubVec = Op.getOperand(Idx);
11703 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11704 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11705 DAG.getUNDEF(ShiftVT), SubVec,
11706 DAG.getIntPtrConstant(0, dl));
11707 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11708 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11709 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11710 DAG.getIntPtrConstant(0, dl));
11711 }
11712
11713 // If there are zero or one non-zeros we can handle this very simply.
11714 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11715 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11716 if (!NonZeros)
11717 return Vec;
11718 unsigned Idx = Log2_64(NonZeros);
11719 SDValue SubVec = Op.getOperand(Idx);
11720 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11721 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11722 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11723 }
11724
11725 if (NumOperands > 2) {
11726 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11727 ArrayRef<SDUse> Ops = Op->ops();
11728 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11729 Ops.slice(0, NumOperands/2));
11730 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11731 Ops.slice(NumOperands/2));
11732 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11733 }
11734
11735 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11736
11737 if (ResVT.getVectorNumElements() >= 16)
11738 return Op; // The operation is legal with KUNPCK
11739
11740 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11741 DAG.getUNDEF(ResVT), Op.getOperand(0),
11742 DAG.getIntPtrConstant(0, dl));
11743 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11744 DAG.getIntPtrConstant(NumElems/2, dl));
11745}
11746
11747static SDValue LowerCONCAT_VECTORS(SDValue Op,
11748 const X86Subtarget &Subtarget,
11749 SelectionDAG &DAG) {
11750 MVT VT = Op.getSimpleValueType();
11751 if (VT.getVectorElementType() == MVT::i1)
11752 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11753
11754 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11755 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11756 Op.getNumOperands() == 4)));
11757
11758 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11759 // from two other 128-bit ones.
11760
11761 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11762 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11763}
11764
11765//===----------------------------------------------------------------------===//
11766// Vector shuffle lowering
11767//
11768// This is an experimental code path for lowering vector shuffles on x86. It is
11769// designed to handle arbitrary vector shuffles and blends, gracefully
11770// degrading performance as necessary. It works hard to recognize idiomatic
11771// shuffles and lower them to optimal instruction patterns without leaving
11772// a framework that allows reasonably efficient handling of all vector shuffle
11773// patterns.
11774//===----------------------------------------------------------------------===//
11775
11776/// Tiny helper function to identify a no-op mask.
11777///
11778/// This is a somewhat boring predicate function. It checks whether the mask
11779/// array input, which is assumed to be a single-input shuffle mask of the kind
11780/// used by the X86 shuffle instructions (not a fully general
11781/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11782/// in-place shuffle are 'no-op's.
11783static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11784 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11785 assert(Mask[i] >= -1 && "Out of bound mask element!");
11786 if (Mask[i] >= 0 && Mask[i] != i)
11787 return false;
11788 }
11789 return true;
11790}
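A minimal standalone illustration of the predicate above, written in plain C++ with std::vector; isNoop is just a stand-in name for this sketch.

#include <cassert>
#include <vector>

static bool isNoop(const std::vector<int> &Mask) {
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)   // -1 (undef) never forces a shuffle
      return false;
  return true;
}

int main() {
  assert(isNoop({0, 1, 2, 3}));      // in-place shuffle
  assert(isNoop({-1, 1, -1, 3}));    // undef elements stay in place
  assert(!isNoop({1, 0, 2, 3}));     // elements 0 and 1 actually move
}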
11791
11792/// Test whether there are elements crossing LaneSizeInBits lanes in this
11793/// shuffle mask.
11794///
11795/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11796/// and we routinely test for these.
11797static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11798 unsigned ScalarSizeInBits,
11799 ArrayRef<int> Mask) {
11800 assert(LaneSizeInBits && ScalarSizeInBits &&
11801 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11802 "Illegal shuffle lane size");
11803 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11804 int Size = Mask.size();
11805 for (int i = 0; i < Size; ++i)
11806 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11807 return true;
11808 return false;
11809}
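As a concrete illustration, here is a standalone sketch of the same check in plain C++ for a v8i32-like mask with 128-bit lanes; crossesLanes is an illustrative stand-in, not an LLVM helper.

#include <cassert>
#include <vector>

static bool crossesLanes(unsigned LaneBits, unsigned EltBits,
                         const std::vector<int> &Mask) {
  int LaneSize = (int)(LaneBits / EltBits);
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // 8 x 32-bit elements, two 128-bit lanes of four elements each.
  assert(!crossesLanes(128, 32, {0, 1, 2, 3, 4, 5, 6, 7})); // identity, in-lane
  assert(crossesLanes(128, 32, {5, 1, 2, 3, 4, 5, 6, 7}));  // elt 5 pulled into lane 0
}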
11810
11811/// Test whether there are elements crossing 128-bit lanes in this
11812/// shuffle mask.
11813static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11814 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11815}
11816
11817/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11818/// from multiple lanes - this is different from isLaneCrossingShuffleMask to
11819/// better support 'repeated mask + lane permute' style shuffles.
11820static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11821 unsigned ScalarSizeInBits,
11822 ArrayRef<int> Mask) {
11823 assert(LaneSizeInBits && ScalarSizeInBits &&
11824 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11825 "Illegal shuffle lane size");
11826 int NumElts = Mask.size();
11827 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11828 int NumLanes = NumElts / NumEltsPerLane;
11829 if (NumLanes > 1) {
11830 for (int i = 0; i != NumLanes; ++i) {
11831 int SrcLane = -1;
11832 for (int j = 0; j != NumEltsPerLane; ++j) {
11833 int M = Mask[(i * NumEltsPerLane) + j];
11834 if (M < 0)
11835 continue;
11836 int Lane = (M % NumElts) / NumEltsPerLane;
11837 if (SrcLane >= 0 && SrcLane != Lane)
11838 return true;
11839 SrcLane = Lane;
11840 }
11841 }
11842 }
11843 return false;
11844}
11845
11846/// Test whether a shuffle mask is equivalent within each sub-lane.
11847///
11848/// This checks a shuffle mask to see if it is performing the same
11849/// lane-relative shuffle in each sub-lane. This trivially implies
11850/// that it is also not lane-crossing. It may however involve a blend from the
11851/// same lane of a second vector.
11852///
11853/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11854/// non-trivial to compute in the face of undef lanes. The representation is
11855/// suitable for use with existing 128-bit shuffles as entries from the second
11856/// vector have been remapped to [LaneSize, 2*LaneSize).
11857static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11858 ArrayRef<int> Mask,
11859 SmallVectorImpl<int> &RepeatedMask) {
11860 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11861 RepeatedMask.assign(LaneSize, -1);
11862 int Size = Mask.size();
11863 for (int i = 0; i < Size; ++i) {
11864 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11865 if (Mask[i] < 0)
11866 continue;
11867 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11868 // This entry crosses lanes, so there is no way to model this shuffle.
11869 return false;
11870
11871 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11872 // Adjust second vector indices to start at LaneSize instead of Size.
11873 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11874 : Mask[i] % LaneSize + LaneSize;
11875 if (RepeatedMask[i % LaneSize] < 0)
11876 // This is the first non-undef entry in this slot of a 128-bit lane.
11877 RepeatedMask[i % LaneSize] = LocalM;
11878 else if (RepeatedMask[i % LaneSize] != LocalM)
11879 // Found a mismatch with the repeated mask.
11880 return false;
11881 }
11882 return true;
11883}
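The following standalone sketch shows the repetition check and the remapping of second-vector entries for a two-lane example; it is plain C++, repeats128 is an illustrative stand-in, and -1 stands for SM_SentinelUndef.

#include <cassert>
#include <vector>

static bool repeats128(const std::vector<int> &Mask, int LaneSize,
                       std::vector<int> &Repeated) {
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false;                                    // crosses lanes
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize   // first input
                                : Mask[i] % LaneSize + LaneSize; // second input
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false;                                    // mismatch between lanes
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // 8 x 32-bit blend that repeats per 128-bit lane: RepeatedMask is <0,5,2,7>.
  assert(repeats128({0, 9, 2, 11, 4, 13, 6, 15}, 4, Rep));
  assert(Rep == (std::vector<int>{0, 5, 2, 7}));
  // The two lanes disagree here, so the mask is not repeated.
  assert(!repeats128({0, 1, 2, 3, 7, 6, 5, 4}, 4, Rep));
}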
11884
11885/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11886static bool
11887is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11888 SmallVectorImpl<int> &RepeatedMask) {
11889 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11890}
11891
11892static bool
11893is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11894 SmallVector<int, 32> RepeatedMask;
11895 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11896}
11897
11898/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11899static bool
11900is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11901 SmallVectorImpl<int> &RepeatedMask) {
11902 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11903}
11904
11905/// Test whether a target shuffle mask is equivalent within each sub-lane.
11906/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11907static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11908 unsigned EltSizeInBits,
11909 ArrayRef<int> Mask,
11910 SmallVectorImpl<int> &RepeatedMask) {
11911 int LaneSize = LaneSizeInBits / EltSizeInBits;
11912 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11913 int Size = Mask.size();
11914 for (int i = 0; i < Size; ++i) {
11915 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11916 if (Mask[i] == SM_SentinelUndef)
11917 continue;
11918 if (Mask[i] == SM_SentinelZero) {
11919 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11920 return false;
11921 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11922 continue;
11923 }
11924 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11925 // This entry crosses lanes, so there is no way to model this shuffle.
11926 return false;
11927
11928 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11929 // later vector indices to start at multiples of LaneSize instead of Size.
11930 int LaneM = Mask[i] / Size;
11931 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11932 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11933 // This is the first non-undef entry in this slot of a 128-bit lane.
11934 RepeatedMask[i % LaneSize] = LocalM;
11935 else if (RepeatedMask[i % LaneSize] != LocalM)
11936 // Found a mismatch with the repeated mask.
11937 return false;
11938 }
11939 return true;
11940}
11941
11942/// Test whether a target shuffle mask is equivalent within each sub-lane.
11943/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11944static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11945 ArrayRef<int> Mask,
11946 SmallVectorImpl<int> &RepeatedMask) {
11947 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11948 Mask, RepeatedMask);
11949}
11950
11951/// Checks whether the vector elements referenced by two shuffle masks are
11952/// equivalent.
11953static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11954 int Idx, int ExpectedIdx) {
11955 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11956 ExpectedIdx < MaskSize && "Out of range element index");
11957 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11958 return false;
11959
11960 switch (Op.getOpcode()) {
11961 case ISD::BUILD_VECTOR:
11962 // If the values are build vectors, we can look through them to find
11963 // equivalent inputs that make the shuffles equivalent.
11964 // TODO: Handle MaskSize != Op.getNumOperands()?
11965 if (MaskSize == (int)Op.getNumOperands() &&
11966 MaskSize == (int)ExpectedOp.getNumOperands())
11967 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11968 break;
11969 case X86ISD::VBROADCAST:
11970 case X86ISD::VBROADCAST_LOAD:
11971 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11972 return (Op == ExpectedOp &&
11973 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11974 case X86ISD::HADD:
11975 case X86ISD::HSUB:
11976 case X86ISD::FHADD:
11977 case X86ISD::FHSUB:
11978 case X86ISD::PACKSS:
11979 case X86ISD::PACKUS:
11980 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11981 // TODO: Handle MaskSize != NumElts?
11982 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11983 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11984 MVT VT = Op.getSimpleValueType();
11985 int NumElts = VT.getVectorNumElements();
11986 if (MaskSize == NumElts) {
11987 int NumLanes = VT.getSizeInBits() / 128;
11988 int NumEltsPerLane = NumElts / NumLanes;
11989 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11990 bool SameLane =
11991 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11992 bool SameElt =
11993 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11994 return SameLane && SameElt;
11995 }
11996 }
11997 break;
11998 }
11999
12000 return false;
12001}
12002
12003/// Checks whether a shuffle mask is equivalent to an explicit list of
12004/// arguments.
12005///
12006/// This is a fast way to test a shuffle mask against a fixed pattern:
12007///
12008/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12009///
12010/// It returns true if the mask is exactly as wide as the argument list, and
12011/// each element of the mask is either -1 (signifying undef) or the value given
12012/// in the argument.
12013static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12014 SDValue V1 = SDValue(),
12015 SDValue V2 = SDValue()) {
12016 int Size = Mask.size();
12017 if (Size != (int)ExpectedMask.size())
12018 return false;
12019
12020 for (int i = 0; i < Size; ++i) {
12021 assert(Mask[i] >= -1 && "Out of bound mask element!");
12022 int MaskIdx = Mask[i];
12023 int ExpectedIdx = ExpectedMask[i];
12024 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12025 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12026 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12027 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12028 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12029 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12030 return false;
12031 }
12032 }
12033 return true;
12034}
12035
12036/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12037///
12038/// The masks must be exactly the same width.
12039///
12040/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12041/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12042///
12043/// SM_SentinelZero is accepted as a valid negative index but must match in
12044/// both, or via a known bits test.
12045static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12046 ArrayRef<int> ExpectedMask,
12047 const SelectionDAG &DAG,
12048 SDValue V1 = SDValue(),
12049 SDValue V2 = SDValue()) {
12050 int Size = Mask.size();
12051 if (Size != (int)ExpectedMask.size())
12052 return false;
12053 assert(llvm::all_of(ExpectedMask,
12054 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12055 "Illegal target shuffle mask");
12056
12057 // Check for out-of-range target shuffle mask indices.
12058 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12059 return false;
12060
12061 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12062 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12063 V1 = SDValue();
12064 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12065 V2 = SDValue();
12066
12067 APInt ZeroV1 = APInt::getNullValue(Size);
12068 APInt ZeroV2 = APInt::getNullValue(Size);
12069
12070 for (int i = 0; i < Size; ++i) {
12071 int MaskIdx = Mask[i];
12072 int ExpectedIdx = ExpectedMask[i];
12073 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12074 continue;
12075 if (MaskIdx == SM_SentinelZero) {
12076 // If we need this expected index to be a zero element, then update the
12077 // relevant zero mask and perform the known bits at the end to minimize
12078 // repeated computes.
12079 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12080 if (ExpectedV &&
12081 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12082 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12083 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12084 ZeroMask.setBit(BitIdx);
12085 continue;
12086 }
12087 }
12088 if (MaskIdx >= 0) {
12089 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12090 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12091 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12092 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12093 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12094 continue;
12095 }
12096 return false;
12097 }
12098 return (ZeroV1.isNullValue() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12099 (ZeroV2.isNullValue() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12100}
12101
12102// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12103// instructions.
12104static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12105 const SelectionDAG &DAG) {
12106 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12107 return false;
12108
12109 SmallVector<int, 8> Unpcklwd;
12110 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12111 /* Unary = */ false);
12112 SmallVector<int, 8> Unpckhwd;
12113 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12114 /* Unary = */ false);
12115 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12116 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12117 return IsUnpackwdMask;
12118}
12119
12120static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12121 const SelectionDAG &DAG) {
12122 // Create 128-bit vector type based on mask size.
12123 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12124 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12125
12126 // We can't assume a canonical shuffle mask, so try the commuted version too.
12127 SmallVector<int, 4> CommutedMask(Mask);
12128 ShuffleVectorSDNode::commuteMask(CommutedMask);
12129
12130 // Match any of unary/binary or low/high.
12131 for (unsigned i = 0; i != 4; ++i) {
12132 SmallVector<int, 16> UnpackMask;
12133 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12134 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12135 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12136 return true;
12137 }
12138 return false;
12139}
12140
12141/// Return true if a shuffle mask chooses elements identically in its top and
12142/// bottom halves. For example, any splat mask has the same top and bottom
12143/// halves. If an element is undefined in only one half of the mask, the halves
12144/// are not considered identical.
12145static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12146 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12147 unsigned HalfSize = Mask.size() / 2;
12148 for (unsigned i = 0; i != HalfSize; ++i) {
12149 if (Mask[i] != Mask[i + HalfSize])
12150 return false;
12151 }
12152 return true;
12153}
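A short standalone illustration of the halves check, in plain C++; identicalHalves is a stand-in name for this sketch only.

#include <cassert>
#include <vector>

static bool identicalHalves(const std::vector<int> &Mask) {
  unsigned Half = Mask.size() / 2;
  for (unsigned i = 0; i != Half; ++i)
    if (Mask[i] != Mask[i + Half])
      return false;
  return true;
}

int main() {
  assert(identicalHalves({2, 2, 2, 2, 2, 2, 2, 2}));   // splat mask
  assert(identicalHalves({0, 1, 2, 3, 0, 1, 2, 3}));   // low half repeated
  assert(!identicalHalves({-1, 1, 2, 3, 0, 1, 2, 3})); // undef in only one half
}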
12154
12155/// Get a 4-lane 8-bit shuffle immediate for a mask.
12156///
12157/// This helper function produces an 8-bit shuffle immediate corresponding to
12158/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12159/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12160/// example.
12161///
12162/// NB: We rely heavily on "undef" masks preserving the input lane.
12163static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12164 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12165 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12166 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12167 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12168 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12169
12170 // If the mask only uses one non-undef element, then fully 'splat' it to
12171 // improve later broadcast matching.
12172 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12173 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12174
12175 int FirstElt = Mask[FirstIndex];
12176 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12177 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12178
12179 unsigned Imm = 0;
12180 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12181 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12182 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12183 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12184 return Imm;
12185}
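A standalone sketch of the 2-bits-per-lane immediate encoding above, in plain C++; encodeV4ShuffleImm is an illustrative stand-in that omits the single-element splat canonicalization performed by the real helper.

#include <array>
#include <cassert>

static unsigned encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? (int)i : Mask[i]; // undef keeps the input lane
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}

int main() {
  assert(encodeV4ShuffleImm({3, 2, 1, 0}) == 0x1B);   // full reverse
  assert(encodeV4ShuffleImm({0, 1, 2, 3}) == 0xE4);   // identity
  assert(encodeV4ShuffleImm({0, -1, 2, -1}) == 0xE4); // undefs default in place
}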
12186
12187static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12188 SelectionDAG &DAG) {
12189 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12190}
12191
12192// The shuffle result is a sequence of zeros interleaved with elements
12193// a[0], a[1], ..., a[n], n >= 0, where the a[] elements appear in ascending
12194// order. Each element of Zeroable corresponds to a particular Mask element,
12195// as described in computeZeroableShuffleElements.
12196//
12197// The function looks for a sub-mask whose nonzero elements are in
12198// increasing order; if such a sub-mask exists, it returns true.
12199static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12200 ArrayRef<int> Mask, const EVT &VectorType,
12201 bool &IsZeroSideLeft) {
12202 int NextElement = -1;
12203 // Check if the Mask's nonzero elements are in increasing order.
12204 for (int i = 0, e = Mask.size(); i < e; i++) {
12205 // Checks if the mask's zeros elements are built from only zeros.
12206 assert(Mask[i] >= -1 && "Out of bound mask element!");
12207 if (Mask[i] < 0)
12208 return false;
12209 if (Zeroable[i])
12210 continue;
12211 // Find the lowest non zero element
12212 if (NextElement < 0) {
12213 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12214 IsZeroSideLeft = NextElement != 0;
12215 }
12216 // Exit if the mask's non zero elements are not in increasing order.
12217 if (NextElement != Mask[i])
12218 return false;
12219 NextElement++;
12220 }
12221 return true;
12222}
12223
12224/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12225static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12226 ArrayRef<int> Mask, SDValue V1,
12227 SDValue V2, const APInt &Zeroable,
12228 const X86Subtarget &Subtarget,
12229 SelectionDAG &DAG) {
12230 int Size = Mask.size();
12231 int LaneSize = 128 / VT.getScalarSizeInBits();
12232 const int NumBytes = VT.getSizeInBits() / 8;
12233 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12234
12235 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12236 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12237 (Subtarget.hasBWI() && VT.is512BitVector()));
12238
12239 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12240 // Sign bit set in i8 mask means zero element.
12241 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12242
12243 SDValue V;
12244 for (int i = 0; i < NumBytes; ++i) {
12245 int M = Mask[i / NumEltBytes];
12246 if (M < 0) {
12247 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12248 continue;
12249 }
12250 if (Zeroable[i / NumEltBytes]) {
12251 PSHUFBMask[i] = ZeroMask;
12252 continue;
12253 }
12254
12255 // We can only use a single input of V1 or V2.
12256 SDValue SrcV = (M >= Size ? V2 : V1);
12257 if (V && V != SrcV)
12258 return SDValue();
12259 V = SrcV;
12260 M %= Size;
12261
12262 // PSHUFB can't cross lanes, ensure this doesn't happen.
12263 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12264 return SDValue();
12265
12266 M = M % LaneSize;
12267 M = M * NumEltBytes + (i % NumEltBytes);
12268 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12269 }
12270 assert(V && "Failed to find a source input");
12271
12272 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12273 return DAG.getBitcast(
12274 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12275 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12276}
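To illustrate the byte-mask construction above, here is a standalone sketch in plain C++ restricted to a single-input, single 128-bit lane, no-zeroable v8i16-style shuffle; the names are illustrative only.

#include <array>
#include <cassert>

int main() {
  const int NumEltBytes = 2;                       // v8i16: 2 bytes per element
  std::array<int, 8> Mask = {4, 5, 6, 7, 0, 1, 2, 3};
  std::array<unsigned char, 16> PSHUFB{};
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / NumEltBytes];                 // element selector
    PSHUFB[i] = (unsigned char)(M * NumEltBytes + (i % NumEltBytes));
  }
  // Result element 0 reads source bytes 8 and 9; element 7 reads bytes 6 and 7.
  assert(PSHUFB[0] == 8 && PSHUFB[1] == 9);
  assert(PSHUFB[14] == 6 && PSHUFB[15] == 7);
}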
12277
12278static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12279 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12280 const SDLoc &dl);
12281
12282// X86 has dedicated shuffle that can be lowered to VEXPAND
12283static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12284 const APInt &Zeroable,
12285 ArrayRef<int> Mask, SDValue &V1,
12286 SDValue &V2, SelectionDAG &DAG,
12287 const X86Subtarget &Subtarget) {
12288 bool IsLeftZeroSide = true;
12289 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12290 IsLeftZeroSide))
12291 return SDValue();
12292 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12293 MVT IntegerType =
12294 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12295 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12296 unsigned NumElts = VT.getVectorNumElements();
12297 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12298 "Unexpected number of vector elements");
12299 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12300 Subtarget, DAG, DL);
12301 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12302 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12303 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12304}
12305
12306static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12307 unsigned &UnpackOpcode, bool IsUnary,
12308 ArrayRef<int> TargetMask, const SDLoc &DL,
12309 SelectionDAG &DAG,
12310 const X86Subtarget &Subtarget) {
12311 int NumElts = VT.getVectorNumElements();
12312
12313 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12314 for (int i = 0; i != NumElts; i += 2) {
12315 int M1 = TargetMask[i + 0];
12316 int M2 = TargetMask[i + 1];
12317 Undef1 &= (SM_SentinelUndef == M1);
12318 Undef2 &= (SM_SentinelUndef == M2);
12319 Zero1 &= isUndefOrZero(M1);
12320 Zero2 &= isUndefOrZero(M2);
12321 }
12322 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12323 "Zeroable shuffle detected");
12324
12325 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12326 SmallVector<int, 64> Unpckl, Unpckh;
12327 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12328 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12329 (IsUnary ? V1 : V2))) {
12330 UnpackOpcode = X86ISD::UNPCKL;
12331 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12332 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12333 return true;
12334 }
12335
12336 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12337 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12338 (IsUnary ? V1 : V2))) {
12339 UnpackOpcode = X86ISD::UNPCKH;
12340 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12341 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12342 return true;
12343 }
12344
12345 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12346 if (IsUnary && (Zero1 || Zero2)) {
12347 // Don't bother if we can blend instead.
12348 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12349 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12350 return false;
12351
12352 bool MatchLo = true, MatchHi = true;
12353 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12354 int M = TargetMask[i];
12355
12356 // Ignore if the input is known to be zero or the index is undef.
12357 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12358 (M == SM_SentinelUndef))
12359 continue;
12360
12361 MatchLo &= (M == Unpckl[i]);
12362 MatchHi &= (M == Unpckh[i]);
12363 }
12364
12365 if (MatchLo || MatchHi) {
12366 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12367 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12368 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12369 return true;
12370 }
12371 }
12372
12373 // If a binary shuffle, commute and try again.
12374 if (!IsUnary) {
12375 ShuffleVectorSDNode::commuteMask(Unpckl);
12376 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12377 UnpackOpcode = X86ISD::UNPCKL;
12378 std::swap(V1, V2);
12379 return true;
12380 }
12381
12382 ShuffleVectorSDNode::commuteMask(Unpckh);
12383 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12384 UnpackOpcode = X86ISD::UNPCKH;
12385 std::swap(V1, V2);
12386 return true;
12387 }
12388 }
12389
12390 return false;
12391}
12392
12393// X86 has dedicated unpack instructions that can handle specific blend
12394// operations: UNPCKH and UNPCKL.
12395static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12396 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12397 SelectionDAG &DAG) {
12398 SmallVector<int, 8> Unpckl;
12399 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12400 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12401 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12402
12403 SmallVector<int, 8> Unpckh;
12404 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12405 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12406 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12407
12408 // Commute and try again.
12409 ShuffleVectorSDNode::commuteMask(Unpckl);
12410 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12411 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12412
12413 ShuffleVectorSDNode::commuteMask(Unpckh);
12414 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12415 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12416
12417 return SDValue();
12418}
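For reference, here is a standalone sketch of the binary unpack masks being matched, in plain C++; unpackMask is an illustrative stand-in that covers only the single 128-bit lane case (wider types repeat the pattern per lane).

#include <cassert>
#include <vector>

static std::vector<int> unpackMask(unsigned NumElts, bool Lo) {
  std::vector<int> Mask;
  unsigned Offset = Lo ? 0 : NumElts / 2;
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Mask.push_back((int)(Offset + i));           // element from V1
    Mask.push_back((int)(Offset + i + NumElts)); // element from V2
  }
  return Mask;
}

int main() {
  // 4 x 32-bit: UNPCKL interleaves the low halves, UNPCKH the high halves.
  assert(unpackMask(4, /*Lo=*/true) == (std::vector<int>{0, 4, 1, 5}));
  assert(unpackMask(4, /*Lo=*/false) == (std::vector<int>{2, 6, 3, 7}));
}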
12419
12420/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12421/// followed by unpack 256-bit.
12422static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12423 ArrayRef<int> Mask, SDValue V1,
12424 SDValue V2, SelectionDAG &DAG) {
12425 SmallVector<int, 32> Unpckl, Unpckh;
12426 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12427 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12428
12429 unsigned UnpackOpcode;
12430 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12431 UnpackOpcode = X86ISD::UNPCKL;
12432 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12433 UnpackOpcode = X86ISD::UNPCKH;
12434 else
12435 return SDValue();
12436
12437 // This is a "natural" unpack operation (rather than the 128-bit sectored
12438 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12439 // input in order to use the x86 instruction.
12440 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12441 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12442 V1 = DAG.getBitcast(VT, V1);
12443 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12444}
12445
12446// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12447// source into the lower elements and zeroing the upper elements.
12448static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12449 ArrayRef<int> Mask, const APInt &Zeroable,
12450 const X86Subtarget &Subtarget) {
12451 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12452 return false;
12453
12454 unsigned NumElts = Mask.size();
12455 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12456 unsigned MaxScale = 64 / EltSizeInBits;
12457
12458 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12459 unsigned SrcEltBits = EltSizeInBits * Scale;
12460 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12461 continue;
12462 unsigned NumSrcElts = NumElts / Scale;
12463 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12464 continue;
12465 unsigned UpperElts = NumElts - NumSrcElts;
12466 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12467 continue;
12468 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12469 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12470 DstVT = MVT::getIntegerVT(EltSizeInBits);
12471 if ((NumSrcElts * EltSizeInBits) >= 128) {
12472 // ISD::TRUNCATE
12473 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12474 } else {
12475 // X86ISD::VTRUNC
12476 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12477 }
12478 return true;
12479 }
12480
12481 return false;
12482}
12483
12484// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12485// element padding to the final DstVT.
12486static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12487 const X86Subtarget &Subtarget,
12488 SelectionDAG &DAG, bool ZeroUppers) {
12489 MVT SrcVT = Src.getSimpleValueType();
12490 MVT DstSVT = DstVT.getScalarType();
12491 unsigned NumDstElts = DstVT.getVectorNumElements();
12492 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12493 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12494
12495 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12496 return SDValue();
12497
12498 // Perform a direct ISD::TRUNCATE if possible.
12499 if (NumSrcElts == NumDstElts)
12500 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12501
12502 if (NumSrcElts > NumDstElts) {
12503 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12504 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12505 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12506 }
12507
12508 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12509 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12510 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12511 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12512 DstVT.getSizeInBits());
12513 }
12514
12515 // Non-VLX targets must truncate from a 512-bit type, so we need to
12516 // widen, truncate and then possibly extract the original subvector.
12517 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12518 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12519 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12520 }
12521
12522 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12523 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12524 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12525 if (DstVT != TruncVT)
12526 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12527 DstVT.getSizeInBits());
12528 return Trunc;
12529}
12530
12531// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12532//
12533// An example is the following:
12534//
12535// t0: ch = EntryToken
12536// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12537// t25: v4i32 = truncate t2
12538// t41: v8i16 = bitcast t25
12539// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12540// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12541// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12542// t18: v2i64 = bitcast t51
12543//
12544// One can just use a single vpmovdw instruction; without avx512vl we need to
12545// use the zmm variant and extract the lower subvector, padding with zeroes.
12546// TODO: Merge with lowerShuffleAsVTRUNC.
12547static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12548 SDValue V2, ArrayRef<int> Mask,
12549 const APInt &Zeroable,
12550 const X86Subtarget &Subtarget,
12551 SelectionDAG &DAG) {
12552 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12553 if (!Subtarget.hasAVX512())
12554 return SDValue();
12555
12556 unsigned NumElts = VT.getVectorNumElements();
12557 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12558 unsigned MaxScale = 64 / EltSizeInBits;
12559 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12560 unsigned SrcEltBits = EltSizeInBits * Scale;
12561 unsigned NumSrcElts = NumElts / Scale;
12562 unsigned UpperElts = NumElts - NumSrcElts;
12563 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12564 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12565 continue;
12566
12567 // Attempt to find a matching source truncation, but as a fall back VLX
12568 // cases can use the VPMOV directly.
12569 SDValue Src = peekThroughBitcasts(V1);
12570 if (Src.getOpcode() == ISD::TRUNCATE &&
12571 Src.getScalarValueSizeInBits() == SrcEltBits) {
12572 Src = Src.getOperand(0);
12573 } else if (Subtarget.hasVLX()) {
12574 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12575 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12576 Src = DAG.getBitcast(SrcVT, Src);
12577 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12578 if (Scale == 2 &&
12579 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12580 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12581 return SDValue();
12582 } else
12583 return SDValue();
12584
12585 // VPMOVWB is only available with avx512bw.
12586 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12587 return SDValue();
12588
12589 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12590 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12591 }
12592
12593 return SDValue();
12594}
12595
12596// Attempt to match binary shuffle patterns as a truncate.
12597static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12598 SDValue V2, ArrayRef<int> Mask,
12599 const APInt &Zeroable,
12600 const X86Subtarget &Subtarget,
12601 SelectionDAG &DAG) {
12602 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12603 "Unexpected VTRUNC type");
12604 if (!Subtarget.hasAVX512())
12605 return SDValue();
12606
12607 unsigned NumElts = VT.getVectorNumElements();
12608 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12609 unsigned MaxScale = 64 / EltSizeInBits;
12610 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12611 // TODO: Support non-BWI VPMOVWB truncations?
12612 unsigned SrcEltBits = EltSizeInBits * Scale;
12613 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12614 continue;
12615
12616 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12617 // Bail if the V2 elements are undef.
12618 unsigned NumHalfSrcElts = NumElts / Scale;
12619 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12620 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12621 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12622 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12623 continue;
12624
12625 // The elements beyond the truncation must be undef/zero.
12626 unsigned UpperElts = NumElts - NumSrcElts;
12627 if (UpperElts > 0 &&
12628 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12629 continue;
12630 bool UndefUppers =
12631 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12632
12633 // For offset truncations, ensure that the concat is cheap.
12634 if (Offset) {
12635 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12636 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12637 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12638 return Lo.getOperand(0) == Hi.getOperand(0);
12639 if (ISD::isNormalLoad(Lo.getNode()) &&
12640 ISD::isNormalLoad(Hi.getNode())) {
12641 auto *LDLo = cast<LoadSDNode>(Lo);
12642 auto *LDHi = cast<LoadSDNode>(Hi);
12643 return DAG.areNonVolatileConsecutiveLoads(
12644 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12645 }
12646 return false;
12647 };
12648 if (!IsCheapConcat(V1, V2))
12649 continue;
12650 }
12651
12652 // As we're using both sources, we need to concat them together
12653 // and truncate from the double-sized src.
12654 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12655 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12656
12657 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12658 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12659 Src = DAG.getBitcast(SrcVT, Src);
12660
12661 // Shift the offset'd elements into place for the truncation.
12662 // TODO: Use getTargetVShiftByConstNode.
12663 if (Offset)
12664 Src = DAG.getNode(
12665 X86ISD::VSRLI, DL, SrcVT, Src,
12666 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12667
12668 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12669 }
12670 }
12671
12672 return SDValue();
12673}
12674
12675/// Check whether a compaction lowering can be done by dropping even/odd
12676/// elements and compute how many times even/odd elements must be dropped.
12677///
12678/// This handles shuffles which take every Nth element where N is a power of
12679/// two. Example shuffle masks:
12680///
12681/// (even)
12682/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12683/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12684/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12685/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12686/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12687/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12688///
12689/// (odd)
12690/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12691/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12692///
12693/// Any of these lanes can of course be undef.
12694///
12695/// This routine only supports N <= 3.
12696/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12697/// for larger N.
12698///
12699/// \returns N above, or the number of times even/odd elements must be dropped
12700/// if there is such a number. Otherwise returns zero.
12701static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12702 bool IsSingleInput) {
12703 // The modulus for the shuffle vector entries is based on whether this is
12704 // a single input or not.
12705 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12706 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12707 "We should only be called with masks with a power-of-2 size!");
12708
12709 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12710 int Offset = MatchEven ? 0 : 1;
12711
12712 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12713 // and 2^3 simultaneously. This is because we may have ambiguity with
12714 // partially undef inputs.
12715 bool ViableForN[3] = {true, true, true};
12716
12717 for (int i = 0, e = Mask.size(); i < e; ++i) {
12718 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12719 // want.
12720 if (Mask[i] < 0)
12721 continue;
12722
12723 bool IsAnyViable = false;
12724 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12725 if (ViableForN[j]) {
12726 uint64_t N = j + 1;
12727
12728 // The shuffle mask must be equal to (i * 2^N) % M.
12729 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12730 IsAnyViable = true;
12731 else
12732 ViableForN[j] = false;
12733 }
12734 // Early exit if we exhaust the possible powers of two.
12735 if (!IsAnyViable)
12736 break;
12737 }
12738
12739 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12740 if (ViableForN[j])
12741 return j + 1;
12742
12743 // Return 0 as there is no viable power of two.
12744 return 0;
12745}
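To make the stride test above concrete, here is a minimal standalone sketch (plain C++, no LLVM types; the helper name droppingFactor is hypothetical) that applies the same (Mask[i] - Offset) == ((i << N) & ModMask) check to a 16-element single-input mask and reports which N survives:

#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the stride exponent N in [1, 3] that Mask matches, or 0 if none does.
static int droppingFactor(const std::vector<int> &Mask, bool MatchEven,
                          bool IsSingleInput) {
  uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  int Offset = MatchEven ? 0 : 1;
  bool Viable[3] = {true, true, true};
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match every stride
    for (unsigned j = 0; j != 3; ++j)
      if (Viable[j] &&
          (uint64_t)(Mask[i] - Offset) != (((uint64_t)i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0;
}

int main() {
  // "Take every even element" of a single 16-element input: expect N = 1.
  std::vector<int> Even = {0, 2, 4, 6, 8, 10, 12, 14,
                           0, 2, 4, 6, 8, 10, 12, 14};
  std::printf("N = %d\n", droppingFactor(Even, /*MatchEven=*/true,
                                         /*IsSingleInput=*/true));
}

The N = 2 and N = 3 candidates are eliminated at index 1 (the mask wants element 2 there, while those strides would want 4 and 8), so only N = 1 remains, matching the first example mask in the comment above.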
12746
12747// X86 has dedicated pack instructions that can handle specific truncation
12748// operations: PACKSS and PACKUS.
12749// Checks for compaction shuffle masks if MaxStages > 1.
12750// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12751static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12752 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12753 const SelectionDAG &DAG,
12754 const X86Subtarget &Subtarget,
12755 unsigned MaxStages = 1) {
12756 unsigned NumElts = VT.getVectorNumElements();
12757 unsigned BitSize = VT.getScalarSizeInBits();
12758 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12759 "Illegal maximum compaction");
12760
12761 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12762 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12763 unsigned NumPackedBits = NumSrcBits - BitSize;
12764 N1 = peekThroughBitcasts(N1);
12765 N2 = peekThroughBitcasts(N2);
12766 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12767 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12768 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12769 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12770 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12771 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12772 return false;
12773 if (Subtarget.hasSSE41() || BitSize == 8) {
12774 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12775 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12776 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12777 V1 = N1;
12778 V2 = N2;
12779 SrcVT = PackVT;
12780 PackOpcode = X86ISD::PACKUS;
12781 return true;
12782 }
12783 }
12784 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12785 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12786 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12787 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12788 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12789 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12790 V1 = N1;
12791 V2 = N2;
12792 SrcVT = PackVT;
12793 PackOpcode = X86ISD::PACKSS;
12794 return true;
12795 }
12796 return false;
12797 };
12798
12799 // Attempt to match against wider and wider compaction patterns.
12800 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12801 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12802 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12803
12804 // Try binary shuffle.
12805 SmallVector<int, 32> BinaryMask;
12806 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12807 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12808 if (MatchPACK(V1, V2, PackVT))
12809 return true;
12810
12811 // Try unary shuffle.
12812 SmallVector<int, 32> UnaryMask;
12813 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12814 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12815 if (MatchPACK(V1, V1, PackVT))
12816 return true;
12817 }
12818
12819 return false;
12820}
12821
12822static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12823 SDValue V1, SDValue V2, SelectionDAG &DAG,
12824 const X86Subtarget &Subtarget) {
12825 MVT PackVT;
12826 unsigned PackOpcode;
12827 unsigned SizeBits = VT.getSizeInBits();
12828 unsigned EltBits = VT.getScalarSizeInBits();
12829 unsigned MaxStages = Log2_32(64 / EltBits);
12830 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12831 Subtarget, MaxStages))
12832 return SDValue();
12833
12834 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12835 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12836
12837 // Don't lower multi-stage packs on AVX512; truncation is better.
12838 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12839 return SDValue();
12840
12841 // Pack to the largest type possible:
12842 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12843 unsigned MaxPackBits = 16;
12844 if (CurrentEltBits > 16 &&
12845 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12846 MaxPackBits = 32;
12847
12848 // Repeatedly pack down to the target size.
12849 SDValue Res;
12850 for (unsigned i = 0; i != NumStages; ++i) {
12851 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12852 unsigned NumSrcElts = SizeBits / SrcEltBits;
12853 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12854 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12855 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12856 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12857 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12858 DAG.getBitcast(SrcVT, V2));
12859 V1 = V2 = Res;
12860 CurrentEltBits /= 2;
12861 }
12862 assert(Res && Res.getValueType() == VT &&
12863 "Failed to lower compaction shuffle");
12864 return Res;
12865}
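As a rough scalar illustration of why a PACK node can serve as the truncation here (this is a model of the instruction's lane behaviour only, not the LLVM lowering): PACKUSWB clamps each signed 16-bit lane into the unsigned 8-bit range, so once MaskedValueIsZero has proven the bits above the low byte are zero, the saturation degenerates into an exact truncation:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of one PACKUSWB lane: signed 16-bit -> unsigned 8-bit,
// saturating to [0, 255].
static uint8_t packusLane(int16_t X) {
  return (uint8_t)std::clamp<int>(X, 0, 255);
}

int main() {
  // With the upper byte of every lane known to be zero, packing equals
  // simply dropping the high byte.
  int16_t Lanes[4] = {0x0012, 0x0034, 0x0056, 0x00FF};
  for (int16_t L : Lanes)
    std::printf("%#06x -> %#04x (plain truncate: %#04x)\n",
                (unsigned)(uint16_t)L, (unsigned)packusLane(L),
                (unsigned)(uint8_t)L);
}

The PACKSS path plays the same trick with signed saturation, which is why it only needs ComputeNumSignBits to prove the value already fits in the narrower signed range.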
12866
12867/// Try to emit a bitmask instruction for a shuffle.
12868///
12869/// This handles cases where we can model a blend exactly as a bitmask due to
12870/// one of the inputs being zeroable.
12871static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12872 SDValue V2, ArrayRef<int> Mask,
12873 const APInt &Zeroable,
12874 const X86Subtarget &Subtarget,
12875 SelectionDAG &DAG) {
12876 MVT MaskVT = VT;
12877 MVT EltVT = VT.getVectorElementType();
12878 SDValue Zero, AllOnes;
12879 // Use f64 if i64 isn't legal.
12880 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12881 EltVT = MVT::f64;
12882 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12883 }
12884
12885 MVT LogicVT = VT;
12886 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12887 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12888 APFloat AllOnesValue =
12889 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12890 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12891 LogicVT =
12892 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12893 } else {
12894 Zero = DAG.getConstant(0, DL, EltVT);
12895 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12896 }
12897
12898 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12899 SDValue V;
12900 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12901 if (Zeroable[i])
12902 continue;
12903 if (Mask[i] % Size != i)
12904 return SDValue(); // Not a blend.
12905 if (!V)
12906 V = Mask[i] < Size ? V1 : V2;
12907 else if (V != (Mask[i] < Size ? V1 : V2))
12908 return SDValue(); // Can only let one input through the mask.
12909
12910 VMaskOps[i] = AllOnes;
12911 }
12912 if (!V)
12913 return SDValue(); // No non-zeroable elements!
12914
12915 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12916 VMask = DAG.getBitcast(LogicVT, VMask);
12917 V = DAG.getBitcast(LogicVT, V);
12918 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12919 return DAG.getBitcast(VT, And);
12920}
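A scalar sketch of the idea (hypothetical values, 4 x i32 lanes, plain arrays instead of SDValues): every non-zeroable lane gets an all-ones mask lane, every zeroable lane gets zero, and a single AND produces the blend-with-zero:

#include <cstdint>
#include <cstdio>

int main() {
  // The one surviving input and the zeroable pattern from the shuffle.
  uint32_t V[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  bool Zeroable[4] = {false, true, false, true};

  for (int i = 0; i != 4; ++i) {
    uint32_t MaskLane = Zeroable[i] ? 0u : 0xFFFFFFFFu; // AllOnes keeps the lane
    uint32_t Out = V[i] & MaskLane;                     // one VPAND, no blend
    std::printf("lane %d: %#010x\n", i, (unsigned)Out);
  }
}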
12921
12922/// Try to emit a blend instruction for a shuffle using bit math.
12923///
12924/// This is used as a fallback approach when first class blend instructions are
12925/// unavailable. Currently it is only suitable for integer vectors, but could
12926/// be generalized for floating point vectors if desirable.
12927static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12928 SDValue V2, ArrayRef<int> Mask,
12929 SelectionDAG &DAG) {
12930 assert(VT.isInteger() && "Only supports integer vector types!");
12931 MVT EltVT = VT.getVectorElementType();
12932 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12933 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12934 SmallVector<SDValue, 16> MaskOps;
12935 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12936 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12937 return SDValue(); // Shuffled input!
12938 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12939 }
12940
12941 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12942 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12943 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12944 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12945}
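For reference, the bit blend computes (V1 & M) | (~M & V2) per bit, where M is all-ones in lanes taken from V1; the X86ISD::ANDNP node supplies the ~M & V2 half. A minimal scalar model with a hypothetical 4-element mask:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t V1[4] = {0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA};
  uint32_t V2[4] = {0x55555555, 0x55555555, 0x55555555, 0x55555555};
  // Unshuffled blend mask: lanes 0 and 2 from V1, lanes 1 and 3 from V2
  // (Mask[i] is either i or i + Size, as the routine above requires).
  int Mask[4] = {0, 5, 2, 7};

  for (int i = 0; i != 4; ++i) {
    uint32_t M = (Mask[i] < 4) ? 0xFFFFFFFFu : 0u; // AllOnes selects V1
    uint32_t Out = (V1[i] & M) | (~M & V2[i]);     // AND + ANDN + OR
    std::printf("lane %d: %#010x\n", i, (unsigned)Out);
  }
}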
12946
12947static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12948 SDValue PreservedSrc,
12949 const X86Subtarget &Subtarget,
12950 SelectionDAG &DAG);
12951
12952static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12953 MutableArrayRef<int> Mask,
12954 const APInt &Zeroable, bool &ForceV1Zero,
12955 bool &ForceV2Zero, uint64_t &BlendMask) {
12956 bool V1IsZeroOrUndef =
12957 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12958 bool V2IsZeroOrUndef =
12959 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12960
12961 BlendMask = 0;
12962 ForceV1Zero = false, ForceV2Zero = false;
12963 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12964
12965 // Attempt to generate the binary blend mask. If an input is zero then
12966 // we can use any lane.
12967 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12968 int M = Mask[i];
12969 if (M == SM_SentinelUndef)
12970 continue;
12971 if (M == i ||
12972 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12973 Mask[i] = i;
12974 continue;
12975 }
12976 if (M == (i + Size) ||
12977 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12978 BlendMask |= 1ull << i;
12979 Mask[i] = i + Size;
12980 continue;
12981 }
12982 if (Zeroable[i]) {
12983 if (V1IsZeroOrUndef) {
12984 ForceV1Zero = true;
12985 Mask[i] = i;
12986 continue;
12987 }
12988 if (V2IsZeroOrUndef) {
12989 ForceV2Zero = true;
12990 BlendMask |= 1ull << i;
12991 Mask[i] = i + Size;
12992 continue;
12993 }
12994 }
12995 return false;
12996 }
12997 return true;
12998}
12999
13000static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13001 int Scale) {
13002 uint64_t ScaledMask = 0;
13003 for (int i = 0; i != Size; ++i)
13004 if (BlendMask & (1ull << i))
13005 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13006 return ScaledMask;
13007}
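For instance, widening a 4-lane blend mask to byte granularity with Scale = 2 turns each set bit into two adjacent set bits, so 0b0101 becomes 0b00110011. A tiny standalone check of that arithmetic (mirroring the helper above):

#include <cassert>
#include <cstdint>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  // Lanes 0 and 2 selected, each lane widening to 2 bytes.
  assert(scaleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011);
  return 0;
}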
13008
13009/// Try to emit a blend instruction for a shuffle.
13010///
13011/// This doesn't do any checks for the availability of instructions for blending
13012/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13013/// be matched in the backend with the type given. What it does check for is
13014/// that the shuffle mask is a blend, or convertible into a blend with zero.
13015static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13016 SDValue V2, ArrayRef<int> Original,
13017 const APInt &Zeroable,
13018 const X86Subtarget &Subtarget,
13019 SelectionDAG &DAG) {
13020 uint64_t BlendMask = 0;
13021 bool ForceV1Zero = false, ForceV2Zero = false;
13022 SmallVector<int, 64> Mask(Original);
13023 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13024 BlendMask))
13025 return SDValue();
13026
13027 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13028 if (ForceV1Zero)
13029 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13030 if (ForceV2Zero)
13031 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13032
13033 unsigned NumElts = VT.getVectorNumElements();
13034
13035 switch (VT.SimpleTy) {
13036 case MVT::v4i64:
13037 case MVT::v8i32:
13038 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13039 [[fallthrough]];
13040 case MVT::v4f64:
13041 case MVT::v8f32:
13042 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13043 [[fallthrough]];
13044 case MVT::v2f64:
13045 case MVT::v2i64:
13046 case MVT::v4f32:
13047 case MVT::v4i32:
13048 case MVT::v8i16:
13049 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13050 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13051 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13052 case MVT::v16i16: {
13053 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13054 SmallVector<int, 8> RepeatedMask;
13055 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13056 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13057 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13058 BlendMask = 0;
13059 for (int i = 0; i < 8; ++i)
13060 if (RepeatedMask[i] >= 8)
13061 BlendMask |= 1ull << i;
13062 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13063 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13064 }
13065 // Use PBLENDW for lower/upper lanes and then blend lanes.
13066 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13067 // merge to VSELECT where useful.
13068 uint64_t LoMask = BlendMask & 0xFF;
13069 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13070 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13071 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13072 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13073 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13074 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13075 return DAG.getVectorShuffle(
13076 MVT::v16i16, DL, Lo, Hi,
13077 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13078 }
13079 [[fallthrough]];
13080 }
13081 case MVT::v32i8:
13082 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13083 [[fallthrough]];
13084 case MVT::v16i8: {
13085 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13086
13087 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13088 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13089 Subtarget, DAG))
13090 return Masked;
13091
13092 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13093 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13094 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13095 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13096 }
13097
13098 // If we have VPTERNLOG, we can use that as a bit blend.
13099 if (Subtarget.hasVLX())
13100 if (SDValue BitBlend =
13101 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13102 return BitBlend;
13103
13104 // Scale the blend by the number of bytes per element.
13105 int Scale = VT.getScalarSizeInBits() / 8;
13106
13107 // This form of blend is always done on bytes. Compute the byte vector
13108 // type.
13109 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13110
13111 // x86 allows load folding with blendvb from the 2nd source operand. But
13112 // we are still using LLVM select here (see comment below), so that's V1.
13113 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13114 // allow that load-folding possibility.
13115 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13116 ShuffleVectorSDNode::commuteMask(Mask);
13117 std::swap(V1, V2);
13118 }
13119
13120 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13121 // mix of LLVM's code generator and the x86 backend. We tell the code
13122 // generator that boolean values in the elements of an x86 vector register
13123 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13124 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13125 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13126 // of the element (the remaining are ignored) and 0 in that high bit would
13127 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13128 // the LLVM model for boolean values in vector elements gets the relevant
13129 // bit set, it is set backwards and over constrained relative to x86's
13130 // actual model.
13131 SmallVector<SDValue, 32> VSELECTMask;
13132 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13133 for (int j = 0; j < Scale; ++j)
13134 VSELECTMask.push_back(
13135 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13136 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13137 MVT::i8));
13138
13139 V1 = DAG.getBitcast(BlendVT, V1);
13140 V2 = DAG.getBitcast(BlendVT, V2);
13141 return DAG.getBitcast(
13142 VT,
13143 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13144 V1, V2));
13145 }
13146 case MVT::v16f32:
13147 case MVT::v8f64:
13148 case MVT::v8i64:
13149 case MVT::v16i32:
13150 case MVT::v32i16:
13151 case MVT::v64i8: {
13152 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13153 bool OptForSize = DAG.shouldOptForSize();
13154 if (!OptForSize) {
13155 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13156 Subtarget, DAG))
13157 return Masked;
13158 }
13159
13160 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13161 // masked move.
13162 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13163 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13164 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13165 }
13166 default:
13167 llvm_unreachable("Not a supported integer vector type!");
13168 }
13169}
13170
13171/// Try to lower as a blend of elements from two inputs followed by
13172/// a single-input permutation.
13173///
13174/// This matches the pattern where we can blend elements from two inputs and
13175/// then reduce the shuffle to a single-input permutation.
13176static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13177 SDValue V1, SDValue V2,
13178 ArrayRef<int> Mask,
13179 SelectionDAG &DAG,
13180 bool ImmBlends = false) {
13181 // We build up the blend mask while checking whether a blend is a viable way
13182 // to reduce the shuffle.
13183 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13184 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13185
13186 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13187 if (Mask[i] < 0)
13188 continue;
13189
13190 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13191
13192 if (BlendMask[Mask[i] % Size] < 0)
13193 BlendMask[Mask[i] % Size] = Mask[i];
13194 else if (BlendMask[Mask[i] % Size] != Mask[i])
13195 return SDValue(); // Can't blend in the needed input!
13196
13197 PermuteMask[i] = Mask[i] % Size;
13198 }
13199
13200 // If only immediate blends, then bail if the blend mask can't be widened to
13201 // i16.
13202 unsigned EltSize = VT.getScalarSizeInBits();
13203 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13204 return SDValue();
13205
13206 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13207 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13208}
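A worked (hypothetical) instance of that decomposition in plain C++: the 4-element mask {2, 5, 0, 7} first becomes the per-position blend {0, 5, 2, 7}, where each position keeps its own index and only chooses between V1 and V2, and then the single-input permute {2, 1, 0, 3} moves the blended elements into their final order:

#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {2, 5, 0, 7};
  std::vector<int> Blend(Size, -1), Permute(Size, -1);

  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size; // blend position that must hold element Mask[i]
    if (Blend[Slot] >= 0 && Blend[Slot] != Mask[i]) {
      std::puts("not lowerable as blend + permute");
      return 0;
    }
    Blend[Slot] = Mask[i];
    Permute[i] = Slot;
  }

  std::printf("blend:  ");
  for (int M : Blend)
    std::printf(" %d", M);
  std::printf("\npermute:");
  for (int M : Permute)
    std::printf(" %d", M);
  std::printf("\n");
}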
13209
13210/// Try to lower as an unpack of elements from two inputs followed by
13211/// a single-input permutation.
13212///
13213/// This matches the pattern where we can unpack elements from two inputs and
13214/// then reduce the shuffle to a single-input (wider) permutation.
13215static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13216 SDValue V1, SDValue V2,
13217 ArrayRef<int> Mask,
13218 SelectionDAG &DAG) {
13219 int NumElts = Mask.size();
13220 int NumLanes = VT.getSizeInBits() / 128;
13221 int NumLaneElts = NumElts / NumLanes;
13222 int NumHalfLaneElts = NumLaneElts / 2;
13223
13224 bool MatchLo = true, MatchHi = true;
13225 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13226
13227 // Determine UNPCKL/UNPCKH type and operand order.
13228 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13229 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
13230 int M = Mask[Lane + Elt];
13231 if (M < 0)
13232 continue;
13233
13234 SDValue &Op = Ops[Elt & 1];
13235 if (M < NumElts && (Op.isUndef() || Op == V1))
13236 Op = V1;
13237 else if (NumElts <= M && (Op.isUndef() || Op == V2))
13238 Op = V2;
13239 else
13240 return SDValue();
13241
13242 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13243 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
13244 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
13245 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
13246 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
13247 if (!MatchLo && !MatchHi)
13248 return SDValue();
13249 }
13250 }
13251 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13252
13253 // Now check that each pair of elts come from the same unpack pair
13254 // and set the permute mask based on each pair.
13255 // TODO - Investigate cases where we permute individual elements.
13256 SmallVector<int, 32> PermuteMask(NumElts, -1);
13257 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13258 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
13259 int M0 = Mask[Lane + Elt + 0];
13260 int M1 = Mask[Lane + Elt + 1];
13261 if (0 <= M0 && 0 <= M1 &&
13262 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
13263 return SDValue();
13264 if (0 <= M0)
13265 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
13266 if (0 <= M1)
13267 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
13268 }
13269 }
13270
13271 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13272 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13273 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13274}
13275
13276/// Try to lower a shuffle as a permute of the inputs followed by an
13277/// UNPCK instruction.
13278///
13279 /// This specifically targets cases where we end up alternating between
13280/// the two inputs, and so can permute them into something that feeds a single
13281/// UNPCK instruction. Note that this routine only targets integer vectors
13282/// because for floating point vectors we have a generalized SHUFPS lowering
13283/// strategy that handles everything that doesn't *exactly* match an unpack,
13284/// making this clever lowering unnecessary.
13285static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13286 SDValue V1, SDValue V2,
13287 ArrayRef<int> Mask,
13288 const X86Subtarget &Subtarget,
13289 SelectionDAG &DAG) {
13290 int Size = Mask.size();
13291 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13292
13293 // This routine only supports 128-bit integer dual input vectors.
13294 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13295 return SDValue();
13296
13297 int NumLoInputs =
13298 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13299 int NumHiInputs =
13300 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13301
13302 bool UnpackLo = NumLoInputs >= NumHiInputs;
13303
13304 auto TryUnpack = [&](int ScalarSize, int Scale) {
13305 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13306 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13307
13308 for (int i = 0; i < Size; ++i) {
13309 if (Mask[i] < 0)
13310 continue;
13311
13312 // Each element of the unpack contains Scale elements from this mask.
13313 int UnpackIdx = i / Scale;
13314
13315 // We only handle the case where V1 feeds the first slots of the unpack.
13316 // We rely on canonicalization to ensure this is the case.
13317 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13318 return SDValue();
13319
13320 // Setup the mask for this input. The indexing is tricky as we have to
13321 // handle the unpack stride.
13322 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13323 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13324 Mask[i] % Size;
13325 }
13326
13327 // If we will have to shuffle both inputs to use the unpack, check whether
13328 // we can just unpack first and shuffle the result. If so, skip this unpack.
13329 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13330 !isNoopShuffleMask(V2Mask))
13331 return SDValue();
13332
13333 // Shuffle the inputs into place.
13334 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13335 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13336
13337 // Cast the inputs to the type we will use to unpack them.
13338 MVT UnpackVT =
13339 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13340 V1 = DAG.getBitcast(UnpackVT, V1);
13341 V2 = DAG.getBitcast(UnpackVT, V2);
13342
13343 // Unpack the inputs and cast the result back to the desired type.
13344 return DAG.getBitcast(
13345 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13346 UnpackVT, V1, V2));
13347 };
13348
13349 // We try each unpack from the largest to the smallest to try and find one
13350 // that fits this mask.
13351 int OrigScalarSize = VT.getScalarSizeInBits();
13352 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13353 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13354 return Unpack;
13355
13356 // If we're shuffling with a zero vector then we're better off not doing
13357 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13358 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13359 ISD::isBuildVectorAllZeros(V2.getNode()))
13360 return SDValue();
13361
13362 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13363 // initial unpack.
13364 if (NumLoInputs == 0 || NumHiInputs == 0) {
13365 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13366 "We have to have *some* inputs!");
13367 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13368
13369 // FIXME: We could consider the total complexity of the permute of each
13370 // possible unpacking. Or at the least we should consider how many
13371 // half-crossings are created.
13372 // FIXME: We could consider commuting the unpacks.
13373
13374 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13375 for (int i = 0; i < Size; ++i) {
13376 if (Mask[i] < 0)
13377 continue;
13378
13379 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13380
13381 PermMask[i] =
13382 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13383 }
13384 return DAG.getVectorShuffle(
13385 VT, DL,
13386 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13387 V1, V2),
13388 DAG.getUNDEF(VT), PermMask);
13389 }
13390
13391 return SDValue();
13392}
13393
13394/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13395/// permuting the elements of the result in place.
13396static SDValue lowerShuffleAsByteRotateAndPermute(
13397 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13398 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13399 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13400 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13401 (VT.is512BitVector() && !Subtarget.hasBWI()))
13402 return SDValue();
13403
13404 // We don't currently support lane crossing permutes.
13405 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13406 return SDValue();
13407
13408 int Scale = VT.getScalarSizeInBits() / 8;
13409 int NumLanes = VT.getSizeInBits() / 128;
13410 int NumElts = VT.getVectorNumElements();
13411 int NumEltsPerLane = NumElts / NumLanes;
13412
13413 // Determine range of mask elts.
13414 bool Blend1 = true;
13415 bool Blend2 = true;
13416 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13417 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13418 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13419 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13420 int M = Mask[Lane + Elt];
13421 if (M < 0)
13422 continue;
13423 if (M < NumElts) {
13424 Blend1 &= (M == (Lane + Elt));
13425 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13426 M = M % NumEltsPerLane;
13427 Range1.first = std::min(Range1.first, M);
13428 Range1.second = std::max(Range1.second, M);
13429 } else {
13430 M -= NumElts;
13431 Blend2 &= (M == (Lane + Elt));
13432 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13433 M = M % NumEltsPerLane;
13434 Range2.first = std::min(Range2.first, M);
13435 Range2.second = std::max(Range2.second, M);
13436 }
13437 }
13438 }
13439
13440 // Bail if we don't need both elements.
13441 // TODO - it might be worth doing this for unary shuffles if the permute
13442 // can be widened.
13443 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13444 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13445 return SDValue();
13446
13447 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13448 return SDValue();
13449
13450 // Rotate the 2 ops so we can access both ranges, then permute the result.
13451 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13452 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13453 SDValue Rotate = DAG.getBitcast(
13454 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13455 DAG.getBitcast(ByteVT, Lo),
13456 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13457 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13458 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13459 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13460 int M = Mask[Lane + Elt];
13461 if (M < 0)
13462 continue;
13463 if (M < NumElts)
13464 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13465 else
13466 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13467 }
13468 }
13469 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13470 };
13471
13472 // Check if the ranges are small enough to rotate from either direction.
13473 if (Range2.second < Range1.first)
13474 return RotateAndPermute(V1, V2, Range1.first, 0);
13475 if (Range1.second < Range2.first)
13476 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13477 return SDValue();
13478}
13479
13480static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13481 return isUndefOrEqual(Mask, 0);
13482}
13483
13484static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13485 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13486}
13487
13488/// Generic routine to decompose a shuffle and blend into independent
13489/// blends and permutes.
13490///
13491/// This matches the extremely common pattern for handling combined
13492/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13493/// operations. It will try to pick the best arrangement of shuffles and
13494/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13495static SDValue lowerShuffleAsDecomposedShuffleMerge(
13496 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13497 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13498 int NumElts = Mask.size();
13499 int NumLanes = VT.getSizeInBits() / 128;
13500 int NumEltsPerLane = NumElts / NumLanes;
13501
13502 // Shuffle the input elements into the desired positions in V1 and V2 and
13503 // unpack/blend them together.
13504 bool IsAlternating = true;
13505 SmallVector<int, 32> V1Mask(NumElts, -1);
13506 SmallVector<int, 32> V2Mask(NumElts, -1);
13507 SmallVector<int, 32> FinalMask(NumElts, -1);
13508 for (int i = 0; i < NumElts; ++i) {
13509 int M = Mask[i];
13510 if (M >= 0 && M < NumElts) {
13511 V1Mask[i] = M;
13512 FinalMask[i] = i;
13513 IsAlternating &= (i & 1) == 0;
13514 } else if (M >= NumElts) {
13515 V2Mask[i] = M - NumElts;
13516 FinalMask[i] = i + NumElts;
13517 IsAlternating &= (i & 1) == 1;
13518 }
13519 }
13520
13521 // If we effectively demand only the 0'th element of \p Input (though not
13522 // necessarily only in the 0'th position), then broadcast said input
13523 // and change \p InputMask to be a no-op (identity) mask.
13524 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13525 &DAG](SDValue &Input,
13526 MutableArrayRef<int> InputMask) {
13527 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13528 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13529 !X86::mayFoldLoad(Input, Subtarget)))
13530 return;
13531 if (isNoopShuffleMask(InputMask))
13532 return;
13533 assert(isBroadcastShuffleMask(InputMask) &&
13534 "Expected to demand only the 0'th element.");
13535 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13536 for (auto I : enumerate(InputMask)) {
13537 int &InputMaskElt = I.value();
13538 if (InputMaskElt >= 0)
13539 InputMaskElt = I.index();
13540 }
13541 };
13542
13543 // Currently, we may need to produce one shuffle per input, and blend results.
13544 // It is possible that the shuffle for one of the inputs is already a no-op.
13545 // See if we can simplify non-no-op shuffles into broadcasts,
13546 // which we consider to be strictly better than an arbitrary shuffle.
13547 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13548 isNoopOrBroadcastShuffleMask(V2Mask)) {
13549 canonicalizeBroadcastableInput(V1, V1Mask);
13550 canonicalizeBroadcastableInput(V2, V2Mask);
13551 }
13552
13553 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13554 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13555 // the shuffle may be able to fold with a load or other benefit. However, when
13556 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13557 // pre-shuffle first is a better strategy.
13558 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13559 // Only prefer immediate blends to unpack/rotate.
13560 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13561 DAG, true))
13562 return BlendPerm;
13563 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
13564 DAG))
13565 return UnpackPerm;
13566 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13567 DL, VT, V1, V2, Mask, Subtarget, DAG))
13568 return RotatePerm;
13569 // Unpack/rotate failed - try again with variable blends.
13570 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13571 DAG))
13572 return BlendPerm;
13573 if (VT.getScalarSizeInBits() >= 32)
13574 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13575 DL, VT, V1, V2, Mask, Subtarget, DAG))
13576 return PermUnpack;
13577 }
13578
13579 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13580 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13581 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13582 // than half the elements coming from each source.
13583 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13584 V1Mask.assign(NumElts, -1);
13585 V2Mask.assign(NumElts, -1);
13586 FinalMask.assign(NumElts, -1);
13587 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13588 for (int j = 0; j != NumEltsPerLane; ++j) {
13589 int M = Mask[i + j];
13590 if (M >= 0 && M < NumElts) {
13591 V1Mask[i + (j / 2)] = M;
13592 FinalMask[i + j] = i + (j / 2);
13593 } else if (M >= NumElts) {
13594 V2Mask[i + (j / 2)] = M - NumElts;
13595 FinalMask[i + j] = i + (j / 2) + NumElts;
13596 }
13597 }
13598 }
13599
13600 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13601 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13602 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13603}
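As a concrete (hypothetical) example of the decomposition: for a 4-element mask {5, 1, 6, 2} the bookkeeping above yields V1Mask = {-1, 1, -1, 2}, V2Mask = {1, -1, 2, -1} and FinalMask = {4, 1, 6, 3}; each input is shuffled into place on its own and the two results are then blended positionally. A short standalone sketch of that loop:

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {5, 1, 6, 2};
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1),
      FinalMask(NumElts, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;              // pull V1's element M into position i
      FinalMask[i] = i;           // then take position i from shuffled V1
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;    // pull V2's element into position i
      FinalMask[i] = i + NumElts; // then take position i from shuffled V2
    }
  }

  auto Dump = [](const char *Name, const std::vector<int> &V) {
    std::printf("%s:", Name);
    for (int X : V)
      std::printf(" %d", X);
    std::printf("\n");
  };
  Dump("V1Mask", V1Mask);       // -1 1 -1 2
  Dump("V2Mask", V2Mask);       //  1 -1 2 -1
  Dump("FinalMask", FinalMask); //  4 1 6 3
}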
13604
13605/// Try to lower a vector shuffle as a bit rotation.
13606///
13607/// Look for a repeated rotation pattern in each sub group.
13608 /// Returns an ISD::ROTL element rotation amount, or -1 on failure.
13609static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13610 int NumElts = Mask.size();
13611 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13612
13613 int RotateAmt = -1;
13614 for (int i = 0; i != NumElts; i += NumSubElts) {
13615 for (int j = 0; j != NumSubElts; ++j) {
13616 int M = Mask[i + j];
13617 if (M < 0)
13618 continue;
13619 if (!isInRange(M, i, i + NumSubElts))
13620 return -1;
13621 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13622 if (0 <= RotateAmt && Offset != RotateAmt)
13623 return -1;
13624 RotateAmt = Offset;
13625 }
13626 }
13627 return RotateAmt;
13628}
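For example (a standalone sketch, not the LLVM helper itself): a v16i8 shuffle that rotates every 4-byte group left by one byte has the per-group pattern {3, 0, 1, 2}; every group agrees on a rotate amount of 1 sub-element, which the caller scales to an 8-bit rotate of each i32 lane:

#include <cstdio>
#include <vector>

// Returns the sub-element rotate amount shared by every group, or -1.
static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // crosses a group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // groups disagree on the rotation
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  std::vector<int> Mask = {3, 0, 1, 2,  7, 4, 5, 6,
                           11, 8, 9, 10, 15, 12, 13, 14};
  int Amt = matchBitRotate(Mask, /*NumSubElts=*/4);
  std::printf("rotate %d sub-elements = %d bits per i32 lane\n", Amt, Amt * 8);
}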
13629
13630static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13631 const X86Subtarget &Subtarget,
13632 ArrayRef<int> Mask) {
13633 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13634 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13635
13636 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13637 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13638 int MaxSubElts = 64 / EltSizeInBits;
13639 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13640 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13641 if (RotateAmt < 0)
13642 continue;
13643
13644 int NumElts = Mask.size();
13645 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13646 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13647 return RotateAmt * EltSizeInBits;
13648 }
13649
13650 return -1;
13651}
13652
13653/// Lower shuffle using X86ISD::VROTLI rotations.
13654static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13655 ArrayRef<int> Mask,
13656 const X86Subtarget &Subtarget,
13657 SelectionDAG &DAG) {
13658 // Only XOP + AVX512 targets have bit rotation instructions.
13659 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13660 bool IsLegal =
13661 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13662 if (!IsLegal && Subtarget.hasSSE3())
13663 return SDValue();
13664
13665 MVT RotateVT;
13666 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13667 Subtarget, Mask);
13668 if (RotateAmt < 0)
13669 return SDValue();
13670
13671 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13672 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13673 // widen to vXi16 or more then the existing lowering will be better.
13674 if (!IsLegal) {
13675 if ((RotateAmt % 16) == 0)
13676 return SDValue();
13677 // TODO: Use getTargetVShiftByConstNode.
13678 unsigned ShlAmt = RotateAmt;
13679 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13680 V1 = DAG.getBitcast(RotateVT, V1);
13681 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13682 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13683 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13684 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13685 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13686 return DAG.getBitcast(VT, Rot);
13687 }
13688
13689 SDValue Rot =
13690 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13691 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13692 return DAG.getBitcast(VT, Rot);
13693}
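The pre-SSSE3 fallback above is the standard rotate-by-shifts identity, rotl(x, s) = (x << s) | (x >> (w - s)), applied to every lane of the widened RotateVT. A scalar sketch for one hypothetical 16-bit lane:

#include <cstdint>
#include <cstdio>

// Rotate a 16-bit lane left by Amt bits via the SHL/SRL/OR expansion.
static uint16_t rotl16(uint16_t X, unsigned Amt) {
  Amt %= 16;
  if (Amt == 0)
    return X;
  return (uint16_t)((X << Amt) | (X >> (16 - Amt)));
}

int main() {
  // A vXi8 shuffle that swaps the two bytes of each i16 lane is rotl by 8.
  std::printf("%#06x\n", (unsigned)rotl16(0x1234, 8)); // prints 0x3412
}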
13694
13695/// Try to match a vector shuffle as an element rotation.
13696///
13697 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13698static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13699 ArrayRef<int> Mask) {
13700 int NumElts = Mask.size();
13701
13702 // We need to detect various ways of spelling a rotation:
13703 // [11, 12, 13, 14, 15, 0, 1, 2]
13704 // [-1, 12, 13, 14, -1, -1, 1, -1]
13705 // [-1, -1, -1, -1, -1, -1, 1, 2]
13706 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13707 // [-1, 4, 5, 6, -1, -1, 9, -1]
13708 // [-1, 4, 5, 6, -1, -1, -1, -1]
13709 int Rotation = 0;
13710 SDValue Lo, Hi;
13711 for (int i = 0; i < NumElts; ++i) {
13712 int M = Mask[i];
13713 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13714 "Unexpected mask index.");
13715 if (M < 0)
13716 continue;
13717
13718 // Determine where a rotated vector would have started.
13719 int StartIdx = i - (M % NumElts);
13720 if (StartIdx == 0)
13721 // The identity rotation isn't interesting, stop.
13722 return -1;
13723
13724 // If we found the tail of a vector the rotation must be the missing
13725 // front. If we found the head of a vector, it must be how much of the
13726 // head.
13727 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13728
13729 if (Rotation == 0)
13730 Rotation = CandidateRotation;
13731 else if (Rotation != CandidateRotation)
13732 // The rotations don't match, so we can't match this mask.
13733 return -1;
13734
13735 // Compute which value this mask is pointing at.
13736 SDValue MaskV = M < NumElts ? V1 : V2;
13737
13738 // Compute which of the two target values this index should be assigned
13739 // to. This reflects whether the high elements are remaining or the low
13740 // elements are remaining.
13741 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13742
13743 // Either set up this value if we've not encountered it before, or check
13744 // that it remains consistent.
13745 if (!TargetV)
13746 TargetV = MaskV;
13747 else if (TargetV != MaskV)
13748 // This may be a rotation, but it pulls from the inputs in some
13749 // unsupported interleaving.
13750 return -1;
13751 }
13752
13753 // Check that we successfully analyzed the mask, and normalize the results.
13754 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13755 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13756 if (!Lo)
13757 Lo = Hi;
13758 else if (!Hi)
13759 Hi = Lo;
13760
13761 V1 = Lo;
13762 V2 = Hi;
13763
13764 return Rotation;
13765}
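A worked example of the matching above (standalone sketch that omits the Lo/Hi input bookkeeping): for the 8-element mask [11, 12, 13, 14, 15, 0, 1, 2] from the comment, every defined lane produces the same candidate rotation, 3, so the mask is a valid element rotation:

#include <cstdio>
#include <vector>

// Returns the element rotation amount implied by Mask, or -1.
static int matchElementRotate(const std::vector<int> &Mask, int NumElts) {
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts); // where a rotated vector would start
    if (StartIdx == 0)
      return -1; // the identity rotation isn't interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation != 0 && Candidate != Rotation)
      return -1; // lanes disagree, not a single rotation
    Rotation = Candidate;
  }
  return Rotation ? Rotation : -1;
}

int main() {
  std::vector<int> Mask = {11, 12, 13, 14, 15, 0, 1, 2};
  std::printf("rotation = %d\n", matchElementRotate(Mask, 8)); // prints 3
}

For the byte-rotate lowering below, this amount is then scaled by the lane byte width (16 / NumElts), so the v8i16 mask above becomes a PALIGNR by 6 bytes.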
13766
13767/// Try to lower a vector shuffle as a byte rotation.
13768///
13769/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13770/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13771/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13772 /// try to generically lower a vector shuffle through such a pattern. It
13773/// does not check for the profitability of lowering either as PALIGNR or
13774/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13775/// This matches shuffle vectors that look like:
13776///
13777/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13778///
13779/// Essentially it concatenates V1 and V2, shifts right by some number of
13780/// elements, and takes the low elements as the result. Note that while this is
13781/// specified as a *right shift* because x86 is little-endian, it is a *left
13782/// rotate* of the vector lanes.
13783static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13784 ArrayRef<int> Mask) {
13785 // Don't accept any shuffles with zero elements.
13786 if (isAnyZero(Mask))
13787 return -1;
13788
13789 // PALIGNR works on 128-bit lanes.
13790 SmallVector<int, 16> RepeatedMask;
13791 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13792 return -1;
13793
13794 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13795 if (Rotation <= 0)
13796 return -1;
13797
13798 // PALIGNR rotates bytes, so we need to scale the
13799 // rotation based on how many bytes are in the vector lane.
13800 int NumElts = RepeatedMask.size();
13801 int Scale = 16 / NumElts;
13802 return Rotation * Scale;
13803}
13804
13805static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13806 SDValue V2, ArrayRef<int> Mask,
13807 const X86Subtarget &Subtarget,
13808 SelectionDAG &DAG) {
13809 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13810
13811 SDValue Lo = V1, Hi = V2;
13812 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13813 if (ByteRotation <= 0)
13814 return SDValue();
13815
13816 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13817 // PSLLDQ/PSRLDQ.
13818 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13819 Lo = DAG.getBitcast(ByteVT, Lo);
13820 Hi = DAG.getBitcast(ByteVT, Hi);
13821
13822 // SSSE3 targets can use the palignr instruction.
13823 if (Subtarget.hasSSSE3()) {
13824 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13825 "512-bit PALIGNR requires BWI instructions");
13826 return DAG.getBitcast(
13827 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13828 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13829 }
13830
13831 assert(VT.is128BitVector() &&
13832 "Rotate-based lowering only supports 128-bit lowering!");
13833 assert(Mask.size() <= 16 &&
13834 "Can shuffle at most 16 bytes in a 128-bit vector!");
13835 assert(ByteVT == MVT::v16i8 &&
13836 "SSE2 rotate lowering only needed for v16i8!");
13837
13838 // Default SSE2 implementation
13839 int LoByteShift = 16 - ByteRotation;
13840 int HiByteShift = ByteRotation;
13841
13842 SDValue LoShift =
13843 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13844 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13845 SDValue HiShift =
13846 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13847 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13848 return DAG.getBitcast(VT,
13849 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13850}
13851
13852/// Try to lower a vector shuffle as a dword/qword rotation.
13853///
13854/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13855/// rotation of the concatenation of two vectors; this routine will
13856/// try to generically lower a vector shuffle through such a pattern.
13857///
13858/// Essentially it concatenates V1 and V2, shifts right by some number of
13859/// elements, and takes the low elements as the result. Note that while this is
13860/// specified as a *right shift* because x86 is little-endian, it is a *left
13861/// rotate* of the vector lanes.
13862static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13863 SDValue V2, ArrayRef<int> Mask,
13864 const X86Subtarget &Subtarget,
13865 SelectionDAG &DAG) {
13866  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13867         "Only 32-bit and 64-bit elements are supported!");
13868
13869 // 128/256-bit vectors are only supported with VLX.
13870  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13871         && "VLX required for 128/256-bit vectors");
13872
13873 SDValue Lo = V1, Hi = V2;
13874 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13875 if (Rotation <= 0)
13876 return SDValue();
13877
13878 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13879 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13880}
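// Sketch of the semantics described in the comment above, on plain arrays:
// concatenate the two N-element inputs, shift right by Rotation elements and
// keep the low N. Which shuffle operand ends up as which half of the
// concatenation is decided by matchShuffleAsElementRotate; the helper below
// just takes the two halves as given and is illustrative only.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> alignElements(const std::vector<T> &ConcatLo,
                             const std::vector<T> &ConcatHi,
                             std::size_t Rotation) {
  std::size_t N = ConcatLo.size();
  std::vector<T> Result(N);
  for (std::size_t i = 0; i != N; ++i)
    Result[i] = (i + Rotation < N) ? ConcatLo[i + Rotation]
                                   : ConcatHi[i + Rotation - N];
  return Result;
}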
13881
13882/// Try to lower a vector shuffle as a byte shift sequence.
13883static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13884 SDValue V2, ArrayRef<int> Mask,
13885 const APInt &Zeroable,
13886 const X86Subtarget &Subtarget,
13887 SelectionDAG &DAG) {
13888  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13889  assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13890
13891 // We need a shuffle that has zeros at one/both ends and a sequential
13892 // shuffle from one source within.
13893 unsigned ZeroLo = Zeroable.countTrailingOnes();
13894 unsigned ZeroHi = Zeroable.countLeadingOnes();
13895 if (!ZeroLo && !ZeroHi)
13896 return SDValue();
13897
13898 unsigned NumElts = Mask.size();
13899 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13900 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13901 return SDValue();
13902
13903 unsigned Scale = VT.getScalarSizeInBits() / 8;
13904 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13905 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13906 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13907 return SDValue();
13908
13909 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13910 Res = DAG.getBitcast(MVT::v16i8, Res);
13911
13912 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13913 // inner sequential set of elements, possibly offset:
13914 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13915 // 01234567 --> 4567zzzz --> zzzzz456
13916 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13917 if (ZeroLo == 0) {
13918 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13919 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13920 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13921 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13922 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13923 } else if (ZeroHi == 0) {
13924 unsigned Shift = Mask[ZeroLo] % NumElts;
13925 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13926 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13927 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13928 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13929 } else if (!Subtarget.hasSSSE3()) {
13930    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13931 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13932 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13933 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13934 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13935 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13936 Shift += Mask[ZeroLo] % NumElts;
13937 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13938 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13939 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13940 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13941 } else
13942 return SDValue();
13943
13944 return DAG.getBitcast(VT, Res);
13945}
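// Byte-level model of the two shift steps used above (little-endian byte
// order, index 0 is the least significant byte). A minimal sketch; the helper
// names are made up for the example.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> pslldqBytes(std::array<uint8_t, 16> V, unsigned N) {
  std::array<uint8_t, 16> R{}; // shifted-in bytes are zero
  for (unsigned i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}
static std::array<uint8_t, 16> psrldqBytes(std::array<uint8_t, 16> V, unsigned N) {
  std::array<uint8_t, 16> R{};
  for (unsigned i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}
// e.g. psrldqBytes(pslldqBytes(V, 12), 4) zeroes the top 4 and bottom 8 bytes
// while keeping V[0..3] at positions 8..11, the "zero both ends, keep an
// inner run" shape that the ZeroLo/ZeroHi branches above produce.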
13946
13947/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13948///
13949/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13950/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13951/// matches elements from one of the input vectors shuffled to the left or
13952/// right with zeroable elements 'shifted in'. It handles both the strictly
13953/// bit-wise element shifts and the byte shift across an entire 128-bit double
13954/// quad word lane.
13955///
13956/// PSHL : (little-endian) left bit shift.
13957/// [ zz, 0, zz, 2 ]
13958/// [ -1, 4, zz, -1 ]
13959/// PSRL : (little-endian) right bit shift.
13960/// [ 1, zz, 3, zz]
13961/// [ -1, -1, 7, zz]
13962/// PSLLDQ : (little-endian) left byte shift
13963/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13964/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13965/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13966/// PSRLDQ : (little-endian) right byte shift
13967/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13968/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13969/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13970static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13971 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13972 int MaskOffset, const APInt &Zeroable,
13973 const X86Subtarget &Subtarget) {
13974 int Size = Mask.size();
13975 unsigned SizeInBits = Size * ScalarSizeInBits;
13976
13977 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13978 for (int i = 0; i < Size; i += Scale)
13979 for (int j = 0; j < Shift; ++j)
13980 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13981 return false;
13982
13983 return true;
13984 };
13985
13986 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13987 for (int i = 0; i != Size; i += Scale) {
13988 unsigned Pos = Left ? i + Shift : i;
13989 unsigned Low = Left ? i : i + Shift;
13990 unsigned Len = Scale - Shift;
13991 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13992 return -1;
13993 }
13994
13995 int ShiftEltBits = ScalarSizeInBits * Scale;
13996 bool ByteShift = ShiftEltBits > 64;
13997 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13998 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13999 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14000
14001 // Normalize the scale for byte shifts to still produce an i64 element
14002 // type.
14003 Scale = ByteShift ? Scale / 2 : Scale;
14004
14005 // We need to round trip through the appropriate type for the shift.
14006 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14007 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14008 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14009 return (int)ShiftAmt;
14010 };
14011
14012 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14013 // keep doubling the size of the integer elements up to that. We can
14014 // then shift the elements of the integer vector by whole multiples of
14015 // their width within the elements of the larger integer vector. Test each
14016 // multiple to see if we can find a match with the moved element indices
14017 // and that the shifted in elements are all zeroable.
14018 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14019 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14020 for (int Shift = 1; Shift != Scale; ++Shift)
14021 for (bool Left : {true, false})
14022 if (CheckZeros(Shift, Scale, Left)) {
14023 int ShiftAmt = MatchShift(Shift, Scale, Left);
14024 if (0 < ShiftAmt)
14025 return ShiftAmt;
14026 }
14027
14028 // no match
14029 return -1;
14030}
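// Worked example of what the MatchShift lambda above accepts. The helper name
// and the plain-vector form are made up for the sketch; -1 marks a position
// that must instead be zeroable.
#include <vector>

static std::vector<int> expectedShiftMask(int Size, int Scale, int Shift,
                                          bool Left) {
  std::vector<int> Mask(Size, -1);
  for (int i = 0; i < Size; i += Scale) {
    int Pos = Left ? i + Shift : i;   // where the surviving run lands
    int Low = Left ? i : i + Shift;   // which source elements survive
    for (int j = 0; j != Scale - Shift; ++j)
      Mask[Pos + j] = Low + j;
  }
  return Mask;
}
// expectedShiftMask(4, 2, 1, /*Left=*/true)  == { -1, 0, -1, 2 }  (the PSHL
// example in the comment above); swapping Left gives { 1, -1, 3, -1 } (PSRL).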
14031
14032static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14033 SDValue V2, ArrayRef<int> Mask,
14034 const APInt &Zeroable,
14035 const X86Subtarget &Subtarget,
14036 SelectionDAG &DAG) {
14037 int Size = Mask.size();
14038  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14039
14040 MVT ShiftVT;
14041 SDValue V = V1;
14042 unsigned Opcode;
14043
14044 // Try to match shuffle against V1 shift.
14045 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14046 Mask, 0, Zeroable, Subtarget);
14047
14048 // If V1 failed, try to match shuffle against V2 shift.
14049 if (ShiftAmt < 0) {
14050 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14051 Mask, Size, Zeroable, Subtarget);
14052 V = V2;
14053 }
14054
14055 if (ShiftAmt < 0)
14056 return SDValue();
14057
14058  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14059         "Illegal integer vector type");
14060 V = DAG.getBitcast(ShiftVT, V);
14061 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14062 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14063 return DAG.getBitcast(VT, V);
14064}
14065
14066// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14067// Remainder of lower half result is zero and upper half is all undef.
14068static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14069 ArrayRef<int> Mask, uint64_t &BitLen,
14070 uint64_t &BitIdx, const APInt &Zeroable) {
14071 int Size = Mask.size();
14072 int HalfSize = Size / 2;
14073  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14074  assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14075
14076 // Upper half must be undefined.
14077 if (!isUndefUpperHalf(Mask))
14078 return false;
14079
14080 // Determine the extraction length from the part of the
14081 // lower half that isn't zeroable.
14082 int Len = HalfSize;
14083 for (; Len > 0; --Len)
14084 if (!Zeroable[Len - 1])
14085 break;
14086  assert(Len > 0 && "Zeroable shuffle mask");
14087
14088 // Attempt to match first Len sequential elements from the lower half.
14089 SDValue Src;
14090 int Idx = -1;
14091 for (int i = 0; i != Len; ++i) {
14092 int M = Mask[i];
14093 if (M == SM_SentinelUndef)
14094 continue;
14095 SDValue &V = (M < Size ? V1 : V2);
14096 M = M % Size;
14097
14098 // The extracted elements must start at a valid index and all mask
14099 // elements must be in the lower half.
14100 if (i > M || M >= HalfSize)
14101 return false;
14102
14103 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14104 Src = V;
14105 Idx = M - i;
14106 continue;
14107 }
14108 return false;
14109 }
14110
14111 if (!Src || Idx < 0)
14112 return false;
14113
14114  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14115 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14116 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14117 V1 = Src;
14118 return true;
14119}
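// Bit-level model of the extraction this sets up, on the low 64 bits of the
// source. A simplified sketch: an encoded BitLen of 0 stands for a full
// 64-bit field, which is why the code above masks the width with 0x3f.
#include <cstdint>

static uint64_t extrqLo64(uint64_t SrcLo64, unsigned BitLen, unsigned BitIdx) {
  uint64_t Field = SrcLo64 >> BitIdx;
  if (BitLen == 0) // width of 0 means "all 64 bits"
    return Field;
  return Field & ((UINT64_C(1) << BitLen) - 1);
}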
14120
14121// INSERTQ: Extract lowest Len elements from lower half of second source and
14122// insert over first source, starting at Idx.
14123// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14124static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14125 ArrayRef<int> Mask, uint64_t &BitLen,
14126 uint64_t &BitIdx) {
14127 int Size = Mask.size();
14128 int HalfSize = Size / 2;
14129  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14130
14131 // Upper half must be undefined.
14132 if (!isUndefUpperHalf(Mask))
14133 return false;
14134
14135 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14136 SDValue Base;
14137
14138 // Attempt to match first source from mask before insertion point.
14139 if (isUndefInRange(Mask, 0, Idx)) {
14140 /* EMPTY */
14141 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14142 Base = V1;
14143 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14144 Base = V2;
14145 } else {
14146 continue;
14147 }
14148
14149 // Extend the extraction length looking to match both the insertion of
14150 // the second source and the remaining elements of the first.
14151 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14152 SDValue Insert;
14153 int Len = Hi - Idx;
14154
14155 // Match insertion.
14156 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14157 Insert = V1;
14158 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14159 Insert = V2;
14160 } else {
14161 continue;
14162 }
14163
14164 // Match the remaining elements of the lower half.
14165 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14166 /* EMPTY */
14167 } else if ((!Base || (Base == V1)) &&
14168 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14169 Base = V1;
14170 } else if ((!Base || (Base == V2)) &&
14171 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14172 Size + Hi)) {
14173 Base = V2;
14174 } else {
14175 continue;
14176 }
14177
14178 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14179 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14180 V1 = Base;
14181 V2 = Insert;
14182 return true;
14183 }
14184 }
14185
14186 return false;
14187}
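// Element-level model of the pattern being matched, following the
// { A[0..Idx-1], B[0..Len-1], A[Idx+Len..] } comment above. Only the lower
// half is meaningful; the upper half stays undefined. Illustrative only.
#include <vector>

template <typename T>
std::vector<T> insertqLowerHalf(std::vector<T> A, const std::vector<T> &B,
                                unsigned Idx, unsigned Len) {
  for (unsigned i = 0; i != Len; ++i)
    A[Idx + i] = B[i]; // drop Len leading elements of B into A at Idx
  return A;
}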
14188
14189/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14190static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14191 SDValue V2, ArrayRef<int> Mask,
14192 const APInt &Zeroable, SelectionDAG &DAG) {
14193 uint64_t BitLen, BitIdx;
14194 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14195 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14196 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14197 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14198
14199 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14200 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14201 V2 ? V2 : DAG.getUNDEF(VT),
14202 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14203 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14204
14205 return SDValue();
14206}
14207
14208/// Lower a vector shuffle as a zero or any extension.
14209///
14210/// Given a specific number of elements, element bit width, and extension
14211/// stride, produce either a zero or any extension based on the available
14212/// features of the subtarget. The extended elements are consecutive and
14213/// can start at an offset element index in the input; to avoid excess
14214/// shuffling, the offset must either be in the bottom lane or at the start
14215/// of a higher lane. All extended elements must be from
14216/// the same lane.
14217static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14218 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14219 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14220  assert(Scale > 1 && "Need a scale to extend.");
14221 int EltBits = VT.getScalarSizeInBits();
14222 int NumElements = VT.getVectorNumElements();
14223 int NumEltsPerLane = 128 / EltBits;
14224 int OffsetLane = Offset / NumEltsPerLane;
14225  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14226         "Only 8, 16, and 32 bit elements can be extended.");
14227  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14228  assert(0 <= Offset && "Extension offset must be positive.");
14229  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14230         "Extension offset must be in the first lane or start an upper lane.");
14231
14232 // Check that an index is in same lane as the base offset.
14233 auto SafeOffset = [&](int Idx) {
14234 return OffsetLane == (Idx / NumEltsPerLane);
14235 };
14236
14237 // Shift along an input so that the offset base moves to the first element.
14238 auto ShuffleOffset = [&](SDValue V) {
14239 if (!Offset)
14240 return V;
14241
14242 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14243 for (int i = 0; i * Scale < NumElements; ++i) {
14244 int SrcIdx = i + Offset;
14245 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14246 }
14247 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14248 };
14249
14250  // Found a valid any/zero-extend mask! Try various lowering strategies based
14251 // input type and available ISA extensions.
14252 if (Subtarget.hasSSE41()) {
14253 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14254 // PUNPCK will catch this in a later shuffle match.
14255 if (Offset && Scale == 2 && VT.is128BitVector())
14256 return SDValue();
14257 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14258 NumElements / Scale);
14259 InputV = ShuffleOffset(InputV);
14260 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14261 DL, ExtVT, InputV, DAG);
14262 return DAG.getBitcast(VT, InputV);
14263 }
14264
14265  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14266
14267  // For any-extends we can cheat for larger element sizes and use shuffle
14268 // instructions that can fold with a load and/or copy.
14269 if (AnyExt && EltBits == 32) {
14270 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14271 -1};
14272 return DAG.getBitcast(
14273 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14274 DAG.getBitcast(MVT::v4i32, InputV),
14275 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14276 }
14277 if (AnyExt && EltBits == 16 && Scale > 2) {
14278 int PSHUFDMask[4] = {Offset / 2, -1,
14279 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14280 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14281 DAG.getBitcast(MVT::v4i32, InputV),
14282 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14283 int PSHUFWMask[4] = {1, -1, -1, -1};
14284 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14285 return DAG.getBitcast(
14286 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14287 DAG.getBitcast(MVT::v8i16, InputV),
14288 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14289 }
14290
14291 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14292 // to 64-bits.
14293 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14294    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14295    assert(VT.is128BitVector() && "Unexpected vector width!");
14296
14297 int LoIdx = Offset * EltBits;
14298 SDValue Lo = DAG.getBitcast(
14299 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14300 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14301 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14302
14303 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14304 return DAG.getBitcast(VT, Lo);
14305
14306 int HiIdx = (Offset + 1) * EltBits;
14307 SDValue Hi = DAG.getBitcast(
14308 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14309 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14310 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14311 return DAG.getBitcast(VT,
14312 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14313 }
14314
14315 // If this would require more than 2 unpack instructions to expand, use
14316 // pshufb when available. We can only use more than 2 unpack instructions
14317 // when zero extending i8 elements which also makes it easier to use pshufb.
14318 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14319    assert(NumElements == 16 && "Unexpected byte vector width!");
14320 SDValue PSHUFBMask[16];
14321 for (int i = 0; i < 16; ++i) {
14322 int Idx = Offset + (i / Scale);
14323 if ((i % Scale == 0 && SafeOffset(Idx))) {
14324 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14325 continue;
14326 }
14327 PSHUFBMask[i] =
14328 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14329 }
14330 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14331 return DAG.getBitcast(
14332 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14333 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14334 }
14335
14336 // If we are extending from an offset, ensure we start on a boundary that
14337 // we can unpack from.
14338 int AlignToUnpack = Offset % (NumElements / Scale);
14339 if (AlignToUnpack) {
14340 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14341 for (int i = AlignToUnpack; i < NumElements; ++i)
14342 ShMask[i - AlignToUnpack] = i;
14343 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14344 Offset -= AlignToUnpack;
14345 }
14346
14347 // Otherwise emit a sequence of unpacks.
14348 do {
14349 unsigned UnpackLoHi = X86ISD::UNPCKL;
14350 if (Offset >= (NumElements / 2)) {
14351 UnpackLoHi = X86ISD::UNPCKH;
14352 Offset -= (NumElements / 2);
14353 }
14354
14355 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14356 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14357 : getZeroVector(InputVT, Subtarget, DAG, DL);
14358 InputV = DAG.getBitcast(InputVT, InputV);
14359 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14360 Scale /= 2;
14361 EltBits *= 2;
14362 NumElements /= 2;
14363 } while (Scale > 1);
14364 return DAG.getBitcast(VT, InputV);
14365}
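// Sketch of the unpack fallback above at byte granularity: each round
// interleaves the low half of the vector with zeros, doubling the element
// width. Applied twice to a 16-byte vector it yields
// { b0,0,0,0, b1,0,0,0, b2,0,0,0, b3,0,0,0 }, i.e. b0..b3 zero-extended to
// 32 bits in little-endian order. The real loop doubles EltBits each round
// instead of staying at byte granularity, but the resulting bytes are the
// same; the helper is illustrative only.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> unpackLoWithZero(const std::vector<uint8_t> &V) {
  std::vector<uint8_t> R;
  for (std::size_t i = 0; i != V.size() / 2; ++i) {
    R.push_back(V[i]); // low-half element
    R.push_back(0);    // interleaved zero
  }
  return R;
}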
14366
14367/// Try to lower a vector shuffle as a zero extension on any microarch.
14368///
14369/// This routine will try to do everything in its power to cleverly lower
14370/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14371/// check for the profitability of this lowering, it tries to aggressively
14372/// match this pattern. It will use all of the micro-architectural details it
14373/// can to emit an efficient lowering. It handles both blends with all-zero
14374/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14375/// masking out later).
14376///
14377/// The reason we have dedicated lowering for zext-style shuffles is that they
14378/// are both incredibly common and often quite performance sensitive.
14379static SDValue lowerShuffleAsZeroOrAnyExtend(
14380 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14381 const APInt &Zeroable, const X86Subtarget &Subtarget,
14382 SelectionDAG &DAG) {
14383 int Bits = VT.getSizeInBits();
14384 int NumLanes = Bits / 128;
14385 int NumElements = VT.getVectorNumElements();
14386 int NumEltsPerLane = NumElements / NumLanes;
14387  assert(VT.getScalarSizeInBits() <= 32 &&
14388         "Exceeds 32-bit integer zero extension limit");
14389  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14390
14391 // Define a helper function to check a particular ext-scale and lower to it if
14392 // valid.
14393 auto Lower = [&](int Scale) -> SDValue {
14394 SDValue InputV;
14395 bool AnyExt = true;
14396 int Offset = 0;
14397 int Matches = 0;
14398 for (int i = 0; i < NumElements; ++i) {
14399 int M = Mask[i];
14400 if (M < 0)
14401 continue; // Valid anywhere but doesn't tell us anything.
14402 if (i % Scale != 0) {
14403 // Each of the extended elements need to be zeroable.
14404 if (!Zeroable[i])
14405 return SDValue();
14406
14407 // We no longer are in the anyext case.
14408 AnyExt = false;
14409 continue;
14410 }
14411
14412 // Each of the base elements needs to be consecutive indices into the
14413 // same input vector.
14414 SDValue V = M < NumElements ? V1 : V2;
14415 M = M % NumElements;
14416 if (!InputV) {
14417 InputV = V;
14418 Offset = M - (i / Scale);
14419 } else if (InputV != V)
14420 return SDValue(); // Flip-flopping inputs.
14421
14422 // Offset must start in the lowest 128-bit lane or at the start of an
14423 // upper lane.
14424 // FIXME: Is it ever worth allowing a negative base offset?
14425 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14426 (Offset % NumEltsPerLane) == 0))
14427 return SDValue();
14428
14429 // If we are offsetting, all referenced entries must come from the same
14430 // lane.
14431 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14432 return SDValue();
14433
14434 if ((M % NumElements) != (Offset + (i / Scale)))
14435 return SDValue(); // Non-consecutive strided elements.
14436 Matches++;
14437 }
14438
14439 // If we fail to find an input, we have a zero-shuffle which should always
14440 // have already been handled.
14441 // FIXME: Maybe handle this here in case during blending we end up with one?
14442 if (!InputV)
14443 return SDValue();
14444
14445 // If we are offsetting, don't extend if we only match a single input, we
14446 // can always do better by using a basic PSHUF or PUNPCK.
14447 if (Offset != 0 && Matches < 2)
14448 return SDValue();
14449
14450 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14451 InputV, Mask, Subtarget, DAG);
14452 };
14453
14454 // The widest scale possible for extending is to a 64-bit integer.
14455  assert(Bits % 64 == 0 &&
14456         "The number of bits in a vector must be divisible by 64 on x86!");
14457 int NumExtElements = Bits / 64;
14458
14459 // Each iteration, try extending the elements half as much, but into twice as
14460 // many elements.
14461 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14462    assert(NumElements % NumExtElements == 0 &&
14463           "The input vector size must be divisible by the extended size.");
14464 if (SDValue V = Lower(NumElements / NumExtElements))
14465 return V;
14466 }
14467
14468 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14469 if (Bits != 128)
14470 return SDValue();
14471
14472 // Returns one of the source operands if the shuffle can be reduced to a
14473 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14474 auto CanZExtLowHalf = [&]() {
14475 for (int i = NumElements / 2; i != NumElements; ++i)
14476 if (!Zeroable[i])
14477 return SDValue();
14478 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14479 return V1;
14480 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14481 return V2;
14482 return SDValue();
14483 };
14484
14485 if (SDValue V = CanZExtLowHalf()) {
14486 V = DAG.getBitcast(MVT::v2i64, V);
14487 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14488 return DAG.getBitcast(VT, V);
14489 }
14490
14491 // No viable ext lowering found.
14492 return SDValue();
14493}
14494
14495/// Try to get a scalar value for a specific element of a vector.
14496///
14497/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14498static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14499 SelectionDAG &DAG) {
14500 MVT VT = V.getSimpleValueType();
14501 MVT EltVT = VT.getVectorElementType();
14502 V = peekThroughBitcasts(V);
14503
14504 // If the bitcasts shift the element size, we can't extract an equivalent
14505 // element from it.
14506 MVT NewVT = V.getSimpleValueType();
14507 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14508 return SDValue();
14509
14510 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14511 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14512 // Ensure the scalar operand is the same size as the destination.
14513 // FIXME: Add support for scalar truncation where possible.
14514 SDValue S = V.getOperand(Idx);
14515 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14516 return DAG.getBitcast(EltVT, S);
14517 }
14518
14519 return SDValue();
14520}
14521
14522/// Helper to test for a load that can be folded with x86 shuffles.
14523///
14524/// This is particularly important because the set of instructions varies
14525/// significantly based on whether the operand is a load or not.
14526static bool isShuffleFoldableLoad(SDValue V) {
14527 return V->hasOneUse() &&
14528 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14529}
14530
14531template<typename T>
14532static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14533 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14534}
14535
14536template<typename T>
14537bool X86TargetLowering::isSoftFP16(T VT) const {
14538 return ::isSoftFP16(VT, Subtarget);
14539}
14540
14541/// Try to lower insertion of a single element into a zero vector.
14542///
14543/// This is a common pattern for which we have especially efficient lowerings
14544/// across all subtarget feature sets.
14545static SDValue lowerShuffleAsElementInsertion(
14546 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14547 const APInt &Zeroable, const X86Subtarget &Subtarget,
14548 SelectionDAG &DAG) {
14549 MVT ExtVT = VT;
14550 MVT EltVT = VT.getVectorElementType();
14551
14552 if (isSoftFP16(EltVT, Subtarget))
14553 return SDValue();
14554
14555 int V2Index =
14556 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14557 Mask.begin();
14558 bool IsV1Zeroable = true;
14559 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14560 if (i != V2Index && !Zeroable[i]) {
14561 IsV1Zeroable = false;
14562 break;
14563 }
14564
14565 // Check for a single input from a SCALAR_TO_VECTOR node.
14566 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14567 // all the smarts here sunk into that routine. However, the current
14568 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14569 // vector shuffle lowering is dead.
14570 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14571 DAG);
14572 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14573 // We need to zext the scalar if it is smaller than an i32.
14574 V2S = DAG.getBitcast(EltVT, V2S);
14575 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14576 // Using zext to expand a narrow element won't work for non-zero
14577 // insertions.
14578 if (!IsV1Zeroable)
14579 return SDValue();
14580
14581 // Zero-extend directly to i32.
14582 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14583 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14584 }
14585 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14586 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14587 EltVT == MVT::i16) {
14588 // Either not inserting from the low element of the input or the input
14589 // element size is too small to use VZEXT_MOVL to clear the high bits.
14590 return SDValue();
14591 }
14592
14593 if (!IsV1Zeroable) {
14594 // If V1 can't be treated as a zero vector we have fewer options to lower
14595 // this. We can't support integer vectors or non-zero targets cheaply, and
14596 // the V1 elements can't be permuted in any way.
14597    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14598 if (!VT.isFloatingPoint() || V2Index != 0)
14599 return SDValue();
14600 SmallVector<int, 8> V1Mask(Mask);
14601 V1Mask[V2Index] = -1;
14602 if (!isNoopShuffleMask(V1Mask))
14603 return SDValue();
14604 if (!VT.is128BitVector())
14605 return SDValue();
14606
14607 // Otherwise, use MOVSD, MOVSS or MOVSH.
14608 unsigned MovOpc = 0;
14609 if (EltVT == MVT::f16)
14610 MovOpc = X86ISD::MOVSH;
14611 else if (EltVT == MVT::f32)
14612 MovOpc = X86ISD::MOVSS;
14613 else if (EltVT == MVT::f64)
14614 MovOpc = X86ISD::MOVSD;
14615 else
14616      llvm_unreachable("Unsupported floating point element type to handle!");
14617 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14618 }
14619
14620 // This lowering only works for the low element with floating point vectors.
14621 if (VT.isFloatingPoint() && V2Index != 0)
14622 return SDValue();
14623
14624 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14625 if (ExtVT != VT)
14626 V2 = DAG.getBitcast(VT, V2);
14627
14628 if (V2Index != 0) {
14629 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14630 // the desired position. Otherwise it is more efficient to do a vector
14631 // shift left. We know that we can do a vector shift left because all
14632 // the inputs are zero.
14633 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14634 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14635 V2Shuffle[V2Index] = 0;
14636 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14637 } else {
14638 V2 = DAG.getBitcast(MVT::v16i8, V2);
14639 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14640 DAG.getTargetConstant(
14641 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14642 V2 = DAG.getBitcast(VT, V2);
14643 }
14644 }
14645 return V2;
14646}
14647
14648/// Try to lower a broadcast of a single truncated integer element coming
14649/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
14650///
14651/// This assumes we have AVX2.
14652static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14653 int BroadcastIdx,
14654 const X86Subtarget &Subtarget,
14655 SelectionDAG &DAG) {
14656  assert(Subtarget.hasAVX2() &&
14657         "We can only lower integer broadcasts with AVX2!");
14658
14659 MVT EltVT = VT.getVectorElementType();
14660 MVT V0VT = V0.getSimpleValueType();
14661
14662  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14663  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14664
14665 MVT V0EltVT = V0VT.getVectorElementType();
14666 if (!V0EltVT.isInteger())
14667 return SDValue();
14668
14669 const unsigned EltSize = EltVT.getSizeInBits();
14670 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14671
14672 // This is only a truncation if the original element type is larger.
14673 if (V0EltSize <= EltSize)
14674 return SDValue();
14675
14676  assert(((V0EltSize % EltSize) == 0) &&
14677         "Scalar type sizes must all be powers of 2 on x86!");
14678
14679 const unsigned V0Opc = V0.getOpcode();
14680 const unsigned Scale = V0EltSize / EltSize;
14681 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14682
14683 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14684 V0Opc != ISD::BUILD_VECTOR)
14685 return SDValue();
14686
14687 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14688
14689 // If we're extracting non-least-significant bits, shift so we can truncate.
14690 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14691 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14692 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14693 if (const int OffsetIdx = BroadcastIdx % Scale)
14694 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14695 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14696
14697 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14698 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14699}
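// Scalar model of the SRL + TRUNCATE rework above, with the element width
// fixed to 8 bits for the example: broadcasting piece OffsetIdx of a wider
// scalar is a right shift by OffsetIdx * EltSize followed by a truncate.
// Helper name and fixed width are assumptions for the sketch.
#include <cstdint>

static uint8_t truncatedPiece(uint64_t WideScalar, unsigned OffsetIdx) {
  return static_cast<uint8_t>(WideScalar >> (OffsetIdx * 8));
}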
14700
14701/// Test whether this can be lowered with a single SHUFPS instruction.
14702///
14703/// This is used to disable more specialized lowerings when the shufps lowering
14704/// will happen to be efficient.
14705static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14706 // This routine only handles 128-bit shufps.
14707  assert(Mask.size() == 4 && "Unsupported mask size!");
14708  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14709  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14710  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14711  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14712
14713 // To lower with a single SHUFPS we need to have the low half and high half
14714 // each requiring a single input.
14715 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14716 return false;
14717 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14718 return false;
14719
14720 return true;
14721}
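// The same rule restated on a plain 4-element mask, as a standalone sketch:
// SHUFPS builds its low two lanes from one source and its high two lanes
// from one source, so neither half may mix inputs (index < 4 selects V1,
// index >= 4 selects V2, -1 is undef). The helper name is illustrative only.
#include <array>

static bool halvesDontMixInputs(const std::array<int, 4> &Mask) {
  auto Mixes = [](int A, int B) {
    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
  };
  return !Mixes(Mask[0], Mask[1]) && !Mixes(Mask[2], Mask[3]);
}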
14722
14723/// Test whether the specified input (0 or 1) is in-place blended by the
14724/// given mask.
14725///
14726/// This returns true if the elements from a particular input are already in the
14727/// slot required by the given mask and require no permutation.
14728static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14729  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14730 int Size = Mask.size();
14731 for (int i = 0; i < Size; ++i)
14732 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14733 return false;
14734
14735 return true;
14736}
14737
14738/// If we are extracting two 128-bit halves of a vector and shuffling the
14739/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14740/// multi-shuffle lowering.
14741static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14742 SDValue N1, ArrayRef<int> Mask,
14743 SelectionDAG &DAG) {
14744 MVT VT = N0.getSimpleValueType();
14745  assert((VT.is128BitVector() &&
14746          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14747         "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14748
14749 // Check that both sources are extracts of the same source vector.
14750 if (!N0.hasOneUse() || !N1.hasOneUse() ||
14751 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14752 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14753 N0.getOperand(0) != N1.getOperand(0))
14754 return SDValue();
14755
14756 SDValue WideVec = N0.getOperand(0);
14757 MVT WideVT = WideVec.getSimpleValueType();
14758 if (!WideVT.is256BitVector())
14759 return SDValue();
14760
14761 // Match extracts of each half of the wide source vector. Commute the shuffle
14762 // if the extract of the low half is N1.
14763 unsigned NumElts = VT.getVectorNumElements();
14764 SmallVector<int, 4> NewMask(Mask);
14765 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14766 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14767 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14768 ShuffleVectorSDNode::commuteMask(NewMask);
14769 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14770 return SDValue();
14771
14772 // Final bailout: if the mask is simple, we are better off using an extract
14773 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14774 // because that avoids a constant load from memory.
14775 if (NumElts == 4 &&
14776 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14777 return SDValue();
14778
14779 // Extend the shuffle mask with undef elements.
14780 NewMask.append(NumElts, -1);
14781
14782 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14783 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14784 NewMask);
14785 // This is free: ymm -> xmm.
14786 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14787 DAG.getIntPtrConstant(0, DL));
14788}
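// The mask rework above, restated: a shuffle of the two 128-bit halves of X
// becomes a shuffle of X itself with the same indices plus trailing undefs,
// and only the low half of the wide result is kept. The helper is a
// standalone illustration, not the LLVM API.
#include <vector>

static std::vector<int> widenHalfShuffleMask(std::vector<int> Mask,
                                             unsigned NumElts) {
  Mask.insert(Mask.end(), NumElts, -1); // pad with undef lanes
  return Mask;
}
// e.g. with NumElts == 4, { 0, 7, 2, 5 } becomes { 0, 7, 2, 5, -1, -1, -1, -1 },
// matching the "shuf (extract X, 0), (extract X, 4), M" comment above.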
14789
14790/// Try to lower broadcast of a single element.
14791///
14792/// For convenience, this code also bundles all of the subtarget feature set
14793/// filtering. While a little annoying to re-dispatch on type here, there isn't
14794/// a convenient way to factor it out.
14795static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14796 SDValue V2, ArrayRef<int> Mask,
14797 const X86Subtarget &Subtarget,
14798 SelectionDAG &DAG) {
14799 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14800 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14801 (Subtarget.hasAVX2() && VT.isInteger())))
14802 return SDValue();
14803
14804 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14805 // we can only broadcast from a register with AVX2.
14806 unsigned NumEltBits = VT.getScalarSizeInBits();
14807 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14808 ? X86ISD::MOVDDUP
14809 : X86ISD::VBROADCAST;
14810 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14811
14812 // Check that the mask is a broadcast.
14813 int BroadcastIdx = getSplatIndex(Mask);
14814 if (BroadcastIdx < 0)
14815 return SDValue();
14816  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14817                                            "a sorted mask where the broadcast "
14818                                            "comes from V1.");
14819
14820 // Go up the chain of (vector) values to find a scalar load that we can
14821 // combine with the broadcast.
14822 // TODO: Combine this logic with findEltLoadSrc() used by
14823 // EltsFromConsecutiveLoads().
14824 int BitOffset = BroadcastIdx * NumEltBits;
14825 SDValue V = V1;
14826 for (;;) {
14827 switch (V.getOpcode()) {
14828 case ISD::BITCAST: {
14829 V = V.getOperand(0);
14830 continue;
14831 }
14832 case ISD::CONCAT_VECTORS: {
14833 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14834 int OpIdx = BitOffset / OpBitWidth;
14835 V = V.getOperand(OpIdx);
14836 BitOffset %= OpBitWidth;
14837 continue;
14838 }
14839 case ISD::EXTRACT_SUBVECTOR: {
14840 // The extraction index adds to the existing offset.
14841 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14842 unsigned Idx = V.getConstantOperandVal(1);
14843 unsigned BeginOffset = Idx * EltBitWidth;
14844 BitOffset += BeginOffset;
14845 V = V.getOperand(0);
14846 continue;
14847 }
14848 case ISD::INSERT_SUBVECTOR: {
14849 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14850 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14851 int Idx = (int)V.getConstantOperandVal(2);
14852 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14853 int BeginOffset = Idx * EltBitWidth;
14854 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14855 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14856 BitOffset -= BeginOffset;
14857 V = VInner;
14858 } else {
14859 V = VOuter;
14860 }
14861 continue;
14862 }
14863 }
14864 break;
14865 }
14866 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14867 BroadcastIdx = BitOffset / NumEltBits;
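// Worked example of the offset bookkeeping above (illustrative; the types and
// index are hypothetical): broadcasting element 5 of a v8i32 gives
// BitOffset = 5 * 32 = 160. If V1 is a CONCAT_VECTORS of two v4i32 operands
// (128 bits each), OpIdx = 160 / 128 = 1 and BitOffset becomes 160 % 128 = 32,
// so the walk continues into the second operand and the final
// BroadcastIdx = 32 / 32 = 1 within that operand.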
14868
14869 // Do we need to bitcast the source to retrieve the original broadcast index?
14870 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14871
14872 // Check if this is a broadcast of a scalar. We special case lowering
14873 // for scalars so that we can more effectively fold with loads.
14874 // If the original value has a larger element type than the shuffle, the
14875 // broadcast element is in essence truncated. Make that explicit to ease
14876 // folding.
14877 if (BitCastSrc && VT.isInteger())
14878 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14879 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14880 return TruncBroadcast;
14881
14882 // Also check the simpler case, where we can directly reuse the scalar.
14883 if (!BitCastSrc &&
14884 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14885 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14886 V = V.getOperand(BroadcastIdx);
14887
14888 // If we can't broadcast from a register, check that the input is a load.
14889 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14890 return SDValue();
14891 } else if (ISD::isNormalLoad(V.getNode()) &&
14892 cast<LoadSDNode>(V)->isSimple()) {
14893 // We do not check for one-use of the vector load because a broadcast load
14894 // is expected to be a win for code size, register pressure, and possibly
14895 // uops even if the original vector load is not eliminated.
14896
14897 // Reduce the vector load and shuffle to a broadcasted scalar load.
14898 LoadSDNode *Ld = cast<LoadSDNode>(V);
14899 SDValue BaseAddr = Ld->getOperand(1);
14900 MVT SVT = VT.getScalarType();
14901 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14902 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14903 SDValue NewAddr =
14904 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
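// Worked example (illustrative; the load and index are hypothetical): for a
// v4f32 shuffle that splats element 2 of a simple vector load, SVT is f32,
// Offset = 2 * 4 = 8 bytes (matching BitOffset = 64), and NewAddr points 8
// bytes past the original base, so on AVX targets the whole pattern becomes a
// single VBROADCAST_LOAD of that f32 element.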
14905
14906 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14907 // than MOVDDUP.
14908 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14909 if (Opcode == X86ISD::VBROADCAST) {
14910 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14911 SDValue Ops[] = {Ld->getChain(), NewAddr};
14912 V = DAG.getMemIntrinsicNode(
14913 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14914 DAG.getMachineFunction().getMachineMemOperand(
14915 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14916 DAG.makeEquivalentMemoryOrdering(Ld, V);
14917 return DAG.getBitcast(VT, V);
14918 }
14919 assert(SVT == MVT::f64 && "Unexpected VT!");
14920 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14921 DAG.getMachineFunction().getMachineMemOperand(
14922 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14923 DAG.makeEquivalentMemoryOrdering(Ld, V);
14924 } else if (!BroadcastFromReg) {
14925 // We can't broadcast from a vector register.
14926 return SDValue();
14927 } else if (BitOffset != 0) {
14928 // We can only broadcast from the zero-element of a vector register,
14929 // but it can be advantageous to broadcast from the zero-element of a
14930 // subvector.
14931 if (!VT.is256BitVector() && !VT.is512BitVector())
14932 return SDValue();
14933
14934 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14935 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14936 return SDValue();
14937
14938 // Only broadcast the zero-element of a 128-bit subvector.
14939 if ((BitOffset % 128) != 0)
14940 return SDValue();
14941
14942 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14943        "Unexpected bit-offset");
14944 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14945        "Unexpected vector size");
14946 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14947 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14948 }
14949
14950 // On AVX we can use VBROADCAST directly for scalar sources.
14951 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14952 V = DAG.getBitcast(MVT::f64, V);
14953 if (Subtarget.hasAVX()) {
14954 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14955 return DAG.getBitcast(VT, V);
14956 }
14957 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14958 }
14959
14960 // If this is a scalar, do the broadcast on this type and bitcast.
14961 if (!V.getValueType().isVector()) {
14962 assert(V.getScalarValueSizeInBits() == NumEltBits &&
14963        "Unexpected scalar size");
14964 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14965 VT.getVectorNumElements());
14966 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14967 }
14968
14969 // We only support broadcasting from 128-bit vectors to minimize the
14970 // number of patterns we need to deal with in isel. So extract down to
14971 // 128-bits, removing as many bitcasts as possible.
14972 if (V.getValueSizeInBits() > 128)
14973 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14974
14975 // Otherwise cast V to a vector with the same element type as VT, but
14976 // possibly narrower than VT. Then perform the broadcast.
14977 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14978 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14979 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14980}
14981
14982// Check for whether we can use INSERTPS to perform the shuffle. We only use
14983// INSERTPS when the V1 elements are already in the correct locations
14984// because otherwise we can just always use two SHUFPS instructions which
14985// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14986// perform INSERTPS if a single V1 element is out of place and all V2
14987// elements are zeroable.
14988static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14989 unsigned &InsertPSMask,
14990 const APInt &Zeroable,
14991 ArrayRef<int> Mask, SelectionDAG &DAG) {
14992 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14993 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14994 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14995
14996 // Attempt to match INSERTPS with one element from VA or VB being
14997 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14998 // are updated.
14999 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15000 ArrayRef<int> CandidateMask) {
15001 unsigned ZMask = 0;
15002 int VADstIndex = -1;
15003 int VBDstIndex = -1;
15004 bool VAUsedInPlace = false;
15005
15006 for (int i = 0; i < 4; ++i) {
15007 // Synthesize a zero mask from the zeroable elements (includes undefs).
15008 if (Zeroable[i]) {
15009 ZMask |= 1 << i;
15010 continue;
15011 }
15012
15013 // Flag if we use any VA inputs in place.
15014 if (i == CandidateMask[i]) {
15015 VAUsedInPlace = true;
15016 continue;
15017 }
15018
15019 // We can only insert a single non-zeroable element.
15020 if (VADstIndex >= 0 || VBDstIndex >= 0)
15021 return false;
15022
15023 if (CandidateMask[i] < 4) {
15024 // VA input out of place for insertion.
15025 VADstIndex = i;
15026 } else {
15027 // VB input for insertion.
15028 VBDstIndex = i;
15029 }
15030 }
15031
15032 // Don't bother if we have no (non-zeroable) element for insertion.
15033 if (VADstIndex < 0 && VBDstIndex < 0)
15034 return false;
15035
15036 // Determine element insertion src/dst indices. The src index is from the
15037 // start of the inserted vector, not the start of the concatenated vector.
15038 unsigned VBSrcIndex = 0;
15039 if (VADstIndex >= 0) {
15040 // If we have a VA input out of place, we use VA as the V2 element
15041 // insertion and don't use the original V2 at all.
15042 VBSrcIndex = CandidateMask[VADstIndex];
15043 VBDstIndex = VADstIndex;
15044 VB = VA;
15045 } else {
15046 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15047 }
15048
15049 // If no V1 inputs are used in place, then the result is created only from
15050 // the zero mask and the V2 insertion - so remove V1 dependency.
15051 if (!VAUsedInPlace)
15052 VA = DAG.getUNDEF(MVT::v4f32);
15053
15054 // Update V1, V2 and InsertPSMask accordingly.
15055 V1 = VA;
15056 V2 = VB;
15057
15058 // Insert the V2 element into the desired position.
15059 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
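// Worked example of the immediate encoding above (illustrative; the indices
// are hypothetical): VBSrcIndex = 2, VBDstIndex = 1 and ZMask = 0b1000 give
// InsertPSMask = (2 << 6) | (1 << 4) | 0b1000 = 0x98, i.e. INSERTPS copies
// source element 2 into destination element 1 and zeroes destination element 3.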
15060 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15061 return true;
15062 };
15063
15064 if (matchAsInsertPS(V1, V2, Mask))
15065 return true;
15066
15067 // Commute and try again.
15068 SmallVector<int, 4> CommutedMask(Mask);
15069 ShuffleVectorSDNode::commuteMask(CommutedMask);
15070 if (matchAsInsertPS(V2, V1, CommutedMask))
15071 return true;
15072
15073 return false;
15074}
15075
15076static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15077 ArrayRef<int> Mask, const APInt &Zeroable,
15078 SelectionDAG &DAG) {
15079 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15080 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15081
15082 // Attempt to match the insertps pattern.
15083 unsigned InsertPSMask = 0;
15084 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15085 return SDValue();
15086
15087 // Insert the V2 element into the desired position.
15088 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15089 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15090}
15091
15092/// Handle lowering of 2-lane 64-bit floating point shuffles.
15093///
15094/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15095/// support for floating point shuffles but not integer shuffles. These
15096/// instructions will incur a domain crossing penalty on some chips though so
15097/// it is better to avoid lowering through this for integer vectors where
15098/// possible.
15099static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15100 const APInt &Zeroable, SDValue V1, SDValue V2,
15101 const X86Subtarget &Subtarget,
15102 SelectionDAG &DAG) {
15103 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15104 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15105 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15106
15107 if (V2.isUndef()) {
15108 // Check for being able to broadcast a single element.
15109 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15110 Mask, Subtarget, DAG))
15111 return Broadcast;
15112
15113 // Straight shuffle of a single input vector. Simulate this by using the
15114 // single input as both of the "inputs" to this instruction.
15115 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15116
15117 if (Subtarget.hasAVX()) {
15118 // If we have AVX, we can use VPERMILPS which will allow folding a load
15119 // into the shuffle.
15120 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15121 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15122 }
15123
15124 return DAG.getNode(
15125 X86ISD::SHUFP, DL, MVT::v2f64,
15126 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15127 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15128 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15129 }
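// Worked example for the single-input path above (illustrative; the mask is
// hypothetical): Mask = {1, 0} gives SHUFPDMask = 1 | (0 << 1) = 1, so with
// AVX this becomes VPERMILPD with immediate 1, and without AVX it becomes
// SHUFPD of V1 with itself using the same immediate.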
15130 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15131 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15132 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15133 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15134
15135 if (Subtarget.hasAVX2())
15136 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15137 return Extract;
15138
15139 // When loading a scalar and then shuffling it into a vector we can often do
15140 // the insertion cheaply.
15141 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15142 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15143 return Insertion;
15144 // Try inverting the insertion since for v2 masks it is easy to do and we
15145 // can't reliably sort the mask one way or the other.
15146 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15147 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15148 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15149 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15150 return Insertion;
15151
15152 // Try to use one of the special instruction patterns to handle two common
15153 // blend patterns if a zero-blend above didn't work.
15154 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15155 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15156 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15157 // We can either use a special instruction to load over the low double or
15158 // to move just the low double.
15159 return DAG.getNode(
15160 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15161 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15162
15163 if (Subtarget.hasSSE41())
15164 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15165 Zeroable, Subtarget, DAG))
15166 return Blend;
15167
15168 // Use dedicated unpack instructions for masks that match their pattern.
15169 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15170 return V;
15171
15172 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15173 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15174 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15175}
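// Worked example for the final two-input SHUFPD above (illustrative; the mask
// is hypothetical): Mask = {1, 3} gives SHUFPDMask = 1 | (1 << 1) = 3, i.e.
// SHUFPD selects element 1 of V1 and element 1 of V2.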
15176
15177/// Handle lowering of 2-lane 64-bit integer shuffles.
15178///
15179/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15180/// the integer unit to minimize domain crossing penalties. However, for blends
15181/// it falls back to the floating point shuffle operation with appropriate bit
15182/// casting.
15183static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15184 const APInt &Zeroable, SDValue V1, SDValue V2,
15185 const X86Subtarget &Subtarget,
15186 SelectionDAG &DAG) {
15187 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15188 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15189 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15190
15191 if (V2.isUndef()) {
15192 // Check for being able to broadcast a single element.
15193 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15194 Mask, Subtarget, DAG))
15195 return Broadcast;
15196
15197 // Straight shuffle of a single input vector. For everything from SSE2
15198 // onward this has a single fast instruction with no scary immediates.
15199 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15200 V1 = DAG.getBitcast(MVT::v4i32, V1);
15201 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15202 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15203 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15204 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15205 return DAG.getBitcast(
15206 MVT::v2i64,
15207 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15208 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15209 }
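// Worked example for the widened mask above (illustrative; the mask is
// hypothetical): a v2i64 Mask = {1, 0} widens to the v4i32 mask {2, 3, 0, 1},
// which getV4X86ShuffleImm8ForMask encodes as PSHUFD immediate 0x4E.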
15210 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15211 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15212 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15213 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15214
15215 if (Subtarget.hasAVX2())
15216 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15217 return Extract;
15218
15219 // Try to use shift instructions.
15220 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
15221 Zeroable, Subtarget, DAG))
15222 return Shift;
15223
15224 // When loading a scalar and then shuffling it into a vector we can often do
15225 // the insertion cheaply.
15226 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15227 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15228 return Insertion;
15229 // Try inverting the insertion since for v2 masks it is easy to do and we
15230 // can't reliably sort the mask one way or the other.
15231 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15232 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15233 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15234 return Insertion;
15235
15236 // We have different paths for blend lowering, but they all must use the
15237 // *exact* same predicate.
15238 bool IsBlendSupported = Subtarget.hasSSE41();
15239 if (IsBlendSupported)
15240 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15241 Zeroable, Subtarget, DAG))
15242 return Blend;
15243
15244 // Use dedicated unpack instructions for masks that match their pattern.
15245 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15246 return V;
15247
15248 // Try to use byte rotation instructions.
15249 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15250 if (Subtarget.hasSSSE3()) {
15251 if (Subtarget.hasVLX())
15252 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15253 Subtarget, DAG))
15254 return Rotate;
15255
15256 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15257 Subtarget, DAG))
15258 return Rotate;
15259 }
15260
15261 // If we have direct support for blends, we should lower by decomposing into
15262 // a permute. That will be faster than the domain cross.
15263 if (IsBlendSupported)
15264 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15265 Subtarget, DAG);
15266
15267 // We implement this with SHUFPD which is pretty lame because it will likely
15268 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15269 // However, all the alternatives are still more cycles and newer chips don't
15270 // have this problem. It would be really nice if x86 had better shuffles here.
15271 V1 = DAG.getBitcast(MVT::v2f64, V1);
15272 V2 = DAG.getBitcast(MVT::v2f64, V2);
15273 return DAG.getBitcast(MVT::v2i64,
15274 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15275}
15276
15277/// Lower a vector shuffle using the SHUFPS instruction.
15278///
15279/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15280/// It makes no assumptions about whether this is the *best* lowering, it simply
15281/// uses it.
15282static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15283 ArrayRef<int> Mask, SDValue V1,
15284 SDValue V2, SelectionDAG &DAG) {
15285 SDValue LowV = V1, HighV = V2;
15286 SmallVector<int, 4> NewMask(Mask);
15287 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15288
15289 if (NumV2Elements == 1) {
15290 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15291
15292 // Compute the index adjacent to V2Index and in the same half by toggling
15293 // the low bit.
15294 int V2AdjIndex = V2Index ^ 1;
15295
15296 if (Mask[V2AdjIndex] < 0) {
15297 // Handles all the cases where we have a single V2 element and an undef.
15298 // This will only ever happen in the high lanes because we commute the
15299 // vector otherwise.
15300 if (V2Index < 2)
15301 std::swap(LowV, HighV);
15302 NewMask[V2Index] -= 4;
15303 } else {
15304 // Handle the case where the V2 element ends up adjacent to a V1 element.
15305 // To make this work, blend them together as the first step.
15306 int V1Index = V2AdjIndex;
15307 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15308 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15309 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15310
15311 // Now proceed to reconstruct the final blend as we have the necessary
15312 // high or low half formed.
15313 if (V2Index < 2) {
15314 LowV = V2;
15315 HighV = V1;
15316 } else {
15317 HighV = V2;
15318 }
15319 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15320 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15321 }
15322 } else if (NumV2Elements == 2) {
15323 if (Mask[0] < 4 && Mask[1] < 4) {
15324 // Handle the easy case where we have V1 in the low lanes and V2 in the
15325 // high lanes.
15326 NewMask[2] -= 4;
15327 NewMask[3] -= 4;
15328 } else if (Mask[2] < 4 && Mask[3] < 4) {
15329 // We also handle the reversed case because this utility may get called
15330 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15331 // arrange things in the right direction.
15332 NewMask[0] -= 4;
15333 NewMask[1] -= 4;
15334 HighV = V1;
15335 LowV = V2;
15336 } else {
15337 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15338 // trying to place elements directly, just blend them and set up the final
15339 // shuffle to place them.
15340
15341 // The first two blend mask elements are for V1, the second two are for
15342 // V2.
15343 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15344 Mask[2] < 4 ? Mask[2] : Mask[3],
15345 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15346 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15347 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15348 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15349
15350 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15351 // a blend.
15352 LowV = HighV = V1;
15353 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15354 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15355 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15356 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15357 }
15358 } else if (NumV2Elements == 3) {
15359 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15360 // we can get here due to other paths (e.g repeated mask matching) that we
15361 // don't want to do another round of lowerVECTOR_SHUFFLE.
15362 ShuffleVectorSDNode::commuteMask(NewMask);
15363 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15364 }
15365 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15366 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15367}
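// Worked example for the mixed NumV2Elements == 2 case above (illustrative;
// the mask is hypothetical): Mask = {0, 4, 1, 5} first builds
// BlendMask = {0, 1, 0, 1}, producing [V1[0], V1[1], V2[0], V2[1]], and the
// final NewMask = {0, 2, 1, 3} on that temporary yields
// [V1[0], V2[0], V1[1], V2[1]] as required. (In practice such a mask is
// usually caught earlier by the UNPCK lowering.)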
15368
15369/// Lower 4-lane 32-bit floating point shuffles.
15370///
15371/// Uses instructions exclusively from the floating point unit to minimize
15372/// domain crossing penalties, as these are sufficient to implement all v4f32
15373/// shuffles.
15374static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15375 const APInt &Zeroable, SDValue V1, SDValue V2,
15376 const X86Subtarget &Subtarget,
15377 SelectionDAG &DAG) {
15378 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15379 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15380 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15381
15382 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15383
15384 if (NumV2Elements == 0) {
15385 // Check for being able to broadcast a single element.
15386 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15387 Mask, Subtarget, DAG))
15388 return Broadcast;
15389
15390 // Use even/odd duplicate instructions for masks that match their pattern.
15391 if (Subtarget.hasSSE3()) {
15392 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15393 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15394 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15395 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15396 }
15397
15398 if (Subtarget.hasAVX()) {
15399 // If we have AVX, we can use VPERMILPS which will allow folding a load
15400 // into the shuffle.
15401 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15402 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15403 }
15404
15405 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15406 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15407 if (!Subtarget.hasSSE2()) {
15408 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15409 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15410 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15411 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15412 }
15413
15414 // Otherwise, use a straight shuffle of a single input vector. We pass the
15415 // input vector to both operands to simulate this with a SHUFPS.
15416 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15417 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15418 }
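// Worked example of the immediate encoding used above (illustrative; the mask
// is hypothetical): getV4X86ShuffleImm8ForMask packs each index into two bits,
// least-significant element first, so Mask = {0, 0, 2, 2} encodes as 0xA0 and
// the unary SHUFPS/VPERMILPS duplicates the even elements just like MOVSLDUP.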
15419
15420 if (Subtarget.hasAVX2())
15421 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15422 return Extract;
15423
15424 // There are special ways we can lower some single-element blends. However, we
15425 // have custom ways we can lower more complex single-element blends below that
15426 // we defer to if both this and BLENDPS fail to match, so restrict this to
15427 // when the V2 input is targeting element 0 of the mask -- that is the fast
15428 // case here.
15429 if (NumV2Elements == 1 && Mask[0] >= 4)
15430 if (SDValue V = lowerShuffleAsElementInsertion(
15431 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15432 return V;
15433
15434 if (Subtarget.hasSSE41()) {
15435 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15436 Zeroable, Subtarget, DAG))
15437 return Blend;
15438
15439 // Use INSERTPS if we can complete the shuffle efficiently.
15440 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15441 return V;
15442
15443 if (!isSingleSHUFPSMask(Mask))
15444 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15445 V2, Mask, DAG))
15446 return BlendPerm;
15447 }
15448
15449 // Use low/high mov instructions. These are only valid in SSE1 because
15450 // otherwise they are widened to v2f64 and never get here.
15451 if (!Subtarget.hasSSE2()) {
15452 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15453 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15454 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15455 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15456 }
15457
15458 // Use dedicated unpack instructions for masks that match their pattern.
15459 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15460 return V;
15461
15462 // Otherwise fall back to a SHUFPS lowering strategy.
15463 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15464}
15465
15466/// Lower 4-lane i32 vector shuffles.
15467///
15468/// We try to handle these with integer-domain shuffles where we can, but for
15469/// blends we use the floating point domain blend instructions.
15470static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15471 const APInt &Zeroable, SDValue V1, SDValue V2,
15472 const X86Subtarget &Subtarget,
15473 SelectionDAG &DAG) {
15474 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15475 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15476 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15477
15478 // Whenever we can lower this as a zext, that instruction is strictly faster
15479 // than any alternative. It also allows us to fold memory operands into the
15480 // shuffle in many cases.
15481 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15482 Zeroable, Subtarget, DAG))
15483 return ZExt;
15484
15485 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15486
15487 if (NumV2Elements == 0) {
15488 // Try to use broadcast unless the mask only has one non-undef element.
15489 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15490 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15491 Mask, Subtarget, DAG))
15492 return Broadcast;
15493 }
15494
15495 // Straight shuffle of a single input vector. For everything from SSE2
15496 // onward this has a single fast instruction with no scary immediates.
15497 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15498 // but we aren't actually going to use the UNPCK instruction because doing
15499 // so prevents folding a load into this instruction or making a copy.
15500 const int UnpackLoMask[] = {0, 0, 1, 1};
15501 const int UnpackHiMask[] = {2, 2, 3, 3};
15502 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15503 Mask = UnpackLoMask;
15504 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15505 Mask = UnpackHiMask;
15506
15507 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15508 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15509 }
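// Worked example for the coercion above (illustrative; the mask is
// hypothetical): a Mask equivalent to {0, 0, 1, 1} (undefs allowed) is
// canonicalized to UnpackLoMask and emitted as PSHUFD with immediate 0x50
// rather than PUNPCKLDQ, which keeps the option of folding a load.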
15510
15511 if (Subtarget.hasAVX2())
15512 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15513 return Extract;
15514
15515 // Try to use shift instructions.
15516 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15517 Zeroable, Subtarget, DAG))
15518 return Shift;
15519
15520 // There are special ways we can lower some single-element blends.
15521 if (NumV2Elements == 1)
15522 if (SDValue V = lowerShuffleAsElementInsertion(
15523 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15524 return V;
15525
15526 // We have different paths for blend lowering, but they all must use the
15527 // *exact* same predicate.
15528 bool IsBlendSupported = Subtarget.hasSSE41();
15529 if (IsBlendSupported)
15530 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15531 Zeroable, Subtarget, DAG))
15532 return Blend;
15533
15534 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15535 Zeroable, Subtarget, DAG))
15536 return Masked;
15537
15538 // Use dedicated unpack instructions for masks that match their pattern.
15539 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15540 return V;
15541
15542 // Try to use byte rotation instructions.
15543 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15544 if (Subtarget.hasSSSE3()) {
15545 if (Subtarget.hasVLX())
15546 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15547 Subtarget, DAG))
15548 return Rotate;
15549
15550 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15551 Subtarget, DAG))
15552 return Rotate;
15553 }
15554
15555 // Assume that a single SHUFPS is faster than an alternative sequence of
15556 // multiple instructions (even if the CPU has a domain penalty).
15557 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15558 if (!isSingleSHUFPSMask(Mask)) {
15559 // If we have direct support for blends, we should lower by decomposing into
15560 // a permute. That will be faster than the domain cross.
15561 if (IsBlendSupported)
15562 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15563 Subtarget, DAG);
15564
15565 // Try to lower by permuting the inputs into an unpack instruction.
15566 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15567 Mask, Subtarget, DAG))
15568 return Unpack;
15569 }
15570
15571 // We implement this with SHUFPS because it can blend from two vectors.
15572 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15573 // up the inputs, bypassing domain shift penalties that we would incur if we
15574 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15575 // relevant.
15576 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15577 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15578 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15579 return DAG.getBitcast(MVT::v4i32, ShufPS);
15580}
15581
15582/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15583/// shuffle lowering, and the most complex part.
15584///
15585/// The lowering strategy is to try to form pairs of input lanes which are
15586/// targeted at the same half of the final vector, and then use a dword shuffle
15587/// to place them onto the right half, and finally unpack the paired lanes into
15588/// their final position.
15589///
15590/// The exact breakdown of how to form these dword pairs and align them on the
15591/// correct sides is really tricky. See the comments within the function for
15592/// more of the details.
15593///
15594/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15595/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15596/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15597/// vector, form the analogous 128-bit 8-element Mask.
15598static SDValue lowerV8I16GeneralSingleInputShuffle(
15599 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15600 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15601 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15602 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15603
15604 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15605 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15606 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15607
15608 // Attempt to directly match PSHUFLW or PSHUFHW.
15609 if (isUndefOrInRange(LoMask, 0, 4) &&
15610 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15611 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15612 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15613 }
15614 if (isUndefOrInRange(HiMask, 4, 8) &&
15615 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15616 for (int i = 0; i != 4; ++i)
15617 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15618 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15619 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15620 }
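// Worked examples for the direct matches above (illustrative; the masks are
// hypothetical): Mask = {2, 0, 3, 1, 4, 5, 6, 7} matches the PSHUFLW path
// directly, while Mask = {0, 1, 2, 3, 6, 4, 7, 5} takes the PSHUFHW path with
// HiMask rebased to {2, 0, 3, 1} before the immediate is formed.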
15621
15622 SmallVector<int, 4> LoInputs;
15623 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15624 array_pod_sort(LoInputs.begin(), LoInputs.end());
15625 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15626 SmallVector<int, 4> HiInputs;
15627 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15628 array_pod_sort(HiInputs.begin(), HiInputs.end());
15629 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15630 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15631 int NumHToL = LoInputs.size() - NumLToL;
15632 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15633 int NumHToH = HiInputs.size() - NumLToH;
15634 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15635 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15636 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15637 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15638
15639 // If we are shuffling values from one half - check how many different DWORD
15640 // pairs we need to create. If only 1 or 2 then we can perform this as a
15641 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15642 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15643 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15644 V = DAG.getNode(ShufWOp, DL, VT, V,
15645 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15646 V = DAG.getBitcast(PSHUFDVT, V);
15647 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15648 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15649 return DAG.getBitcast(VT, V);
15650 };
15651
15652 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15653 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15654 SmallVector<std::pair<int, int>, 4> DWordPairs;
15655 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15656
15657 // Collect the different DWORD pairs.
15658 for (int DWord = 0; DWord != 4; ++DWord) {
15659 int M0 = Mask[2 * DWord + 0];
15660 int M1 = Mask[2 * DWord + 1];
15661 M0 = (M0 >= 0 ? M0 % 4 : M0);
15662 M1 = (M1 >= 0 ? M1 % 4 : M1);
15663 if (M0 < 0 && M1 < 0)
15664 continue;
15665
15666 bool Match = false;
15667 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15668 auto &DWordPair = DWordPairs[j];
15669 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15670 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15671 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15672 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15673 PSHUFDMask[DWord] = DOffset + j;
15674 Match = true;
15675 break;
15676 }
15677 }
15678 if (!Match) {
15679 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15680 DWordPairs.push_back(std::make_pair(M0, M1));
15681 }
15682 }
15683
15684 if (DWordPairs.size() <= 2) {
15685 DWordPairs.resize(2, std::make_pair(-1, -1));
15686 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15687 DWordPairs[1].first, DWordPairs[1].second};
15688 if ((NumHToL + NumHToH) == 0)
15689 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15690 if ((NumLToL + NumLToH) == 0)
15691 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15692 }
15693 }
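// Worked example for the DWORD-pair path above (illustrative; the mask is
// hypothetical): Mask = {0, 1, 0, 1, 2, 3, 2, 3} uses only the low half, forms
// the two pairs (0,1) and (2,3), and is built as an (identity) PSHUFLW with
// half-mask {0, 1, 2, 3} followed by a PSHUFD with mask {0, 0, 1, 1}.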
15694
15695 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15696 // such inputs we can swap two of the dwords across the half mark and end up
15697 // with <=2 inputs to each half in each half. Once there, we can fall through
15698 // to the generic code below. For example:
15699 //
15700 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15701 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15702 //
15703 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15704 // and an existing 2-into-2 on the other half. In this case we may have to
15705 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15706 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15707 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15708 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15709 // half than the one we target for fixing) will be fixed when we re-enter this
15710 // path. We will also combine away any sequence of PSHUFD instructions that
15711 // result into a single instruction. Here is an example of the tricky case:
15712 //
15713 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15714 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15715 //
15716 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15717 //
15718 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15719 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15720 //
15721 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15722 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15723 //
15724 // The result is fine to be handled by the generic logic.
15725 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15726 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15727 int AOffset, int BOffset) {
15728 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15729        "Must call this with A having 3 or 1 inputs from the A half.");
15730 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15731        "Must call this with B having 1 or 3 inputs from the B half.");
15732 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15733        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15734
15735 bool ThreeAInputs = AToAInputs.size() == 3;
15736
15737 // Compute the index of dword with only one word among the three inputs in
15738 // a half by taking the sum of the half with three inputs and subtracting
15739 // the sum of the actual three inputs. The difference is the remaining
15740 // slot.
15741 int ADWord = 0, BDWord = 0;
15742 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15743 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15744 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15745 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15746 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15747 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15748 int TripleNonInputIdx =
15749 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15750 TripleDWord = TripleNonInputIdx / 2;
15751
15752 // We use xor with one to compute the adjacent DWord to whichever one the
15753 // OneInput is in.
15754 OneInputDWord = (OneInput / 2) ^ 1;
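// Worked example of the two index tricks above (illustrative; the inputs are
// hypothetical): with AOffset = 0 and AToAInputs = {0, 1, 3}, TripleInputSum
// is 6 and the actual inputs sum to 4, so TripleNonInputIdx = 2 and
// ADWord = 1. If the lone B-half input is word 5, it lives in dword 2, so
// OneInputDWord = (5 / 2) ^ 1 = 3, the adjacent dword in the same half.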
15755
15756 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15757 // and BToA inputs. If there is also such a problem with the BToB and AToB
15758 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15759 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15760 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15761 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15762 // Compute how many inputs will be flipped by swapping these DWords. We
15763 // need
15764 // to balance this to ensure we don't form a 3-1 shuffle in the other
15765 // half.
15766 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15767 llvm::count(AToBInputs, 2 * ADWord + 1);
15768 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15769 llvm::count(BToBInputs, 2 * BDWord + 1);
15770 if ((NumFlippedAToBInputs == 1 &&
15771 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15772 (NumFlippedBToBInputs == 1 &&
15773 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15774 // We choose whether to fix the A half or B half based on whether that
15775 // half has zero flipped inputs. At zero, we may not be able to fix it
15776 // with that half. We also bias towards fixing the B half because that
15777 // will more commonly be the high half, and we have to bias one way.
15778 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15779 ArrayRef<int> Inputs) {
15780 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15781 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15782 // Determine whether the free index is in the flipped dword or the
15783 // unflipped dword based on where the pinned index is. We use this bit
15784 // in an xor to conditionally select the adjacent dword.
15785 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15786 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15787 if (IsFixIdxInput == IsFixFreeIdxInput)
15788 FixFreeIdx += 1;
15789 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15790 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15791        "We need to be changing the number of flipped inputs!");
15792 int PSHUFHalfMask[] = {0, 1, 2, 3};
15793 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15794 V = DAG.getNode(
15795 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15796 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15797 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15798
15799 for (int &M : Mask)
15800 if (M >= 0 && M == FixIdx)
15801 M = FixFreeIdx;
15802 else if (M >= 0 && M == FixFreeIdx)
15803 M = FixIdx;
15804 };
15805 if (NumFlippedBToBInputs != 0) {
15806 int BPinnedIdx =
15807 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15808 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15809 } else {
15810 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15811 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15812 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15813 }
15814 }
15815 }
15816
15817 int PSHUFDMask[] = {0, 1, 2, 3};
15818 PSHUFDMask[ADWord] = BDWord;
15819 PSHUFDMask[BDWord] = ADWord;
15820 V = DAG.getBitcast(
15821 VT,
15822 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15823 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15824
15825 // Adjust the mask to match the new locations of A and B.
15826 for (int &M : Mask)
15827 if (M >= 0 && M/2 == ADWord)
15828 M = 2 * BDWord + M % 2;
15829 else if (M >= 0 && M/2 == BDWord)
15830 M = 2 * ADWord + M % 2;
15831
15832 // Recurse back into this routine to re-compute state now that this isn't
15833 // a 3 and 1 problem.
15834 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15835 };
15836 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15837 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15838 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15839 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15840
15841 // At this point there are at most two inputs to the low and high halves from
15842 // each half. That means the inputs can always be grouped into dwords and
15843 // those dwords can then be moved to the correct half with a dword shuffle.
15844 // We use at most one low and one high word shuffle to collect these paired
15845 // inputs into dwords, and finally a dword shuffle to place them.
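// Rough illustration: for a single-input low-half mask such as <4,5,0,1>,
// words 0 and 1 stay put, the dword holding words 4 and 5 is pulled into the
// free low dword by the PSHUFD step, and a final PSHUFLW of <2,3,0,1> puts
// everything in order.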
15846 int PSHUFLMask[4] = {-1, -1, -1, -1};
15847 int PSHUFHMask[4] = {-1, -1, -1, -1};
15848 int PSHUFDMask[4] = {-1, -1, -1, -1};
15849
15850 // First fix the masks for all the inputs that are staying in their
15851 // original halves. This will then dictate the targets of the cross-half
15852 // shuffles.
15853 auto fixInPlaceInputs =
15854 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15855 MutableArrayRef<int> SourceHalfMask,
15856 MutableArrayRef<int> HalfMask, int HalfOffset) {
15857 if (InPlaceInputs.empty())
15858 return;
15859 if (InPlaceInputs.size() == 1) {
15860 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15861 InPlaceInputs[0] - HalfOffset;
15862 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15863 return;
15864 }
15865 if (IncomingInputs.empty()) {
15866 // Just fix all of the in place inputs.
15867 for (int Input : InPlaceInputs) {
15868 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15869 PSHUFDMask[Input / 2] = Input / 2;
15870 }
15871 return;
15872 }
15873
15874 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15875 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15876 InPlaceInputs[0] - HalfOffset;
15877 // Put the second input next to the first so that they are packed into
15878 // a dword. We find the adjacent index by toggling the low bit.
15879 int AdjIndex = InPlaceInputs[0] ^ 1;
15880 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15881 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15882 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15883 };
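// For instance, in-place low inputs {0, 3} (with other inputs still incoming)
// roughly produce the word mask <0, 3, -1, -1>: word 3 is pulled next to
// word 0, every 3 in the half mask is rewritten to 1, and dword 0 is marked
// as staying put in PSHUFDMask.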
15884 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15885 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15886
15887 // Now gather the cross-half inputs and place them into a free dword of
15888 // their target half.
15889 // FIXME: This operation could almost certainly be simplified dramatically to
15890 // look more like the 3-1 fixing operation.
15891 auto moveInputsToRightHalf = [&PSHUFDMask](
15892 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15893 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15894 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15895 int DestOffset) {
15896 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15897 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15898 };
15899 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15900 int Word) {
15901 int LowWord = Word & ~1;
15902 int HighWord = Word | 1;
15903 return isWordClobbered(SourceHalfMask, LowWord) ||
15904 isWordClobbered(SourceHalfMask, HighWord);
15905 };
15906
15907 if (IncomingInputs.empty())
15908 return;
15909
15910 if (ExistingInputs.empty()) {
15911 // Map any dwords with inputs from them into the right half.
15912 for (int Input : IncomingInputs) {
15913 // If the source half mask maps over the inputs, turn those into
15914 // swaps and use the swapped lane.
15915 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15916 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15917 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15918 Input - SourceOffset;
15919 // We have to swap the uses in our half mask in one sweep.
15920 for (int &M : HalfMask)
15921 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15922 M = Input;
15923 else if (M == Input)
15924 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15925 } else {
15926 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15927 Input - SourceOffset &&
15928 "Previous placement doesn't match!");
15929 }
15930 // Note that this correctly re-maps both when we do a swap and when
15931 // we observe the other side of the swap above. We rely on that to
15932 // avoid swapping the members of the input list directly.
15933 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15934 }
15935
15936 // Map the input's dword into the correct half.
15937 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15938 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15939 else
15940 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15941 Input / 2 &&
15942 "Previous placement doesn't match!");
15943 }
15944
15945 // And just directly shift any other-half mask elements to be same-half
15946 // as we will have mirrored the dword containing the element into the
15947 // same position within that half.
15948 for (int &M : HalfMask)
15949 if (M >= SourceOffset && M < SourceOffset + 4) {
15950 M = M - SourceOffset + DestOffset;
15951 assert(M >= 0 && "This should never wrap below zero!");
15952 }
15953 return;
15954 }
15955
15956 // Ensure we have the input in a viable dword of its current half. This
15957 // is particularly tricky because the original position may be clobbered
15958 // by inputs being moved and *staying* in that half.
15959 if (IncomingInputs.size() == 1) {
15960 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15961 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15962 SourceOffset;
15963 SourceHalfMask[InputFixed - SourceOffset] =
15964 IncomingInputs[0] - SourceOffset;
15965 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15966 InputFixed);
15967 IncomingInputs[0] = InputFixed;
15968 }
15969 } else if (IncomingInputs.size() == 2) {
15970 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15971 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15972 // We have two non-adjacent or clobbered inputs we need to extract from
15973 // the source half. To do this, we need to map them into some adjacent
15974 // dword slot in the source mask.
15975 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15976 IncomingInputs[1] - SourceOffset};
15977
15978 // If there is a free slot in the source half mask adjacent to one of
15979 // the inputs, place the other input in it. We use (Index XOR 1) to
15980 // compute an adjacent index.
15981 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15982 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15983 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15984 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15985 InputsFixed[1] = InputsFixed[0] ^ 1;
15986 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15987 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15988 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15989 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15990 InputsFixed[0] = InputsFixed[1] ^ 1;
15991 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15992 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15993 // The two inputs are in the same DWord but it is clobbered and the
15994 // adjacent DWord isn't used at all. Move both inputs to the free
15995 // slot.
15996 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15997 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15998 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15999 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16000 } else {
16001 // The only way we hit this point is if there is no clobbering
16002 // (because there are no off-half inputs to this half) and there is no
16003 // free slot adjacent to one of the inputs. In this case, we have to
16004 // swap an input with a non-input.
16005 for (int i = 0; i < 4; ++i)
16006 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16007 "We can't handle any clobbers here!");
16008 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16009 "Cannot have adjacent inputs here!");
16010
16011 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16012 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16013
16014 // We also have to update the final source mask in this case because
16015 // it may need to undo the above swap.
16016 for (int &M : FinalSourceHalfMask)
16017 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16018 M = InputsFixed[1] + SourceOffset;
16019 else if (M == InputsFixed[1] + SourceOffset)
16020 M = (InputsFixed[0] ^ 1) + SourceOffset;
16021
16022 InputsFixed[1] = InputsFixed[0] ^ 1;
16023 }
16024
16025 // Point everything at the fixed inputs.
16026 for (int &M : HalfMask)
16027 if (M == IncomingInputs[0])
16028 M = InputsFixed[0] + SourceOffset;
16029 else if (M == IncomingInputs[1])
16030 M = InputsFixed[1] + SourceOffset;
16031
16032 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16033 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16034 }
16035 } else {
16036 llvm_unreachable("Unhandled input size!");
16037 }
16038
16039 // Now hoist the DWord down to the right half.
16040 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16041 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16042 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16043 for (int &M : HalfMask)
16044 for (int Input : IncomingInputs)
16045 if (M == Input)
16046 M = FreeDWord * 2 + Input % 2;
16047 };
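// For example, incoming high inputs {6, 7} with low dword 0 already claimed
// should land in low dword 1: PSHUFDMask[1] becomes 3 (their source dword)
// and the 6s and 7s in the low-half mask are rewritten to 2 and 3.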
16048 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16049 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16050 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16051 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16052
16053 // Now enact all the shuffles we've computed to move the inputs into their
16054 // target half.
16055 if (!isNoopShuffleMask(PSHUFLMask))
16056 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16057 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16058 if (!isNoopShuffleMask(PSHUFHMask))
16059 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16060 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16061 if (!isNoopShuffleMask(PSHUFDMask))
16062 V = DAG.getBitcast(
16063 VT,
16064 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16065 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16066
16067 // At this point, each half should contain all its inputs, and we can then
16068 // just shuffle them into their final position.
16069 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16070 "Failed to lift all the high half inputs to the low mask!");
16071 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16072 "Failed to lift all the low half inputs to the high mask!");
16073
16074 // Do a half shuffle for the low mask.
16075 if (!isNoopShuffleMask(LoMask))
16076 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16077 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16078
16079 // Do a half shuffle with the high mask after shifting its values down.
16080 for (int &M : HiMask)
16081 if (M >= 0)
16082 M -= 4;
16083 if (!isNoopShuffleMask(HiMask))
16084 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16085 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16086
16087 return V;
16088}
16089
16090/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16091/// blend if only one input is used.
16092static SDValue lowerShuffleAsBlendOfPSHUFBs(
16093 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16094 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16095 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16096 "Lane crossing shuffle masks not supported");
16097
16098 int NumBytes = VT.getSizeInBits() / 8;
16099 int Size = Mask.size();
16100 int Scale = NumBytes / Size;
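// Rough illustration: for a v8i16 shuffle, Size == 8, NumBytes == 16 and
// Scale == 2, so a mask element of 9 (word 1 of V2) expands to 0x80, 0x80 in
// V1's byte mask (zeroing those lanes) and to byte indices 2, 3 in V2's.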
16101
16102 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16103 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16104 V1InUse = false;
16105 V2InUse = false;
16106
16107 for (int i = 0; i < NumBytes; ++i) {
16108 int M = Mask[i / Scale];
16109 if (M < 0)
16110 continue;
16111
16112 const int ZeroMask = 0x80;
16113 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16114 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16115 if (Zeroable[i / Scale])
16116 V1Idx = V2Idx = ZeroMask;
16117
16118 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16119 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16120 V1InUse |= (ZeroMask != V1Idx);
16121 V2InUse |= (ZeroMask != V2Idx);
16122 }
16123
16124 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16125 if (V1InUse)
16126 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16127 DAG.getBuildVector(ShufVT, DL, V1Mask));
16128 if (V2InUse)
16129 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16130 DAG.getBuildVector(ShufVT, DL, V2Mask));
16131
16132 // If we need shuffled inputs from both, blend the two.
16133 SDValue V;
16134 if (V1InUse && V2InUse)
16135 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16136 else
16137 V = V1InUse ? V1 : V2;
16138
16139 // Cast the result back to the correct type.
16140 return DAG.getBitcast(VT, V);
16141}
16142
16143/// Generic lowering of 8-lane i16 shuffles.
16144///
16145/// This handles both single-input shuffles and combined shuffle/blends with
16146/// two inputs. The single input shuffles are immediately delegated to
16147/// a dedicated lowering routine.
16148///
16149/// The blends are lowered in one of three fundamental ways. If there are few
16150/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16151/// of the input is significantly cheaper when lowered as an interleaving of
16152/// the two inputs, try to interleave them. Otherwise, blend the low and high
16153/// halves of the inputs separately (making them have relatively few inputs)
16154/// and then concatenate them.
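/// As a rough illustration, an exact interleaving such as <0,8,1,9,2,10,3,11>
/// can be matched directly by the UNPCK path below, while masks with many
/// inputs per half typically fall through to the blend/decompose fallbacks.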
16155static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16156 const APInt &Zeroable, SDValue V1, SDValue V2,
16157 const X86Subtarget &Subtarget,
16158 SelectionDAG &DAG) {
16159 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16160 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16161 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16162
16163 // Whenever we can lower this as a zext, that instruction is strictly faster
16164 // than any alternative.
16165 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16166 Zeroable, Subtarget, DAG))
16167 return ZExt;
16168
16169 // Try to lower using a truncation.
16170 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16171 Subtarget, DAG))
16172 return V;
16173
16174 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16175
16176 if (NumV2Inputs == 0) {
16177 // Try to use shift instructions.
16178 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
16179 Zeroable, Subtarget, DAG))
16180 return Shift;
16181
16182 // Check for being able to broadcast a single element.
16183 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16184 Mask, Subtarget, DAG))
16185 return Broadcast;
16186
16187 // Try to use bit rotation instructions.
16188 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16189 Subtarget, DAG))
16190 return Rotate;
16191
16192 // Use dedicated unpack instructions for masks that match their pattern.
16193 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16194 return V;
16195
16196 // Use dedicated pack instructions for masks that match their pattern.
16197 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16198 Subtarget))
16199 return V;
16200
16201 // Try to use byte rotation instructions.
16202 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16203 Subtarget, DAG))
16204 return Rotate;
16205
16206 // Make a copy of the mask so it can be modified.
16207 SmallVector<int, 8> MutableMask(Mask);
16208 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16209 Subtarget, DAG);
16210 }
16211
16212 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16213 "All single-input shuffles should be canonicalized to be V1-input "
16214 "shuffles.");
16215
16216 // Try to use shift instructions.
16217 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
16218 Zeroable, Subtarget, DAG))
16219 return Shift;
16220
16221 // See if we can use SSE4A Extraction / Insertion.
16222 if (Subtarget.hasSSE4A())
16223 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16224 Zeroable, DAG))
16225 return V;
16226
16227 // There are special ways we can lower some single-element blends.
16228 if (NumV2Inputs == 1)
16229 if (SDValue V = lowerShuffleAsElementInsertion(
16230 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16231 return V;
16232
16233 // We have different paths for blend lowering, but they all must use the
16234 // *exact* same predicate.
16235 bool IsBlendSupported = Subtarget.hasSSE41();
16236 if (IsBlendSupported)
16237 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16238 Zeroable, Subtarget, DAG))
16239 return Blend;
16240
16241 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16242 Zeroable, Subtarget, DAG))
16243 return Masked;
16244
16245 // Use dedicated unpack instructions for masks that match their pattern.
16246 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16247 return V;
16248
16249 // Use dedicated pack instructions for masks that match their pattern.
16250 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16251 Subtarget))
16252 return V;
16253
16254 // Try to lower using a truncation.
16255 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16256 Subtarget, DAG))
16257 return V;
16258
16259 // Try to use byte rotation instructions.
16260 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16261 Subtarget, DAG))
16262 return Rotate;
16263
16264 if (SDValue BitBlend =
16265 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16266 return BitBlend;
16267
16268 // Try to use byte shift instructions to mask.
16269 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16270 Zeroable, Subtarget, DAG))
16271 return V;
16272
16273 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
16274 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16275 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
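// For example, the two-input mask <0,2,4,6,8,10,12,14> gives
// NumEvenDrops == 1: both inputs are ANDed with a per-dword 0x0000FFFF mask
// to clear the odd words and a single PACKUSDW compacts the surviving words.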
16276 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16277 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16278 !Subtarget.hasVLX()) {
16279 // Check if this is part of a 256-bit vector truncation.
16280 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16281 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16282 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16283 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16284 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16285 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16286 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16287 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16288 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16289 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16290 } else {
16291 SmallVector<SDValue, 4> DWordClearOps(4,
16292 DAG.getConstant(0, DL, MVT::i32));
16293 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16294 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16295 SDValue DWordClearMask =
16296 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16297 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16298 DWordClearMask);
16299 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16300 DWordClearMask);
16301 }
16302 // Now pack things back together.
16303 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16304 if (NumEvenDrops == 2) {
16305 Result = DAG.getBitcast(MVT::v4i32, Result);
16306 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16307 }
16308 return Result;
16309 }
16310
16311 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
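// For example, <1,3,5,7,9,11,13,15> shifts each 32-bit lane right by 16 so
// the odd words drop into the low halves, then packs with PACKUSDW (or,
// pre-SSE41, an arithmetic shift plus PACKSSDW).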
16312 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16313 if (NumOddDrops == 1) {
16314 bool HasSSE41 = Subtarget.hasSSE41();
16315 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16316 DAG.getBitcast(MVT::v4i32, V1),
16317 DAG.getTargetConstant(16, DL, MVT::i8));
16318 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16319 DAG.getBitcast(MVT::v4i32, V2),
16320 DAG.getTargetConstant(16, DL, MVT::i8));
16321 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16322 MVT::v8i16, V1, V2);
16323 }
16324
16325 // Try to lower by permuting the inputs into an unpack instruction.
16326 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16327 Mask, Subtarget, DAG))
16328 return Unpack;
16329
16330 // If we can't directly blend but can use PSHUFB, that will be better as it
16331 // can both shuffle and set up the inefficient blend.
16332 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16333 bool V1InUse, V2InUse;
16334 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16335 Zeroable, DAG, V1InUse, V2InUse);
16336 }
16337
16338 // We can always bit-blend if we have to so the fallback strategy is to
16339 // decompose into single-input permutes and blends/unpacks.
16340 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16341 Mask, Subtarget, DAG);
16342}
16343
16344/// Lower 8-lane 16-bit floating point shuffles.
16345static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16346 const APInt &Zeroable, SDValue V1, SDValue V2,
16347 const X86Subtarget &Subtarget,
16348 SelectionDAG &DAG) {
16349 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16350 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16351 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16352 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16353
16354 if (Subtarget.hasFP16()) {
16355 if (NumV2Elements == 0) {
16356 // Check for being able to broadcast a single element.
16357 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16358 Mask, Subtarget, DAG))
16359 return Broadcast;
16360 }
16361 if (NumV2Elements == 1 && Mask[0] >= 8)
16362 if (SDValue V = lowerShuffleAsElementInsertion(
16363 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16364 return V;
16365 }
16366
16367 V1 = DAG.getBitcast(MVT::v8i16, V1);
16368 V2 = DAG.getBitcast(MVT::v8i16, V2);
16369 return DAG.getBitcast(MVT::v8f16,
16370 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16371}
16372
16373 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16374 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
16375 // the active subvector is extracted.
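// As a rough example, a v16i8 shuffle on an AVX512VBMI target without VLX is
// widened to v64i8, lowered as a single VPERMB/VPERMV3, and the low 128 bits
// of the result are extracted afterwards.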
16376static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16377 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16378 const X86Subtarget &Subtarget,
16379 SelectionDAG &DAG) {
16380 MVT MaskVT = VT.changeTypeToInteger();
16381 SDValue MaskNode;
16382 MVT ShuffleVT = VT;
16383 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16384 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16385 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16386 ShuffleVT = V1.getSimpleValueType();
16387
16388 // Adjust mask to correct indices for the second input.
16389 int NumElts = VT.getVectorNumElements();
16390 unsigned Scale = 512 / VT.getSizeInBits();
16391 SmallVector<int, 32> AdjustedMask(Mask);
16392 for (int &M : AdjustedMask)
16393 if (NumElts <= M)
16394 M += (Scale - 1) * NumElts;
16395 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16396 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16397 } else {
16398 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16399 }
16400
16401 SDValue Result;
16402 if (V2.isUndef())
16403 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16404 else
16405 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16406
16407 if (VT != ShuffleVT)
16408 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16409
16410 return Result;
16411}
16412
16413/// Generic lowering of v16i8 shuffles.
16414///
16415/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16416/// detect any complexity reducing interleaving. If that doesn't help, it uses
16417/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16418/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16419/// back together.
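/// As a rough illustration, an unstructured two-input mask typically becomes
/// two PSHUFBs (one per input) ORed together, whereas the byte-compaction
/// mask <0,2,4,...,30> deliberately skips the PSHUFB path and is lowered as
/// a PACKUSWB of two masked inputs.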
16420static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16421 const APInt &Zeroable, SDValue V1, SDValue V2,
16422 const X86Subtarget &Subtarget,
16423 SelectionDAG &DAG) {
16424 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16425 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16426 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16427
16428 // Try to use shift instructions.
16429 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
16430 Zeroable, Subtarget, DAG))
16431 return Shift;
16432
16433 // Try to use byte rotation instructions.
16434 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16435 Subtarget, DAG))
16436 return Rotate;
16437
16438 // Use dedicated pack instructions for masks that match their pattern.
16439 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16440 Subtarget))
16441 return V;
16442
16443 // Try to use a zext lowering.
16444 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16445 Zeroable, Subtarget, DAG))
16446 return ZExt;
16447
16448 // Try to lower using a truncation.
16449 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16450 Subtarget, DAG))
16451 return V;
16452
16453 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16454 Subtarget, DAG))
16455 return V;
16456
16457 // See if we can use SSE4A Extraction / Insertion.
16458 if (Subtarget.hasSSE4A())
16459 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16460 Zeroable, DAG))
16461 return V;
16462
16463 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16464
16465 // For single-input shuffles, there are some nicer lowering tricks we can use.
16466 if (NumV2Elements == 0) {
16467 // Check for being able to broadcast a single element.
16468 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16469 Mask, Subtarget, DAG))
16470 return Broadcast;
16471
16472 // Try to use bit rotation instructions.
16473 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16474 Subtarget, DAG))
16475 return Rotate;
16476
16477 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16478 return V;
16479
16480 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16481 // Notably, this handles splat and partial-splat shuffles more efficiently.
16482 // However, it only makes sense if the pre-duplication shuffle simplifies
16483 // things significantly. Currently, this means we need to be able to
16484 // express the pre-duplication shuffle as an i16 shuffle.
16485 //
16486 // FIXME: We should check for other patterns which can be widened into an
16487 // i16 shuffle as well.
16488 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16489 for (int i = 0; i < 16; i += 2)
16490 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16491 return false;
16492
16493 return true;
16494 };
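// For example, <7,7,7,7,3,3,3,3,3,3,3,3,7,7,7,7> passes this check: each
// even/odd byte pair duplicates a single source byte, so the shuffle can be
// done at i16 granularity and the bytes duplicated afterwards with an UNPCK.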
16495 auto tryToWidenViaDuplication = [&]() -> SDValue {
16496 if (!canWidenViaDuplication(Mask))
16497 return SDValue();
16498 SmallVector<int, 4> LoInputs;
16499 copy_if(Mask, std::back_inserter(LoInputs),
16500 [](int M) { return M >= 0 && M < 8; });
16501 array_pod_sort(LoInputs.begin(), LoInputs.end());
16502 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16503 LoInputs.end());
16504 SmallVector<int, 4> HiInputs;
16505 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16506 array_pod_sort(HiInputs.begin(), HiInputs.end());
16507 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16508 HiInputs.end());
16509
16510 bool TargetLo = LoInputs.size() >= HiInputs.size();
16511 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16512 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16513
16514 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16515 SmallDenseMap<int, int, 8> LaneMap;
16516 for (int I : InPlaceInputs) {
16517 PreDupI16Shuffle[I/2] = I/2;
16518 LaneMap[I] = I;
16519 }
16520 int j = TargetLo ? 0 : 4, je = j + 4;
16521 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16522 // Check if j is already a shuffle of this input. This happens when
16523 // there are two adjacent bytes after we move the low one.
16524 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16525 // If we haven't yet mapped the input, search for a slot into which
16526 // we can map it.
16527 while (j < je && PreDupI16Shuffle[j] >= 0)
16528 ++j;
16529
16530 if (j == je)
16531 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16532 return SDValue();
16533
16534 // Map this input with the i16 shuffle.
16535 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16536 }
16537
16538 // Update the lane map based on the mapping we ended up with.
16539 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16540 }
16541 V1 = DAG.getBitcast(
16542 MVT::v16i8,
16543 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16544 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16545
16546 // Unpack the bytes to form the i16s that will be shuffled into place.
16547 bool EvenInUse = false, OddInUse = false;
16548 for (int i = 0; i < 16; i += 2) {
16549 EvenInUse |= (Mask[i + 0] >= 0);
16550 OddInUse |= (Mask[i + 1] >= 0);
16551 if (EvenInUse && OddInUse)
16552 break;
16553 }
16554 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16555 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16556 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16557
16558 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16559 for (int i = 0; i < 16; ++i)
16560 if (Mask[i] >= 0) {
16561 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16562 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16563 if (PostDupI16Shuffle[i / 2] < 0)
16564 PostDupI16Shuffle[i / 2] = MappedMask;
16565 else
16566 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16567 "Conflicting entries in the original shuffle!");
16568 }
16569 return DAG.getBitcast(
16570 MVT::v16i8,
16571 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16572 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16573 };
16574 if (SDValue V = tryToWidenViaDuplication())
16575 return V;
16576 }
16577
16578 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16579 Zeroable, Subtarget, DAG))
16580 return Masked;
16581
16582 // Use dedicated unpack instructions for masks that match their pattern.
16583 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16584 return V;
16585
16586 // Try to use byte shift instructions to mask.
16587 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16588 Zeroable, Subtarget, DAG))
16589 return V;
16590
16591 // Check for compaction patterns.
16592 bool IsSingleInput = V2.isUndef();
16593 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16594
16595 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16596 // with PSHUFB. It is important to do this before we attempt to generate any
16597 // blends but after all of the single-input lowerings. If the single input
16598 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16599 // want to preserve that and we can DAG combine any longer sequences into
16600 // a PSHUFB in the end. But once we start blending from multiple inputs,
16601 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16602 // and there are *very* few patterns that would actually be faster than the
16603 // PSHUFB approach because of its ability to zero lanes.
16604 //
16605 // If the mask is a binary compaction, we can more efficiently perform this
16606 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16607 //
16608 // FIXME: The only exceptions to the above are blends which are exact
16609 // interleavings with direct instructions supporting them. We currently don't
16610 // handle those well here.
16611 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16612 bool V1InUse = false;
16613 bool V2InUse = false;
16614
16615 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16616 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16617
16618 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16619 // do so. This avoids using them to handle blends-with-zero which is
16620 // important as a single pshufb is significantly faster for that.
16621 if (V1InUse && V2InUse) {
16622 if (Subtarget.hasSSE41())
16623 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16624 Zeroable, Subtarget, DAG))
16625 return Blend;
16626
16627 // We can use an unpack to do the blending rather than an or in some
16628 // cases. Even though the OR may be (very slightly) more efficient, we
16629 // prefer this lowering because there are common cases where part of
16630 // the complexity of the shuffles goes away when we do the final blend as
16631 // an unpack.
16632 // FIXME: It might be worth trying to detect if the unpack-feeding
16633 // shuffles will both be pshufb, in which case we shouldn't bother with
16634 // this.
16635 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16636 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16637 return Unpack;
16638
16639 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16640 if (Subtarget.hasVBMI())
16641 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16642 DAG);
16643
16644 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16645 if (Subtarget.hasXOP()) {
16646 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16647 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16648 }
16649
16650 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16651 // PALIGNR will be cheaper than the second PSHUFB+OR.
16652 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16653 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16654 return V;
16655 }
16656
16657 return PSHUFB;
16658 }
16659
16660 // There are special ways we can lower some single-element blends.
16661 if (NumV2Elements == 1)
16662 if (SDValue V = lowerShuffleAsElementInsertion(
16663 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16664 return V;
16665
16666 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16667 return Blend;
16668
16669 // Check whether a compaction lowering can be done. This handles shuffles
16670 // which take every Nth element for some even N. See the helper function for
16671 // details.
16672 //
16673 // We special case these as they can be particularly efficiently handled with
16674 // the PACKUSWB instruction on x86, and they show up in common patterns of
16675 // rearranging bytes to truncate wide elements.
16676 if (NumEvenDrops) {
16677 // NumEvenDrops is the power of two stride of the elements. Another way of
16678 // thinking about it is that we need to drop the even elements this many
16679 // times to get the original input.
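// For instance, NumEvenDrops == 2 corresponds to keeping every fourth byte
// (<0,4,8,12,...>): one AND clears the dropped bytes and two PACKUS steps
// squeeze the survivors together.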
16680
16681 // First we need to zero all the dropped bytes.
16682 assert(NumEvenDrops <= 3 &&
16683 "No support for dropping even elements more than 3 times.");
16684 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16685 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16686 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16687 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16688 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16689 WordClearMask);
16690 if (!IsSingleInput)
16691 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16692 WordClearMask);
16693
16694 // Now pack things back together.
16695 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16696 IsSingleInput ? V1 : V2);
16697 for (int i = 1; i < NumEvenDrops; ++i) {
16698 Result = DAG.getBitcast(MVT::v8i16, Result);
16699 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16700 }
16701 return Result;
16702 }
16703
16704 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16705 if (NumOddDrops == 1) {
16706 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16707 DAG.getBitcast(MVT::v8i16, V1),
16708 DAG.getTargetConstant(8, DL, MVT::i8));
16709 if (!IsSingleInput)
16710 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16711 DAG.getBitcast(MVT::v8i16, V2),
16712 DAG.getTargetConstant(8, DL, MVT::i8));
16713 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16714 IsSingleInput ? V1 : V2);
16715 }
16716
16717 // Handle multi-input cases by blending/unpacking single-input shuffles.
16718 if (NumV2Elements > 0)
16719 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16720 Subtarget, DAG);
16721
16722 // The fallback path for single-input shuffles widens this into two v8i16
16723 // vectors with unpacks, shuffles those, and then pulls them back together
16724 // with a pack.
16725 SDValue V = V1;
16726
16727 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16728 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16729 for (int i = 0; i < 16; ++i)
16730 if (Mask[i] >= 0)
16731 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16732
16733 SDValue VLoHalf, VHiHalf;
16734 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16735 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16736 // i16s.
16737 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16738 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16739 // Use a mask to drop the high bytes.
16740 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16741 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16742 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16743
16744 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16745 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16746
16747 // Squash the masks to point directly into VLoHalf.
16748 for (int &M : LoBlendMask)
16749 if (M >= 0)
16750 M /= 2;
16751 for (int &M : HiBlendMask)
16752 if (M >= 0)
16753 M /= 2;
16754 } else {
16755 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16756 // VHiHalf so that we can blend them as i16s.
16757 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16758
16759 VLoHalf = DAG.getBitcast(
16760 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16761 VHiHalf = DAG.getBitcast(
16762 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16763 }
16764
16765 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16766 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16767
16768 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16769}
16770
16771/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16772///
16773/// This routine breaks down the specific type of 128-bit shuffle and
16774/// dispatches to the lowering routines accordingly.
16775static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16776 MVT VT, SDValue V1, SDValue V2,
16777 const APInt &Zeroable,
16778 const X86Subtarget &Subtarget,
16779 SelectionDAG &DAG) {
16780 switch (VT.SimpleTy) {
16781 case MVT::v2i64:
16782 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16783 case MVT::v2f64:
16784 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16785 case MVT::v4i32:
16786 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16787 case MVT::v4f32:
16788 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16789 case MVT::v8i16:
16790 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16791 case MVT::v8f16:
16792 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16793 case MVT::v16i8:
16794 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16795
16796 default:
16797 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16797)
;
16798 }
16799}
16800
16801/// Generic routine to split vector shuffle into half-sized shuffles.
16802///
16803/// This routine just extracts two subvectors, shuffles them independently, and
16804/// then concatenates them back together. This should work effectively with all
16805/// AVX vector shuffle types.
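/// For example, a v16i16 shuffle is handled as two independent v8i16 half
/// shuffles (each possibly a pair of shuffles plus a blend) whose results are
/// concatenated back together with CONCAT_VECTORS.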
16806static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16807 SDValue V2, ArrayRef<int> Mask,
16808 SelectionDAG &DAG) {
16809 assert(VT.getSizeInBits() >= 256 &&
16810 "Only for 256-bit or wider vector shuffles!");
16811 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16812 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16813
16814 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16815 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16816
16817 int NumElements = VT.getVectorNumElements();
16818 int SplitNumElements = NumElements / 2;
16819 MVT ScalarVT = VT.getVectorElementType();
16820 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16821
16822 // Use splitVector/extractSubVector so that split build-vectors just build two
16823 // narrower build vectors. This helps shuffling with splats and zeros.
16824 auto SplitVector = [&](SDValue V) {
16825 SDValue LoV, HiV;
16826 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16827 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16828 DAG.getBitcast(SplitVT, HiV));
16829 };
16830
16831 SDValue LoV1, HiV1, LoV2, HiV2;
16832 std::tie(LoV1, HiV1) = SplitVector(V1);
16833 std::tie(LoV2, HiV2) = SplitVector(V2);
16834
16835 // Now create two 4-way blends of these half-width vectors.
16836 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16837 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16838 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16839 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16840 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16841 for (int i = 0; i < SplitNumElements; ++i) {
16842 int M = HalfMask[i];
16843 if (M >= NumElements) {
16844 if (M >= NumElements + SplitNumElements)
16845 UseHiV2 = true;
16846 else
16847 UseLoV2 = true;
16848 V2BlendMask[i] = M - NumElements;
16849 BlendMask[i] = SplitNumElements + i;
16850 } else if (M >= 0) {
16851 if (M >= SplitNumElements)
16852 UseHiV1 = true;
16853 else
16854 UseLoV1 = true;
16855 V1BlendMask[i] = M;
16856 BlendMask[i] = i;
16857 }
16858 }
16859
16860 // Because the lowering happens after all combining takes place, we need to
16861 // manually combine these blend masks as much as possible so that we create
16862 // a minimal number of high-level vector shuffle nodes.
16863
16864 // First try just blending the halves of V1 or V2.
16865 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16866 return DAG.getUNDEF(SplitVT);
16867 if (!UseLoV2 && !UseHiV2)
16868 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16869 if (!UseLoV1 && !UseHiV1)
16870 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16871
16872 SDValue V1Blend, V2Blend;
16873 if (UseLoV1 && UseHiV1) {
16874 V1Blend =
16875 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16876 } else {
16877 // We only use half of V1 so map the usage down into the final blend mask.
16878 V1Blend = UseLoV1 ? LoV1 : HiV1;
16879 for (int i = 0; i < SplitNumElements; ++i)
16880 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16881 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16882 }
16883 if (UseLoV2 && UseHiV2) {
16884 V2Blend =
16885 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16886 } else {
16887 // We only use half of V2 so map the usage down into the final blend mask.
16888 V2Blend = UseLoV2 ? LoV2 : HiV2;
16889 for (int i = 0; i < SplitNumElements; ++i)
16890 if (BlendMask[i] >= SplitNumElements)
16891 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16892 }
16893 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16894 };
16895 SDValue Lo = HalfBlend(LoMask);
16896 SDValue Hi = HalfBlend(HiMask);
16897 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16898}
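// Illustrative trace (not part of the original source): for a hypothetical
// v8f32 interleave, splitAndLowerShuffle above reduces the 256-bit shuffle to
// two 128-bit shuffles that are concatenated back together:
//   Mask   = {0, 8, 1, 9, 4, 12, 5, 13}   (NumElements = 8, SplitNumElements = 4)
//   LoMask = {0, 8, 1, 9}   -> uses only LoV1/LoV2 -> shuffle(LoV1, LoV2, {0, 4, 1, 5})
//   HiMask = {4, 12, 5, 13} -> uses only HiV1/HiV2 -> shuffle(HiV1, HiV2, {0, 4, 1, 5})
//   Result = concat_vectors(unpcklps(LoV1, LoV2), unpcklps(HiV1, HiV2))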
16899
16900/// Either split a vector in halves or decompose the shuffles and the
16901/// blend/unpack.
16902///
16903/// This is provided as a good fallback for many lowerings of non-single-input
16904/// shuffles with more than one 128-bit lane. In those cases, we want to select
16905/// between splitting the shuffle into 128-bit components and stitching those
16906/// back together vs. extracting the single-input shuffles and blending those
16907/// results.
16908static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16909 SDValue V2, ArrayRef<int> Mask,
16910 const X86Subtarget &Subtarget,
16911 SelectionDAG &DAG) {
16912 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16913 "shuffles as it could then recurse on itself.");
16914 int Size = Mask.size();
16915
16916 // If this can be modeled as a broadcast of two elements followed by a blend,
16917 // prefer that lowering. This is especially important because broadcasts can
16918 // often fold with memory operands.
16919 auto DoBothBroadcast = [&] {
16920 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16921 for (int M : Mask)
16922 if (M >= Size) {
16923 if (V2BroadcastIdx < 0)
16924 V2BroadcastIdx = M - Size;
16925 else if (M - Size != V2BroadcastIdx)
16926 return false;
16927 } else if (M >= 0) {
16928 if (V1BroadcastIdx < 0)
16929 V1BroadcastIdx = M;
16930 else if (M != V1BroadcastIdx)
16931 return false;
16932 }
16933 return true;
16934 };
16935 if (DoBothBroadcast())
16936 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16937 DAG);
16938
16939 // If the inputs all stem from a single 128-bit lane of each input, then we
16940 // split them rather than blending because the split will decompose to
16941 // unusually few instructions.
16942 int LaneCount = VT.getSizeInBits() / 128;
16943 int LaneSize = Size / LaneCount;
16944 SmallBitVector LaneInputs[2];
16945 LaneInputs[0].resize(LaneCount, false);
16946 LaneInputs[1].resize(LaneCount, false);
16947 for (int i = 0; i < Size; ++i)
16948 if (Mask[i] >= 0)
16949 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16950 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16951 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16952
16953 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16954 // requires that the decomposed single-input shuffles don't end up here.
16955 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16956 DAG);
16957}
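// Illustrative examples (not part of the original source) of the two
// heuristics in lowerShuffleAsSplitOrBlend above, assuming v8f32 inputs:
//  * DoBothBroadcast: Mask = {3, 11, 3, 11, 11, 3, 3, 11} reads only element 3
//    of V1 and element 3 of V2, so it decomposes into two splats plus a blend.
//  * Single-lane split: Mask = {0, 1, 9, 8, 2, 3, 10, 11} only touches lane 0
//    of V1 and lane 0 of V2 (both LaneInputs counts are 1), so the shuffle is
//    handed to splitAndLowerShuffle instead.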
16958
16959// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16960// TODO: Extend to support v8f32 (+ 512-bit shuffles).
16961static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16962 SDValue V1, SDValue V2,
16963 ArrayRef<int> Mask,
16964 SelectionDAG &DAG) {
16965 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16966
16967 int LHSMask[4] = {-1, -1, -1, -1};
16968 int RHSMask[4] = {-1, -1, -1, -1};
16969 unsigned SHUFPMask = 0;
16970
16971 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16972 // perform the shuffle once the lanes have been shuffled in place.
16973 for (int i = 0; i != 4; ++i) {
16974 int M = Mask[i];
16975 if (M < 0)
16976 continue;
16977 int LaneBase = i & ~1;
16978 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16979 LaneMask[LaneBase + (M & 1)] = M;
16980 SHUFPMask |= (M & 1) << i;
16981 }
16982
16983 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16984 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16985 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16986 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16987}
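// Illustrative trace (not part of the original source): for a hypothetical
// v4f64 Mask = {3, 7, 0, 4} the loop above produces
//   LHSMask   = {-1, 3, 0, -1}   (lane-swapped V1, feeds the even result elements)
//   RHSMask   = {-1, 7, 4, -1}   (lane-swapped V2, feeds the odd result elements)
//   SHUFPMask = 0b0011
// giving SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2), 0x3), matching the
// pattern described in the comment above.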
16988
16989/// Lower a vector shuffle crossing multiple 128-bit lanes as
16990/// a lane permutation followed by a per-lane permutation.
16991///
16992/// This is mainly for cases where we can have non-repeating permutes
16993/// in each lane.
16994///
16995/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16996/// we should investigate merging them.
16997static SDValue lowerShuffleAsLanePermuteAndPermute(
16998 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16999 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17000 int NumElts = VT.getVectorNumElements();
17001 int NumLanes = VT.getSizeInBits() / 128;
17002 int NumEltsPerLane = NumElts / NumLanes;
17003 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17004
17005 /// Attempts to find a sublane permute with the given size
17006 /// that gets all elements into their target lanes.
17007 ///
17008 /// If successful, builds a cross-lane permute plus an in-lane permute and
17009 /// returns the shuffled result; if unsuccessful, returns an empty SDValue.
17010 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17011 int NumSublanesPerLane = NumSublanes / NumLanes;
17012 int NumEltsPerSublane = NumElts / NumSublanes;
17013
17014 SmallVector<int, 16> CrossLaneMask;
17015 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17016 // CrossLaneMask but one entry == one sublane.
17017 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17018
17019 for (int i = 0; i != NumElts; ++i) {
17020 int M = Mask[i];
17021 if (M < 0)
17022 continue;
17023
17024 int SrcSublane = M / NumEltsPerSublane;
17025 int DstLane = i / NumEltsPerLane;
17026
17027 // We only need to get the elements into the right lane, not sublane.
17028 // So search all sublanes that make up the destination lane.
17029 bool Found = false;
17030 int DstSubStart = DstLane * NumSublanesPerLane;
17031 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17032 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17033 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17034 continue;
17035
17036 Found = true;
17037 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17038 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17039 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17040 break;
17041 }
17042 if (!Found)
17043 return SDValue();
17044 }
17045
17046 // Fill CrossLaneMask using CrossLaneMaskLarge.
17047 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17048
17049 if (!CanUseSublanes) {
17050 // If we're only shuffling a single lowest lane and the rest are identity
17051 // then don't bother.
17052 // TODO - isShuffleMaskInputInPlace could be extended to something like
17053 // this.
17054 int NumIdentityLanes = 0;
17055 bool OnlyShuffleLowestLane = true;
17056 for (int i = 0; i != NumLanes; ++i) {
17057 int LaneOffset = i * NumEltsPerLane;
17058 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17059 i * NumEltsPerLane))
17060 NumIdentityLanes++;
17061 else if (CrossLaneMask[LaneOffset] != 0)
17062 OnlyShuffleLowestLane = false;
17063 }
17064 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17065 return SDValue();
17066 }
17067
17068 // Avoid returning the same shuffle operation. For example,
17069 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17070 // undef:v16i16
17071 if (CrossLaneMask == Mask || InLaneMask == Mask)
17072 return SDValue();
17073
17074 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17075 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17076 InLaneMask);
17077 };
17078
17079 // First attempt a solution with full lanes.
17080 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17081 return V;
17082
17083 // The rest of the solutions use sublanes.
17084 if (!CanUseSublanes)
17085 return SDValue();
17086
17087 // Then attempt a solution with 64-bit sublanes (vpermq).
17088 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17089 return V;
17090
17091 // If that doesn't work and we have fast variable cross-lane shuffle,
17092 // attempt 32-bit sublanes (vpermd).
17093 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17094 return SDValue();
17095
17096 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17097}
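// Illustrative trace (not part of the original source): a hypothetical v8f32
// reverse, Mask = {7, 6, 5, 4, 3, 2, 1, 0}, is solved with full lanes
// (NumSublanes == NumLanes == 2):
//   CrossLaneMask = {4, 5, 6, 7, 0, 1, 2, 3}   (swap the 128-bit lanes)
//   InLaneMask    = {3, 2, 1, 0, 7, 6, 5, 4}   (reverse within each lane)
// i.e. a lane permute (e.g. vperm2f128) followed by an in-lane permute
// (e.g. vpermilps).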
17098
17099/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17100/// source with a lane permutation.
17101///
17102/// This lowering strategy results in four instructions in the worst case for a
17103/// single-input cross lane shuffle which is lower than any other fully general
17104/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17105/// shuffle pattern should be handled prior to trying this lowering.
17106static SDValue lowerShuffleAsLanePermuteAndShuffle(
17107 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17108 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17109 // FIXME: This should probably be generalized for 512-bit vectors as well.
17110 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17111 int Size = Mask.size();
17112 int LaneSize = Size / 2;
17113
17114 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17115 // Only do this if the elements aren't all from the lower lane,
17116 // otherwise we're (probably) better off doing a split.
17117 if (VT == MVT::v4f64 &&
17118 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17119 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17120
17121 // If there are only inputs from one 128-bit lane, splitting will in fact be
17122 // less expensive. The flags track whether the given lane contains an element
17123 // that crosses to another lane.
17124 bool AllLanes;
17125 if (!Subtarget.hasAVX2()) {
17126 bool LaneCrossing[2] = {false, false};
17127 for (int i = 0; i < Size; ++i)
17128 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17129 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17130 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17131 } else {
17132 bool LaneUsed[2] = {false, false};
17133 for (int i = 0; i < Size; ++i)
17134 if (Mask[i] >= 0)
17135 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17136 AllLanes = LaneUsed[0] && LaneUsed[1];
17137 }
17138
17139 // TODO - we could support shuffling V2 in the Flipped input.
17140 assert(V2.isUndef() &&
17141 "This last part of this routine only works on single input shuffles");
17142
17143 SmallVector<int, 32> InLaneMask(Mask);
17144 for (int i = 0; i < Size; ++i) {
17145 int &M = InLaneMask[i];
17146 if (M < 0)
17147 continue;
17148 if (((M % Size) / LaneSize) != (i / LaneSize))
17149 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17150 }
17151 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17152 "In-lane shuffle mask expected");
17153
17154 // If we're not using elements from both 128-bit lanes and the in-lane mask
17155 // is not repeating, then we're better off splitting.
17156 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17157 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17158
17159 // Flip the lanes, and shuffle the results which should now be in-lane.
17160 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17161 SDValue Flipped = DAG.getBitcast(PVT, V1);
17162 Flipped =
17163 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17164 Flipped = DAG.getBitcast(VT, Flipped);
17165 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17166}
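// Illustrative trace (not part of the original source): for a hypothetical
// single-input v8f32 shuffle Mask = {5, 4, 7, 6, 1, 0, 3, 2}, every element
// crosses lanes, so AllLanes is true. The rewritten mask becomes
//   InLaneMask = {9, 8, 11, 10, 13, 12, 15, 14}
// which only reads the 'Flipped' operand (V1 with its 128-bit lanes swapped),
// so the final shuffle is in-lane and can be matched as a single in-lane
// permute such as vpermilps.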
17167
17168/// Handle lowering 2-lane 128-bit shuffles.
17169static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17170 SDValue V2, ArrayRef<int> Mask,
17171 const APInt &Zeroable,
17172 const X86Subtarget &Subtarget,
17173 SelectionDAG &DAG) {
17174 if (V2.isUndef()) {
17175 // Attempt to match VBROADCAST*128 subvector broadcast load.
17176 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17177 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17178 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17179 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17180 MVT MemVT = VT.getHalfNumVectorElementsVT();
17181 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17182 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17183 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17184 VT, MemVT, Ld, Ofs, DAG))
17185 return BcstLd;
17186 }
17187
17188 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17189 if (Subtarget.hasAVX2())
17190 return SDValue();
17191 }
17192
17193 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17194
17195 SmallVector<int, 4> WidenedMask;
17196 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17197 return SDValue();
17198
17199 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17200 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17201
17202 // Try to use an insert into a zero vector.
17203 if (WidenedMask[0] == 0 && IsHighZero) {
17204 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17205 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17206 DAG.getIntPtrConstant(0, DL));
17207 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17208 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17209 DAG.getIntPtrConstant(0, DL));
17210 }
17211
17212 // TODO: If minimizing size and one of the inputs is a zero vector and the
17213 // zero vector has only one use, we could use a VPERM2X128 to save the
17214 // instruction bytes needed to explicitly generate the zero vector.
17215
17216 // Blends are faster and handle all the non-lane-crossing cases.
17217 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17218 Subtarget, DAG))
17219 return Blend;
17220
17221 // If either input operand is a zero vector, use VPERM2X128 because its mask
17222 // allows us to replace the zero input with an implicit zero.
17223 if (!IsLowZero && !IsHighZero) {
17224 // Check for patterns which can be matched with a single insert of a 128-bit
17225 // subvector.
17226 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17227 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17228
17229 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17230 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17231 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17232 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17233 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17234 OnlyUsesV1 ? V1 : V2,
17235 DAG.getIntPtrConstant(0, DL));
17236 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17237 DAG.getIntPtrConstant(2, DL));
17238 }
17239 }
17240
17241 // Try to use SHUF128 if possible.
17242 if (Subtarget.hasVLX()) {
17243 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17244 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17245 ((WidenedMask[1] % 2) << 1);
17246 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17247 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17248 }
17249 }
17250 }
17251
17252 // Otherwise form a 128-bit permutation. After accounting for undefs,
17253 // convert the 64-bit shuffle mask selection values into 128-bit
17254 // selection bits by dividing the indexes by 2 and shifting into positions
17255 // defined by a vperm2*128 instruction's immediate control byte.
17256
17257 // The immediate permute control byte looks like this:
17258 // [1:0] - select 128 bits from sources for low half of destination
17259 // [2] - ignore
17260 // [3] - zero low half of destination
17261 // [5:4] - select 128 bits from sources for high half of destination
17262 // [6] - ignore
17263 // [7] - zero high half of destination
17264
17265 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17266 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17267
17268 unsigned PermMask = 0;
17269 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17270 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17271
17272 // Check the immediate mask and replace unused sources with undef.
17273 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17274 V1 = DAG.getUNDEF(VT);
17275 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17276 V2 = DAG.getUNDEF(VT);
17277
17278 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17279 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17280}
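// Illustrative trace (not part of the original source): a hypothetical v4f64
// Mask = {2, 3, 4, 5} with nothing zeroable widens to WidenedMask = {1, 2}
// (upper half of V1, lower half of V2). Assuming no earlier blend/insert/VLX
// path matched, the immediate becomes
//   PermMask = (1 << 0) | (2 << 4) = 0x21
// i.e. vperm2f128/vperm2i128 with control byte 0x21.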
17281
17282/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17283/// shuffling each lane.
17284///
17285/// This attempts to create a repeated lane shuffle where each lane uses one
17286/// or two of the lanes of the inputs. The lanes of the input vectors are
17287/// shuffled in one or two independent shuffles to get the lanes into the
17288/// position needed by the final shuffle.
17289static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17290 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17291 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17292 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12
'?' condition is true
17293
17294 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
13
Assuming the condition is false
14
Taking false branch
17295 return SDValue();
17296
17297 int NumElts = Mask.size();
17298 int NumLanes = VT.getSizeInBits() / 128;
17299 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15
'NumLaneElts' initialized here
17300 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17301 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17302
17303 // First pass will try to fill in the RepeatMask from lanes that need two
17304 // sources.
17305 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16
Assuming 'Lane' is not equal to 'NumLanes'
17
Loop condition is true. Entering loop body
22
Assuming 'Lane' is equal to 'NumLanes'
23
Loop condition is false. Execution continues on line 17377
17306 int Srcs[2] = {-1, -1};
17307 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17308 for (int i = 0; i != NumLaneElts; ++i) {
18
Assuming 'i' is equal to 'NumLaneElts'
19
Loop condition is false. Execution continues on line 17330
17309 int M = Mask[(Lane * NumLaneElts) + i];
17310 if (M < 0)
17311 continue;
17312 // Determine which of the possible input lanes (NumLanes from each source)
17313 // this element comes from. Assign that as one of the sources for this
17314 // lane. We can assign up to 2 sources for this lane. If we run out of
17315 // sources we can't do anything.
17316 int LaneSrc = M / NumLaneElts;
17317 int Src;
17318 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17319 Src = 0;
17320 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17321 Src = 1;
17322 else
17323 return SDValue();
17324
17325 Srcs[Src] = LaneSrc;
17326 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17327 }
17328
17329 // If this lane has two sources, see if it fits with the repeat mask so far.
17330 if (Srcs[1] < 0)
20
Taking true branch
17331 continue;
21
Execution continues on line 17305
17332
17333 LaneSrcs[Lane][0] = Srcs[0];
17334 LaneSrcs[Lane][1] = Srcs[1];
17335
17336 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17337 assert(M1.size() == M2.size() && "Unexpected mask size");
17338 for (int i = 0, e = M1.size(); i != e; ++i)
17339 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17340 return false;
17341 return true;
17342 };
17343
17344 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17345 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17346 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17347 int M = Mask[i];
17348 if (M < 0)
17349 continue;
17350 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17351 "Unexpected mask element");
17352 MergedMask[i] = M;
17353 }
17354 };
17355
17356 if (MatchMasks(InLaneMask, RepeatMask)) {
17357 // Merge this lane mask into the final repeat mask.
17358 MergeMasks(InLaneMask, RepeatMask);
17359 continue;
17360 }
17361
17362 // Didn't find a match. Swap the operands and try again.
17363 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17364 ShuffleVectorSDNode::commuteMask(InLaneMask);
17365
17366 if (MatchMasks(InLaneMask, RepeatMask)) {
17367 // Merge this lane mask into the final repeat mask.
17368 MergeMasks(InLaneMask, RepeatMask);
17369 continue;
17370 }
17371
17372 // Couldn't find a match with the operands in either order.
17373 return SDValue();
17374 }
17375
17376 // Now handle any lanes with only one source.
17377 for (int Lane = 0; Lane != NumLanes; ++Lane) {
24
Loop condition is true. Entering loop body
28
Loop condition is false. Execution continues on line 17406
17378 // If this lane has already been processed, skip it.
17379 if (LaneSrcs[Lane][0] >= 0)
25
Assuming the condition is true
26
Taking true branch
17380 continue;
27
Execution continues on line 17377
17381
17382 for (int i = 0; i != NumLaneElts; ++i) {
17383 int M = Mask[(Lane * NumLaneElts) + i];
17384 if (M < 0)
17385 continue;
17386
17387 // If RepeatMask isn't defined yet we can define it ourselves.
17388 if (RepeatMask[i] < 0)
17389 RepeatMask[i] = M % NumLaneElts;
17390
17391 if (RepeatMask[i] < NumElts) {
17392 if (RepeatMask[i] != M % NumLaneElts)
17393 return SDValue();
17394 LaneSrcs[Lane][0] = M / NumLaneElts;
17395 } else {
17396 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17397 return SDValue();
17398 LaneSrcs[Lane][1] = M / NumLaneElts;
17399 }
17400 }
17401
17402 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17403 return SDValue();
17404 }
17405
17406 SmallVector<int, 16> NewMask(NumElts, -1);
17407 for (int Lane = 0; Lane != NumLanes; ++Lane) {
29
Loop condition is true. Entering loop body
31
Loop condition is false. Execution continues on line 17416
17408 int Src = LaneSrcs[Lane][0];
17409 for (int i = 0; i != NumLaneElts; ++i) {
30
Loop condition is false. Execution continues on line 17407
17410 int M = -1;
17411 if (Src >= 0)
17412 M = Src * NumLaneElts + i;
17413 NewMask[Lane * NumLaneElts + i] = M;
17414 }
17415 }
17416 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17417 // Ensure we didn't get back the shuffle we started with.
17418 // FIXME: This is a hack to make up for some splat handling code in
17419 // getVectorShuffle.
17420 if (isa<ShuffleVectorSDNode>(NewV1) &&
32
Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
17421 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17422 return SDValue();
17423
17424 for (int Lane = 0; Lane != NumLanes; ++Lane) {
33
Loop condition is true. Entering loop body
35
Loop condition is false. Execution continues on line 17433
17425 int Src = LaneSrcs[Lane][1];
17426 for (int i = 0; i != NumLaneElts; ++i) {
34
Loop condition is false. Execution continues on line 17424
17427 int M = -1;
17428 if (Src >= 0)
17429 M = Src * NumLaneElts + i;
17430 NewMask[Lane * NumLaneElts + i] = M;
17431 }
17432 }
17433 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17434 // Ensure we didn't get back the shuffle we started with.
17435 // FIXME: This is a hack to make up for some splat handling code in
17436 // getVectorShuffle.
17437 if (isa<ShuffleVectorSDNode>(NewV2) &&
36
Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
17438 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17439 return SDValue();
17440
17441 for (int i = 0; i != NumElts; ++i) {
37
Assuming 'i' is not equal to 'NumElts'
38
Loop condition is true. Entering loop body
17442 NewMask[i] = RepeatMask[i % NumLaneElts];
39
Division by zero
17443 if (NewMask[i] < 0)
17444 continue;
17445
17446 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17447 }
17448 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17449}
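// Illustrative note (not part of the original source): the final loop above
// tiles the per-lane RepeatMask across the whole vector. For v8f32
// (NumLaneElts == 4), a RepeatMask of {0, 8, 1, 9} expands to
//   NewMask = {0, 8, 1, 9, 4, 12, 5, 13}
// selecting from NewV1 (indices 0..7) and NewV2 (indices 8..15). The
// 'Division by zero' this report flags at line 17442 is the modulo
// 'i % NumLaneElts' on a path where the analyzer assumes NumLaneElts
// (computed at line 17299 as 128 / VT.getScalarSizeInBits()) to be zero.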
17450
17451/// If the input shuffle mask results in a vector that is undefined in all upper
17452/// or lower half elements and that mask accesses only 2 halves of the
17453/// shuffle's operands, return true. A mask of half the width with mask indexes
17454/// adjusted to access the extracted halves of the original shuffle operands is
17455 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half of each
17456 /// input operand is accessed (0/1 = lower/upper V1, 2/3 = lower/upper V2).
17457static bool
17458getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17459 int &HalfIdx1, int &HalfIdx2) {
17460 assert((Mask.size() == HalfMask.size() * 2) &&
17461 "Expected input mask to be twice as long as output");
17462
17463 // Exactly one half of the result must be undef to allow narrowing.
17464 bool UndefLower = isUndefLowerHalf(Mask);
17465 bool UndefUpper = isUndefUpperHalf(Mask);
17466 if (UndefLower == UndefUpper)
17467 return false;
17468
17469 unsigned HalfNumElts = HalfMask.size();
17470 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17471 HalfIdx1 = -1;
17472 HalfIdx2 = -1;
17473 for (unsigned i = 0; i != HalfNumElts; ++i) {
17474 int M = Mask[i + MaskIndexOffset];
17475 if (M < 0) {
17476 HalfMask[i] = M;
17477 continue;
17478 }
17479
17480 // Determine which of the 4 half vectors this element is from.
17481 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17482 int HalfIdx = M / HalfNumElts;
17483
17484 // Determine the element index into its half vector source.
17485 int HalfElt = M % HalfNumElts;
17486
17487 // We can shuffle with up to 2 half vectors, set the new 'half'
17488 // shuffle mask accordingly.
17489 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17490 HalfMask[i] = HalfElt;
17491 HalfIdx1 = HalfIdx;
17492 continue;
17493 }
17494 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17495 HalfMask[i] = HalfElt + HalfNumElts;
17496 HalfIdx2 = HalfIdx;
17497 continue;
17498 }
17499
17500 // Too many half vectors referenced.
17501 return false;
17502 }
17503
17504 return true;
17505}
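// Illustrative trace (not part of the original source): for a hypothetical
// v8f32 Mask = {u, u, u, u, 2, 3, 10, 11} the lower half is undef, and
// getHalfShuffleMask above produces
//   HalfMask = {2, 3, 6, 7}, HalfIdx1 = 0 (lower V1), HalfIdx2 = 2 (lower V2)
// so the result can be built from two 128-bit extracts, one narrow shuffle and
// one insert (see getShuffleHalfVectors below).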
17506
17507/// Given the output values from getHalfShuffleMask(), create a half width
17508/// shuffle of extracted vectors followed by an insert back to full width.
17509static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17510 ArrayRef<int> HalfMask, int HalfIdx1,
17511 int HalfIdx2, bool UndefLower,
17512 SelectionDAG &DAG, bool UseConcat = false) {
17513 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17514 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17515
17516 MVT VT = V1.getSimpleValueType();
17517 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17518 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17519
17520 auto getHalfVector = [&](int HalfIdx) {
17521 if (HalfIdx < 0)
17522 return DAG.getUNDEF(HalfVT);
17523 SDValue V = (HalfIdx < 2 ? V1 : V2);
17524 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17525 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17526 DAG.getIntPtrConstant(HalfIdx, DL));
17527 };
17528
17529 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17530 SDValue Half1 = getHalfVector(HalfIdx1);
17531 SDValue Half2 = getHalfVector(HalfIdx2);
17532 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17533 if (UseConcat) {
17534 SDValue Op0 = V;
17535 SDValue Op1 = DAG.getUNDEF(HalfVT);
17536 if (UndefLower)
17537 std::swap(Op0, Op1);
17538 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17539 }
17540
17541 unsigned Offset = UndefLower ? HalfNumElts : 0;
17542 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17543 DAG.getIntPtrConstant(Offset, DL));
17544}
17545
17546/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17547/// This allows for fast cases such as subvector extraction/insertion
17548/// or shuffling smaller vector types which can lower more efficiently.
17549static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17550 SDValue V2, ArrayRef<int> Mask,
17551 const X86Subtarget &Subtarget,
17552 SelectionDAG &DAG) {
17553 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17554 "Expected 256-bit or 512-bit vector");
17555
17556 bool UndefLower = isUndefLowerHalf(Mask);
17557 if (!UndefLower && !isUndefUpperHalf(Mask))
17558 return SDValue();
17559
17560 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17561 "Completely undef shuffle mask should have been simplified already");
17562
17563 // Upper half is undef and lower half is whole upper subvector.
17564 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17565 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17566 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17567 if (!UndefLower &&
17568 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17569 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17570 DAG.getIntPtrConstant(HalfNumElts, DL));
17571 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17572 DAG.getIntPtrConstant(0, DL));
17573 }
17574
17575 // Lower half is undef and upper half is whole lower subvector.
17576 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17577 if (UndefLower &&
17578 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17579 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17580 DAG.getIntPtrConstant(0, DL));
17581 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17582 DAG.getIntPtrConstant(HalfNumElts, DL));
17583 }
17584
17585 int HalfIdx1, HalfIdx2;
17586 SmallVector<int, 8> HalfMask(HalfNumElts);
17587 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17588 return SDValue();
17589
17590 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17591
17592 // Only shuffle the halves of the inputs when useful.
17593 unsigned NumLowerHalves =
17594 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17595 unsigned NumUpperHalves =
17596 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17597 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17598
17599 // Determine the larger pattern of undef/halves, then decide if it's worth
17600 // splitting the shuffle based on subtarget capabilities and types.
17601 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17602 if (!UndefLower) {
17603 // XXXXuuuu: no insert is needed.
17604 // Always extract lowers when setting lower - these are all free subreg ops.
17605 if (NumUpperHalves == 0)
17606 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17607 UndefLower, DAG);
17608
17609 if (NumUpperHalves == 1) {
17610 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17611 if (Subtarget.hasAVX2()) {
17612 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17613 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17614 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17615 (!isSingleSHUFPSMask(HalfMask) ||
17616 Subtarget.hasFastVariableCrossLaneShuffle()))
17617 return SDValue();
17618 // If this is a unary shuffle (assume that the 2nd operand is
17619 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17620 // are better off extracting the upper half of 1 operand and using a
17621 // narrow shuffle.
17622 if (EltWidth == 64 && V2.isUndef())
17623 return SDValue();
17624 }
17625 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17626 if (Subtarget.hasAVX512() && VT.is512BitVector())
17627 return SDValue();
17628 // Extract + narrow shuffle is better than the wide alternative.
17629 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17630 UndefLower, DAG);
17631 }
17632
17633 // Don't extract both uppers, instead shuffle and then extract.
17634 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17635 return SDValue();
17636 }
17637
17638 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17639 if (NumUpperHalves == 0) {
17640 // AVX2 has efficient 64-bit element cross-lane shuffles.
17641 // TODO: Refine to account for unary shuffle, splat, and other masks?
17642 if (Subtarget.hasAVX2() && EltWidth == 64)
17643 return SDValue();
17644 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17645 if (Subtarget.hasAVX512() && VT.is512BitVector())
17646 return SDValue();
17647 // Narrow shuffle + insert is better than the wide alternative.
17648 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17649 UndefLower, DAG);
17650 }
17651
17652 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17653 return SDValue();
17654}
17655
17656/// Handle case where shuffle sources are coming from the same 128-bit lane and
17657/// every lane can be represented as the same repeating mask - allowing us to
17658/// shuffle the sources with the repeating shuffle and then permute the result
17659/// to the destination lanes.
17660static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17661 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17662 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17663 int NumElts = VT.getVectorNumElements();
17664 int NumLanes = VT.getSizeInBits() / 128;
17665 int NumLaneElts = NumElts / NumLanes;
17666
17667 // On AVX2 we may be able to just shuffle the lowest elements and then
17668 // broadcast the result.
17669 if (Subtarget.hasAVX2()) {
17670 for (unsigned BroadcastSize : {16, 32, 64}) {
17671 if (BroadcastSize <= VT.getScalarSizeInBits())
17672 continue;
17673 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17674
17675 // Attempt to match a repeating pattern every NumBroadcastElts,
17676 // accounting for UNDEFs but only referencing the lowest 128-bit
17677 // lane of the inputs.
17678 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17679 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17680 for (int j = 0; j != NumBroadcastElts; ++j) {
17681 int M = Mask[i + j];
17682 if (M < 0)
17683 continue;
17684 int &R = RepeatMask[j];
17685 if (0 != ((M % NumElts) / NumLaneElts))
17686 return false;
17687 if (0 <= R && R != M)
17688 return false;
17689 R = M;
17690 }
17691 return true;
17692 };
17693
17694 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17695 if (!FindRepeatingBroadcastMask(RepeatMask))
17696 continue;
17697
17698 // Shuffle the (lowest) repeated elements in place for broadcast.
17699 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17700
17701 // Shuffle the actual broadcast.
17702 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17703 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17704 for (int j = 0; j != NumBroadcastElts; ++j)
17705 BroadcastMask[i + j] = j;
17706 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17707 BroadcastMask);
17708 }
17709 }
17710
17711 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17712 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17713 return SDValue();
17714
17715 // Bail if we already have a repeated lane shuffle mask.
17716 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17717 return SDValue();
17718
17719 // Helper to look for a repeated mask in each split sublane, checking that
17720 // those sublanes can then be permuted into place.
17721 auto ShuffleSubLanes = [&](int SubLaneScale) {
17722 int NumSubLanes = NumLanes * SubLaneScale;
17723 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17724
17725 // Check that all the sources are coming from the same lane and see if we
17726 // can form a repeating shuffle mask (local to each sub-lane). At the same
17727 // time, determine the source sub-lane for each destination sub-lane.
17728 int TopSrcSubLane = -1;
17729 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17730 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17731 SubLaneScale,
17732 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17733
17734 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17735 // Extract the sub-lane mask, check that it all comes from the same lane
17736 // and normalize the mask entries to come from the first lane.
17737 int SrcLane = -1;
17738 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17739 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17740 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17741 if (M < 0)
17742 continue;
17743 int Lane = (M % NumElts) / NumLaneElts;
17744 if ((0 <= SrcLane) && (SrcLane != Lane))
17745 return SDValue();
17746 SrcLane = Lane;
17747 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17748 SubLaneMask[Elt] = LocalM;
17749 }
17750
17751 // Whole sub-lane is UNDEF.
17752 if (SrcLane < 0)
17753 continue;
17754
17755 // Attempt to match against the candidate repeated sub-lane masks.
17756 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17757 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17758 for (int i = 0; i != NumSubLaneElts; ++i) {
17759 if (M1[i] < 0 || M2[i] < 0)
17760 continue;
17761 if (M1[i] != M2[i])
17762 return false;
17763 }
17764 return true;
17765 };
17766
17767 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17768 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17769 continue;
17770
17771 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17772 for (int i = 0; i != NumSubLaneElts; ++i) {
17773 int M = SubLaneMask[i];
17774 if (M < 0)
17775 continue;
17776 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17777 "Unexpected mask element");
17778 RepeatedSubLaneMask[i] = M;
17779 }
17780
17781 // Track the top most source sub-lane - by setting the remaining to
17782 // UNDEF we can greatly simplify shuffle matching.
17783 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17784 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17785 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17786 break;
17787 }
17788
17789 // Bail if we failed to find a matching repeated sub-lane mask.
17790 if (Dst2SrcSubLanes[DstSubLane] < 0)
17791 return SDValue();
17792 }
17793 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17794 "Unexpected source lane");
17795
17796 // Create a repeating shuffle mask for the entire vector.
17797 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17798 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17799 int Lane = SubLane / SubLaneScale;
17800 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17801 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17802 int M = RepeatedSubLaneMask[Elt];
17803 if (M < 0)
17804 continue;
17805 int Idx = (SubLane * NumSubLaneElts) + Elt;
17806 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17807 }
17808 }
17809
17810 // Shuffle each source sub-lane to its destination.
17811 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17812 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17813 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17814 if (SrcSubLane < 0)
17815 continue;
17816 for (int j = 0; j != NumSubLaneElts; ++j)
17817 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17818 }
17819
17820 // Avoid returning the same shuffle operation.
17821 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17822 if (RepeatedMask == Mask || SubLaneMask == Mask)
17823 return SDValue();
17824
17825 SDValue RepeatedShuffle =
17826 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17827
17828 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17829 SubLaneMask);
17830 };
17831
17832 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17833 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
17834 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
17835 // Otherwise we can only permute whole 128-bit lanes.
17836 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
17837 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
17838 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
17839 MinSubLaneScale = 2;
17840 MaxSubLaneScale =
17841 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
17842 }
17843 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17844 MinSubLaneScale = MaxSubLaneScale = 4;
17845
17846 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
17847 if (SDValue Shuffle = ShuffleSubLanes(Scale))
17848 return Shuffle;
17849
17850 return SDValue();
17851}
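// Illustrative trace (not part of the original source) of the AVX2 broadcast
// path above: a hypothetical v8i32 Mask = {1, 0, 1, 0, 1, 0, 1, 0} repeats a
// 64-bit pattern that only reads the lowest 128-bit lane, so with
// BroadcastSize = 64:
//   RepeatMask    = {1, 0, u, u, u, u, u, u}   (shuffle the low elements in place)
//   BroadcastMask = {0, 1, 0, 1, 0, 1, 0, 1}   (then broadcast the 64-bit pair,
//                                               e.g. vpbroadcastq)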
17852
17853static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17854 bool &ForceV1Zero, bool &ForceV2Zero,
17855 unsigned &ShuffleImm, ArrayRef<int> Mask,
17856 const APInt &Zeroable) {
17857 int NumElts = VT.getVectorNumElements();
17858 assert(VT.getScalarSizeInBits() == 64 &&
17859 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17860 "Unexpected data type for VSHUFPD");
17861 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17862 "Illegal shuffle mask");
17863
17864 bool ZeroLane[2] = { true, true };
17865 for (int i = 0; i < NumElts; ++i)
17866 ZeroLane[i & 1] &= Zeroable[i];
17867
17868 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
17869 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
17870 ShuffleImm = 0;
17871 bool ShufpdMask = true;
17872 bool CommutableMask = true;
17873 for (int i = 0; i < NumElts; ++i) {
17874 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17875 continue;
17876 if (Mask[i] < 0)
17877 return false;
17878 int Val = (i & 6) + NumElts * (i & 1);
17879 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17880 if (Mask[i] < Val || Mask[i] > Val + 1)
17881 ShufpdMask = false;
17882 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17883 CommutableMask = false;
17884 ShuffleImm |= (Mask[i] % 2) << i;
17885 }
17886
17887 if (!ShufpdMask && !CommutableMask)
17888 return false;
17889
17890 if (!ShufpdMask && CommutableMask)
17891 std::swap(V1, V2);
17892
17893 ForceV1Zero = ZeroLane[0];
17894 ForceV2Zero = ZeroLane[1];
17895 return true;
17896}
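// Illustrative trace (not part of the original source): for a hypothetical
// v4f64 Mask = {1, 5, 3, 7} with nothing zeroable, matchShuffleWithSHUFPD
// above accepts the mask directly (ShufpdMask stays true) and builds
//   ShuffleImm = (1&1)<<0 | (5&1)<<1 | (3&1)<<2 | (7&1)<<3 = 0xF
// i.e. SHUFPD with immediate 0xF, taking the high element of each 128-bit
// lane from both operands: {V1[1], V2[1], V1[3], V2[3]}.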
17897
17898static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17899 SDValue V2, ArrayRef<int> Mask,
17900 const APInt &Zeroable,
17901 const X86Subtarget &Subtarget,
17902 SelectionDAG &DAG) {
17903 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17904 "Unexpected data type for VSHUFPD");
17905
17906 unsigned Immediate = 0;
17907 bool ForceV1Zero = false, ForceV2Zero = false;
17908 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17909 Mask, Zeroable))
17910 return SDValue();
17911
17912 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17913 if (ForceV1Zero)
17914 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17915 if (ForceV2Zero)
17916 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17917
17918 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17919 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17920}
17921
17922 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17923 // by zeroable elements in the remaining 24 elements. Turn this into two
17924 // vpmovqb instructions shuffled together.
17925static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17926 SDValue V1, SDValue V2,
17927 ArrayRef<int> Mask,
17928 const APInt &Zeroable,
17929 SelectionDAG &DAG) {
17930 assert(VT == MVT::v32i8 && "Unexpected type!");
17931
17932 // The first 8 indices should be every 8th element.
17933 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17934 return SDValue();
17935
17936 // Remaining elements need to be zeroable.
17937 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17938 return SDValue();
17939
17940 V1 = DAG.getBitcast(MVT::v4i64, V1);
17941 V2 = DAG.getBitcast(MVT::v4i64, V2);
17942
17943 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17944 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17945
17946 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17947 // the upper bits of the result using an unpckldq.
17948 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17949 { 0, 1, 2, 3, 16, 17, 18, 19,
17950 4, 5, 6, 7, 20, 21, 22, 23 });
17951 // Insert the unpckldq into a zero vector to widen to v32i8.
17952 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17953 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17954 DAG.getIntPtrConstant(0, DL));
17955}
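// Illustrative data flow for the lowering above (not part of the original
// source): after the v4i64 bitcasts, each VTRUNC keeps the low byte of every
// 64-bit element and zeroes the upper 12 bytes, so
//   V1 -> { v1[0], v1[8], v1[16], v1[24], 0 x 12 }
//   V2 -> { v2[0], v2[8], v2[16], v2[24], 0 x 12 }
// The unpckldq-style shuffle then interleaves the two 4-byte groups, giving
//   { v1[0], v1[8], v1[16], v1[24], v2[0], v2[8], v2[16], v2[24], 0 x 8 }
// which matches the required mask {0, 8, 16, 24, 32, 40, 48, 56} once the
// result is inserted into the all-zero v32i8 vector.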
17956
17957// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
17958// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
17959// =>
17960// ul = unpckl v1, v2
17961// uh = unpckh v1, v2
17962// a = vperm ul, uh
17963// b = vperm ul, uh
17964//
17965// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
17966// and permute. We cannot directly match v3 because it is split into two
17967// 256-bit vectors in earlier isel stages. Therefore, this function matches a
17968// pair of 256-bit shuffles and makes sure the masks are consecutive.
17969//
17970// Once unpck and permute nodes are created, the permute corresponding to this
17971// shuffle is returned, while the other permute replaces the other half of the
17972// shuffle in the selection dag.
17973static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
17974 SDValue V1, SDValue V2,
17975 ArrayRef<int> Mask,
17976 SelectionDAG &DAG) {
17977 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
17978 VT != MVT::v32i8)
17979 return SDValue();
17980 // <B0, B1, B0+1, B1+1, ..., >
17981 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
17982 unsigned Begin1) {
17983 size_t Size = Mask.size();
17984     assert(Size % 2 == 0 && "Expected even mask size");
17985 for (unsigned I = 0; I < Size; I += 2) {
17986 if (Mask[I] != (int)(Begin0 + I / 2) ||
17987 Mask[I + 1] != (int)(Begin1 + I / 2))
17988 return false;
17989 }
17990 return true;
17991 };
17992   // Check which half of the interleave this shuffle node is.
17993 int NumElts = VT.getVectorNumElements();
17994 size_t FirstQtr = NumElts / 2;
17995 size_t ThirdQtr = NumElts + NumElts / 2;
17996 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
17997 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
17998 if (!IsFirstHalf && !IsSecondHalf)
17999 return SDValue();
18000
18001 // Find the intersection between shuffle users of V1 and V2.
18002 SmallVector<SDNode *, 2> Shuffles;
18003 for (SDNode *User : V1->uses())
18004 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18005 User->getOperand(1) == V2)
18006 Shuffles.push_back(User);
18007 // Limit user size to two for now.
18008 if (Shuffles.size() != 2)
18009 return SDValue();
18010   // Find out which half of the 512-bit shuffle each smaller shuffle is.
18011 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18012 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18013 SDNode *FirstHalf;
18014 SDNode *SecondHalf;
18015 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18016 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18017 FirstHalf = Shuffles[0];
18018 SecondHalf = Shuffles[1];
18019 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18020 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18021 FirstHalf = Shuffles[1];
18022 SecondHalf = Shuffles[0];
18023 } else {
18024 return SDValue();
18025 }
18026 // Lower into unpck and perm. Return the perm of this shuffle and replace
18027 // the other.
18028 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18029 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18030 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18031 DAG.getTargetConstant(0x20, DL, MVT::i8));
18032 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18033 DAG.getTargetConstant(0x31, DL, MVT::i8));
18034 if (IsFirstHalf) {
18035 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18036 return Perm1;
18037 }
18038 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18039 return Perm2;
18040}
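// Illustrative example (not part of the original source), for VT == v8f32
// with inputs a = {a0..a7} and b = {b0..b7}:
//   unpckl = { a0,b0,a1,b1 | a4,b4,a5,b5 }   (per 128-bit lane)
//   unpckh = { a2,b2,a3,b3 | a6,b6,a7,b7 }
// VPERM2X128 with immediate 0x20 selects the low 128-bit halves of both
// operands and 0x31 selects the high halves, so
//   Perm1 = { a0,b0,a1,b1,a2,b2,a3,b3 }   (first half of the interleave)
//   Perm2 = { a4,b4,a5,b5,a6,b6,a7,b7 }   (second half of the interleave)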
18041
18042/// Handle lowering of 4-lane 64-bit floating point shuffles.
18043///
18044/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18045/// isn't available.
18046static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18047 const APInt &Zeroable, SDValue V1, SDValue V2,
18048 const X86Subtarget &Subtarget,
18049 SelectionDAG &DAG) {
18050   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18051   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18052   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18053
18054 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18055 Subtarget, DAG))
18056 return V;
18057
18058 if (V2.isUndef()) {
18059 // Check for being able to broadcast a single element.
18060 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18061 Mask, Subtarget, DAG))
18062 return Broadcast;
18063
18064 // Use low duplicate instructions for masks that match their pattern.
18065 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18066 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18067
18068 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18069 // Non-half-crossing single input shuffles can be lowered with an
18070 // interleaved permutation.
18071 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18072 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18073 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18074 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18075 }
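      // Illustrative example (not part of the original source): for the
      // in-lane mask {1, 0, 3, 2} the immediate computed above is
      //   (Mask[0]==1) | (Mask[1]==1)<<1 | (Mask[2]==3)<<2 | (Mask[3]==3)<<3
      //   = 1 | 0 | 4 | 0 = 0b0101,
      // i.e. VPERMILPD picks the high element of lane 0 into result 0, the
      // low element of lane 0 into result 1, and likewise for lane 1.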
18076
18077 // With AVX2 we have direct support for this permutation.
18078 if (Subtarget.hasAVX2())
18079 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18080 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18081
18082 // Try to create an in-lane repeating shuffle mask and then shuffle the
18083 // results into the target lanes.
18084 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18085 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18086 return V;
18087
18088 // Try to permute the lanes and then use a per-lane permute.
18089 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18090 Mask, DAG, Subtarget))
18091 return V;
18092
18093 // Otherwise, fall back.
18094 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18095 DAG, Subtarget);
18096 }
18097
18098 // Use dedicated unpack instructions for masks that match their pattern.
18099 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18100 return V;
18101
18102 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18103 Zeroable, Subtarget, DAG))
18104 return Blend;
18105
18106 // Check if the blend happens to exactly fit that of SHUFPD.
18107 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18108 Zeroable, Subtarget, DAG))
18109 return Op;
18110
18111 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18112 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18113
18114 // If we have lane crossing shuffles AND they don't all come from the lower
18115 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18116 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18117   // canonicalizes to a blend of splats, which isn't necessary for this combine.
18118 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18119 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18120 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18121 (V2.getOpcode() != ISD::BUILD_VECTOR))
18122 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18123
18124 // If we have one input in place, then we can permute the other input and
18125 // blend the result.
18126 if (V1IsInPlace || V2IsInPlace)
18127 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18128 Subtarget, DAG);
18129
18130 // Try to create an in-lane repeating shuffle mask and then shuffle the
18131 // results into the target lanes.
18132 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18133 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18134 return V;
18135
18136 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18137   // shuffle. However, if we have AVX2 and either input is already in place,
18138   // we will be able to shuffle the other input even across lanes in a single
18139   // instruction, so skip this pattern.
18140 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18141 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18142 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18143 return V;
18144
18145 // If we have VLX support, we can use VEXPAND.
18146 if (Subtarget.hasVLX())
18147 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18148 DAG, Subtarget))
18149 return V;
18150
18151   // If we have AVX2 then we always want to lower with a blend because at v4 we
18152 // can fully permute the elements.
18153 if (Subtarget.hasAVX2())
18154 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18155 Subtarget, DAG);
18156
18157 // Otherwise fall back on generic lowering.
18158 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18159 Subtarget, DAG);
18160}
18161
18162/// Handle lowering of 4-lane 64-bit integer shuffles.
18163///
18164/// This routine is only called when we have AVX2 and thus a reasonable
18165/// instruction set for v4i64 shuffling.
18166static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18167 const APInt &Zeroable, SDValue V1, SDValue V2,
18168 const X86Subtarget &Subtarget,
18169 SelectionDAG &DAG) {
18170   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18171   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18172   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18173   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18174
18175 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18176 Subtarget, DAG))
18177 return V;
18178
18179 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18180 Zeroable, Subtarget, DAG))
18181 return Blend;
18182
18183 // Check for being able to broadcast a single element.
18184 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18185 Subtarget, DAG))
18186 return Broadcast;
18187
18188 if (V2.isUndef()) {
18189 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18190 // can use lower latency instructions that will operate on both lanes.
18191 SmallVector<int, 2> RepeatedMask;
18192 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18193 SmallVector<int, 4> PSHUFDMask;
18194 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18195 return DAG.getBitcast(
18196 MVT::v4i64,
18197 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18198 DAG.getBitcast(MVT::v8i32, V1),
18199 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18200 }
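    // Illustrative example (not part of the original source): a v4i64 mask
    // {1, 0, 3, 2} repeats across the two 128-bit lanes as RepeatedMask
    // {1, 0}; narrowShuffleMaskElts(2, ...) expands it to the dword mask
    // {2, 3, 0, 1}, so the PSHUFD above uses immediate 0x4E and swaps the
    // two 64-bit halves of each 128-bit lane.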
18201
18202 // AVX2 provides a direct instruction for permuting a single input across
18203 // lanes.
18204 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18205 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18206 }
18207
18208 // Try to use shift instructions.
18209 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
18210 Zeroable, Subtarget, DAG))
18211 return Shift;
18212
18213 // If we have VLX support, we can use VALIGN or VEXPAND.
18214 if (Subtarget.hasVLX()) {
18215 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18216 Subtarget, DAG))
18217 return Rotate;
18218
18219 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18220 DAG, Subtarget))
18221 return V;
18222 }
18223
18224 // Try to use PALIGNR.
18225 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18226 Subtarget, DAG))
18227 return Rotate;
18228
18229 // Use dedicated unpack instructions for masks that match their pattern.
18230 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18231 return V;
18232
18233 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18234 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18235
18236 // If we have one input in place, then we can permute the other input and
18237 // blend the result.
18238 if (V1IsInPlace || V2IsInPlace)
18239 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18240 Subtarget, DAG);
18241
18242 // Try to create an in-lane repeating shuffle mask and then shuffle the
18243 // results into the target lanes.
18244 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18245 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18246 return V;
18247
18248 // Try to lower to PERMQ(BLENDD(V1,V2)).
18249 if (SDValue V =
18250 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18251 return V;
18252
18253 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18254   // shuffle. However, if we have AVX2 and either input is already in place,
18255   // we will be able to shuffle the other input even across lanes in a single
18256   // instruction, so skip this pattern.
18257 if (!V1IsInPlace && !V2IsInPlace)
18258 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18259 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18260 return Result;
18261
18262 // Otherwise fall back on generic blend lowering.
18263 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18264 Subtarget, DAG);
18265}
18266
18267/// Handle lowering of 8-lane 32-bit floating point shuffles.
18268///
18269/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18270/// isn't available.
18271static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18272 const APInt &Zeroable, SDValue V1, SDValue V2,
18273 const X86Subtarget &Subtarget,
18274 SelectionDAG &DAG) {
18275   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  [Step 1] '?' condition is true
18276   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  [Step 2] '?' condition is true
18277   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  [Step 3] Assuming the condition is true
  [Step 4] '?' condition is true
18278
18279 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
  [Step 5] Taking false branch
18280 Zeroable, Subtarget, DAG))
18281 return Blend;
18282
18283 // Check for being able to broadcast a single element.
18284 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
  [Step 6] Taking false branch
18285 Subtarget, DAG))
18286 return Broadcast;
18287
18288 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18289 // options to efficiently lower the shuffle.
18290 SmallVector<int, 4> RepeatedMask;
18291 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
  [Step 7] Assuming the condition is false
  [Step 8] Taking false branch
18292     assert(RepeatedMask.size() == 4 &&
18293            "Repeated masks must be half the mask width!");
18294
18295 // Use even/odd duplicate instructions for masks that match their pattern.
18296 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18297 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18298 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18299 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
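    // Illustrative note (not part of the original source): MOVSLDUP
    // duplicates the even (low) element of each pair, producing {0,0,2,2}
    // per lane, while MOVSHDUP duplicates the odd (high) element, producing
    // {1,1,3,3} per lane, which is exactly what the two checks above match.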
18300
18301 if (V2.isUndef())
18302 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18303 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18304
18305 // Use dedicated unpack instructions for masks that match their pattern.
18306 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18307 return V;
18308
18309 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18310 // have already handled any direct blends.
18311 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18312 }
18313
18314 // Try to create an in-lane repeating shuffle mask and then shuffle the
18315 // results into the target lanes.
18316 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
  [Step 9] Taking false branch
18317 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18318 return V;
18319
18320 // If we have a single input shuffle with different shuffle patterns in the
18321   // two 128-bit lanes, use a variable-mask VPERMILPS.
18322 if (V2.isUndef()) {
  [Step 10] Taking false branch
18323 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18324 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18325 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18326 }
18327 if (Subtarget.hasAVX2()) {
18328 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18329 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18330 }
18331 // Otherwise, fall back.
18332 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18333 DAG, Subtarget);
18334 }
18335
18336 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18337 // shuffle.
18338 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
  [Step 11] Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
18339 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18340 return Result;
18341
18342 // If we have VLX support, we can use VEXPAND.
18343 if (Subtarget.hasVLX())
18344 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18345 DAG, Subtarget))
18346 return V;
18347
18348 // Try to match an interleave of two v8f32s and lower them as unpck and
18349 // permutes using ymms. This needs to go before we try to split the vectors.
18350 //
18351 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18352 // this path inadvertently.
18353 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18354 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18355 Mask, DAG))
18356 return V;
18357
18358   // For non-AVX512, if the mask consists of 16-bit elements within each lane,
18359   // then try to split, since after the split we get more efficient code using
18360   // vpunpcklwd and vpunpckhwd instead of vblend.
18361 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18362 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18363 DAG);
18364
18365 // If we have AVX2 then we always want to lower with a blend because at v8 we
18366 // can fully permute the elements.
18367 if (Subtarget.hasAVX2())
18368 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18369 Subtarget, DAG);
18370
18371 // Otherwise fall back on generic lowering.
18372 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18373 Subtarget, DAG);
18374}
18375
18376/// Handle lowering of 8-lane 32-bit integer shuffles.
18377///
18378/// This routine is only called when we have AVX2 and thus a reasonable
18379/// instruction set for v8i32 shuffling.
18380static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18381 const APInt &Zeroable, SDValue V1, SDValue V2,
18382 const X86Subtarget &Subtarget,
18383 SelectionDAG &DAG) {
18384   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18385   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18386   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18387   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18388
18389 // Whenever we can lower this as a zext, that instruction is strictly faster
18390 // than any alternative. It also allows us to fold memory operands into the
18391 // shuffle in many cases.
18392 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18393 Zeroable, Subtarget, DAG))
18394 return ZExt;
18395
18396 // Try to match an interleave of two v8i32s and lower them as unpck and
18397 // permutes using ymms. This needs to go before we try to split the vectors.
18398 if (!Subtarget.hasAVX512())
18399 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18400 Mask, DAG))
18401 return V;
18402
18403 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18404 // since after split we get a more efficient code than vblend by using
18405 // vpunpcklwd and vpunpckhwd instrs.
18406 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18407 !Subtarget.hasAVX512())
18408 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18409 DAG);
18410
18411 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18412 Zeroable, Subtarget, DAG))
18413 return Blend;
18414
18415 // Check for being able to broadcast a single element.
18416 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18417 Subtarget, DAG))
18418 return Broadcast;
18419
18420 // If the shuffle mask is repeated in each 128-bit lane we can use more
18421 // efficient instructions that mirror the shuffles across the two 128-bit
18422 // lanes.
18423 SmallVector<int, 4> RepeatedMask;
18424 bool Is128BitLaneRepeatedShuffle =
18425 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18426 if (Is128BitLaneRepeatedShuffle) {
18427     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18428 if (V2.isUndef())
18429 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18430 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18431
18432 // Use dedicated unpack instructions for masks that match their pattern.
18433 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18434 return V;
18435 }
18436
18437 // Try to use shift instructions.
18438 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
18439 Zeroable, Subtarget, DAG))
18440 return Shift;
18441
18442 // If we have VLX support, we can use VALIGN or EXPAND.
18443 if (Subtarget.hasVLX()) {
18444 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18445 Subtarget, DAG))
18446 return Rotate;
18447
18448 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18449 DAG, Subtarget))
18450 return V;
18451 }
18452
18453 // Try to use byte rotation instructions.
18454 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18455 Subtarget, DAG))
18456 return Rotate;
18457
18458 // Try to create an in-lane repeating shuffle mask and then shuffle the
18459 // results into the target lanes.
18460 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18461 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18462 return V;
18463
18464 if (V2.isUndef()) {
18465 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18466 // because that should be faster than the variable permute alternatives.
18467 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18468 return V;
18469
18470 // If the shuffle patterns aren't repeated but it's a single input, directly
18471 // generate a cross-lane VPERMD instruction.
18472 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18473 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18474 }
18475
18476 // Assume that a single SHUFPS is faster than an alternative sequence of
18477 // multiple instructions (even if the CPU has a domain penalty).
18478 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18479 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18480 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18481 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18482 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18483 CastV1, CastV2, DAG);
18484 return DAG.getBitcast(MVT::v8i32, ShufPS);
18485 }
18486
18487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18488 // shuffle.
18489 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18490 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18491 return Result;
18492
18493 // Otherwise fall back on generic blend lowering.
18494 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18495 Subtarget, DAG);
18496}
18497
18498/// Handle lowering of 16-lane 16-bit integer shuffles.
18499///
18500/// This routine is only called when we have AVX2 and thus a reasonable
18501/// instruction set for v16i16 shuffling.
18502static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18503 const APInt &Zeroable, SDValue V1, SDValue V2,
18504 const X86Subtarget &Subtarget,
18505 SelectionDAG &DAG) {
18506   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18507   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18508   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18509   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18510
18511 // Whenever we can lower this as a zext, that instruction is strictly faster
18512 // than any alternative. It also allows us to fold memory operands into the
18513 // shuffle in many cases.
18514 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18515 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18516 return ZExt;
18517
18518 // Check for being able to broadcast a single element.
18519 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18520 Subtarget, DAG))
18521 return Broadcast;
18522
18523 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18524 Zeroable, Subtarget, DAG))
18525 return Blend;
18526
18527 // Use dedicated unpack instructions for masks that match their pattern.
18528 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18529 return V;
18530
18531 // Use dedicated pack instructions for masks that match their pattern.
18532 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18533 Subtarget))
18534 return V;
18535
18536   // Try to lower using a truncation.
18537 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18538 Subtarget, DAG))
18539 return V;
18540
18541 // Try to use shift instructions.
18542 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
18543 Zeroable, Subtarget, DAG))
18544 return Shift;
18545
18546 // Try to use byte rotation instructions.
18547 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18548 Subtarget, DAG))
18549 return Rotate;
18550
18551 // Try to create an in-lane repeating shuffle mask and then shuffle the
18552 // results into the target lanes.
18553 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18554 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18555 return V;
18556
18557 if (V2.isUndef()) {
18558 // Try to use bit rotation instructions.
18559 if (SDValue Rotate =
18560 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18561 return Rotate;
18562
18563 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18564 // because that should be faster than the variable permute alternatives.
18565 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18566 return V;
18567
18568 // There are no generalized cross-lane shuffle operations available on i16
18569 // element types.
18570 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18571 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18572 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18573 return V;
18574
18575 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18576 DAG, Subtarget);
18577 }
18578
18579 SmallVector<int, 8> RepeatedMask;
18580 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18581 // As this is a single-input shuffle, the repeated mask should be
18582 // a strictly valid v8i16 mask that we can pass through to the v8i16
18583 // lowering to handle even the v16 case.
18584 return lowerV8I16GeneralSingleInputShuffle(
18585 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18586 }
18587 }
18588
18589 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18590 Zeroable, Subtarget, DAG))
18591 return PSHUFB;
18592
18593 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18594 if (Subtarget.hasBWI())
18595 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18596
18597 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18598 // shuffle.
18599 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18600 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18601 return Result;
18602
18603 // Try to permute the lanes and then use a per-lane permute.
18604 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18605 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18606 return V;
18607
18608 // Try to match an interleave of two v16i16s and lower them as unpck and
18609 // permutes using ymms.
18610 if (!Subtarget.hasAVX512())
18611 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18612 Mask, DAG))
18613 return V;
18614
18615 // Otherwise fall back on generic lowering.
18616 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18617 Subtarget, DAG);
18618}
18619
18620/// Handle lowering of 32-lane 8-bit integer shuffles.
18621///
18622/// This routine is only called when we have AVX2 and thus a reasonable
18623/// instruction set for v32i8 shuffling.
18624static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18625 const APInt &Zeroable, SDValue V1, SDValue V2,
18626 const X86Subtarget &Subtarget,
18627 SelectionDAG &DAG) {
18628   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18629   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18630   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18631   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18632
18633 // Whenever we can lower this as a zext, that instruction is strictly faster
18634 // than any alternative. It also allows us to fold memory operands into the
18635 // shuffle in many cases.
18636 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18637 Zeroable, Subtarget, DAG))
18638 return ZExt;
18639
18640 // Check for being able to broadcast a single element.
18641 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18642 Subtarget, DAG))
18643 return Broadcast;
18644
18645 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18646 Zeroable, Subtarget, DAG))
18647 return Blend;
18648
18649 // Use dedicated unpack instructions for masks that match their pattern.
18650 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18651 return V;
18652
18653 // Use dedicated pack instructions for masks that match their pattern.
18654 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18655 Subtarget))
18656 return V;
18657
18658   // Try to lower using a truncation.
18659 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18660 Subtarget, DAG))
18661 return V;
18662
18663 // Try to use shift instructions.
18664 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18665 Zeroable, Subtarget, DAG))
18666 return Shift;
18667
18668 // Try to use byte rotation instructions.
18669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18670 Subtarget, DAG))
18671 return Rotate;
18672
18673 // Try to use bit rotation instructions.
18674 if (V2.isUndef())
18675 if (SDValue Rotate =
18676 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18677 return Rotate;
18678
18679 // Try to create an in-lane repeating shuffle mask and then shuffle the
18680 // results into the target lanes.
18681 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18682 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18683 return V;
18684
18685 // There are no generalized cross-lane shuffle operations available on i8
18686 // element types.
18687 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18688 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18689 // because that should be faster than the variable permute alternatives.
18690 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18691 return V;
18692
18693 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18694 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18695 return V;
18696
18697 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18698 DAG, Subtarget);
18699 }
18700
18701 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18702 Zeroable, Subtarget, DAG))
18703 return PSHUFB;
18704
18705 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18706 if (Subtarget.hasVBMI())
18707 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18708
18709 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18710 // shuffle.
18711 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18712 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18713 return Result;
18714
18715 // Try to permute the lanes and then use a per-lane permute.
18716 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18717 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18718 return V;
18719
18720   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18721 // by zeroable elements in the remaining 24 elements. Turn this into two
18722 // vmovqb instructions shuffled together.
18723 if (Subtarget.hasVLX())
18724 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18725 Mask, Zeroable, DAG))
18726 return V;
18727
18728 // Try to match an interleave of two v32i8s and lower them as unpck and
18729 // permutes using ymms.
18730 if (!Subtarget.hasAVX512())
18731 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18732 Mask, DAG))
18733 return V;
18734
18735 // Otherwise fall back on generic lowering.
18736 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18737 Subtarget, DAG);
18738}
18739
18740/// High-level routine to lower various 256-bit x86 vector shuffles.
18741///
18742/// This routine either breaks down the specific type of a 256-bit x86 vector
18743/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18744/// together based on the available instructions.
18745static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18746 SDValue V1, SDValue V2, const APInt &Zeroable,
18747 const X86Subtarget &Subtarget,
18748 SelectionDAG &DAG) {
18749 // If we have a single input to the zero element, insert that into V1 if we
18750 // can do so cheaply.
18751 int NumElts = VT.getVectorNumElements();
18752 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18753
18754 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18755 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18756 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18757 return Insertion;
18758
18759 // Handle special cases where the lower or upper half is UNDEF.
18760 if (SDValue V =
18761 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18762 return V;
18763
18764 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18765 // can check for those subtargets here and avoid much of the subtarget
18766 // querying in the per-vector-type lowering routines. With AVX1 we have
18767 // essentially *zero* ability to manipulate a 256-bit vector with integer
18768 // types. Since we'll use floating point types there eventually, just
18769 // immediately cast everything to a float and operate entirely in that domain.
18770 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18771 int ElementBits = VT.getScalarSizeInBits();
18772 if (ElementBits < 32) {
18773       // No floating point type available; if we can't use the bit operations
18774 // for masking/blending then decompose into 128-bit vectors.
18775 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18776 Subtarget, DAG))
18777 return V;
18778 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18779 return V;
18780 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18781 }
18782
18783 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18784 VT.getVectorNumElements());
18785 V1 = DAG.getBitcast(FpVT, V1);
18786 V2 = DAG.getBitcast(FpVT, V2);
18787 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18788 }
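  // Illustrative note (not part of the original source): without AVX2 a
  // v4i64 or v8i32 shuffle is simply re-queried as v4f64/v8f32 via the
  // bitcasts above, while the sub-32-bit types (v16i16, v32i8) either become
  // a bit mask / bit blend or are split into two 128-bit shuffles, since
  // AVX1 has essentially no 256-bit integer shuffle support.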
18789
18790 if (VT == MVT::v16f16) {
18791 V1 = DAG.getBitcast(MVT::v16i16, V1);
18792 V2 = DAG.getBitcast(MVT::v16i16, V2);
18793 return DAG.getBitcast(MVT::v16f16,
18794 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18795 }
18796
18797 switch (VT.SimpleTy) {
18798 case MVT::v4f64:
18799 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18800 case MVT::v4i64:
18801 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18802 case MVT::v8f32:
18803 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18804 case MVT::v8i32:
18805 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18806 case MVT::v16i16:
18807 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18808 case MVT::v32i8:
18809 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18810
18811 default:
18812     llvm_unreachable("Not a valid 256-bit x86 vector type!");
18813 }
18814}
18815
18816/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
18817static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18818 const APInt &Zeroable, SDValue V1, SDValue V2,
18819 const X86Subtarget &Subtarget,
18820 SelectionDAG &DAG) {
18821   assert(VT.getScalarSizeInBits() == 64 &&
18822          "Unexpected element type size for 128bit shuffle.");
18823
18824   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
18825   // is most probably the better solution for that case.
18826   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18827
18828 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18829 SmallVector<int, 4> Widened128Mask;
18830 if (!canWidenShuffleElements(Mask, Widened128Mask))
18831 return SDValue();
18832   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18833
18834 // Try to use an insert into a zero vector.
18835 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18836 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18837 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18838 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18839 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18840 DAG.getIntPtrConstant(0, DL));
18841 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18842 getZeroVector(VT, Subtarget, DAG, DL), LoV,
18843 DAG.getIntPtrConstant(0, DL));
18844 }
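  // Illustrative note (not part of the original source): Zeroable here has
  // one bit per 64-bit element, so (Zeroable & 0xf0) == 0xf0 means elements
  // 4-7 (the upper 256 bits) are zeroable, and (Zeroable & 0x0c) == 0x0c
  // additionally means elements 2-3 are zeroable, in which case only the low
  // 128 bits of V1 need to be inserted into the zero vector above.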
18845
18846 // Check for patterns which can be matched with a single insert of a 256-bit
18847 // subvector.
18848 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18849 if (OnlyUsesV1 ||
18850 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18851 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18852 SDValue SubVec =
18853 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18854 DAG.getIntPtrConstant(0, DL));
18855 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18856 DAG.getIntPtrConstant(4, DL));
18857 }
18858
18859 // See if this is an insertion of the lower 128-bits of V2 into V1.
18860 bool IsInsert = true;
18861 int V2Index = -1;
18862 for (int i = 0; i < 4; ++i) {
18863     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18864 if (Widened128Mask[i] < 0)
18865 continue;
18866
18867 // Make sure all V1 subvectors are in place.
18868 if (Widened128Mask[i] < 4) {
18869 if (Widened128Mask[i] != i) {
18870 IsInsert = false;
18871 break;
18872 }
18873 } else {
18874       // Make sure we only have a single V2 index and it's the lowest 128 bits.
18875 if (V2Index >= 0 || Widened128Mask[i] != 4) {
18876 IsInsert = false;
18877 break;
18878 }
18879 V2Index = i;
18880 }
18881 }
18882 if (IsInsert && V2Index >= 0) {
18883 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18884 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18885 DAG.getIntPtrConstant(0, DL));
18886 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18887 }
18888
18889   // See if we can widen to a 256-bit lane shuffle. We're going to lose 128-lane
18890 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
18891 // possible we at least ensure the lanes stay sequential to help later
18892 // combines.
18893 SmallVector<int, 2> Widened256Mask;
18894 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18895 Widened128Mask.clear();
18896 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18897 }
18898
18899 // Try to lower to vshuf64x2/vshuf32x4.
18900 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18901 unsigned PermMask = 0;
18902   // Ensure elements came from the same Op.
18903 for (int i = 0; i < 4; ++i) {
18904     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18905 if (Widened128Mask[i] < 0)
18906 continue;
18907
18908 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18909 unsigned OpIndex = i / 2;
18910 if (Ops[OpIndex].isUndef())
18911 Ops[OpIndex] = Op;
18912 else if (Ops[OpIndex] != Op)
18913 return SDValue();
18914
18915 // Convert the 128-bit shuffle mask selection values into 128-bit selection
18916 // bits defined by a vshuf64x2 instruction's immediate control byte.
18917 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18918 }
18919
18920 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18921 DAG.getTargetConstant(PermMask, DL, MVT::i8));
18922}
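// Illustrative worked example (not part of the original source): with
// Widened128Mask = {2, 3, 4, 5}, result chunks 0-1 come from V1 and chunks
// 2-3 from V2, so Ops = {V1, V2} and
//   PermMask = (2%4) | (3%4)<<2 | (4%4)<<4 | (5%4)<<6 = 0x4E,
// i.e. SHUF128 selects V1's upper two 128-bit chunks followed by V2's lower
// two chunks.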
18923
18924/// Handle lowering of 8-lane 64-bit floating point shuffles.
18925static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18926 const APInt &Zeroable, SDValue V1, SDValue V2,
18927 const X86Subtarget &Subtarget,
18928 SelectionDAG &DAG) {
18929   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18930   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18931   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18932
18933 if (V2.isUndef()) {
18934 // Use low duplicate instructions for masks that match their pattern.
18935 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18936 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18937
18938 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18939 // Non-half-crossing single input shuffles can be lowered with an
18940 // interleaved permutation.
18941 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18942 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18943 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18944 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18945 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18946 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18947 }
18948
18949 SmallVector<int, 4> RepeatedMask;
18950 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18951 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18952 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18953 }
18954
18955 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18956 V2, Subtarget, DAG))
18957 return Shuf128;
18958
18959 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18960 return Unpck;
18961
18962 // Check if the blend happens to exactly fit that of SHUFPD.
18963 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18964 Zeroable, Subtarget, DAG))
18965 return Op;
18966
18967 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18968 DAG, Subtarget))
18969 return V;
18970
18971 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18972 Zeroable, Subtarget, DAG))
18973 return Blend;
18974
18975 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18976}
18977
18978/// Handle lowering of 16-lane 32-bit floating point shuffles.
18979static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18980 const APInt &Zeroable, SDValue V1, SDValue V2,
18981 const X86Subtarget &Subtarget,
18982 SelectionDAG &DAG) {
18983   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18984   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18985   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18986
18987 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18988 // options to efficiently lower the shuffle.
18989 SmallVector<int, 4> RepeatedMask;
18990 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18991     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18992
18993 // Use even/odd duplicate instructions for masks that match their pattern.
18994 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18995 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18996 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18997 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18998
18999 if (V2.isUndef())
19000 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19001 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19002
19003 // Use dedicated unpack instructions for masks that match their pattern.
19004 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19005 return V;
19006
19007 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19008 Zeroable, Subtarget, DAG))
19009 return Blend;
19010
19011 // Otherwise, fall back to a SHUFPS sequence.
19012 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19013 }
19014
19015 // Try to create an in-lane repeating shuffle mask and then shuffle the
19016 // results into the target lanes.
19017 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19018 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19019 return V;
19020
19021 // If we have a single-input shuffle with different shuffle patterns in the
19022 // 128-bit lanes but no lane crossing, use a variable-mask VPERMILPS.
19023 if (V2.isUndef() &&
19024 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19025 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19026 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19027 }
19028
19029 // If we have AVX512F support, we can use VEXPAND.
19030 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19031 V1, V2, DAG, Subtarget))
19032 return V;
19033
19034 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19035}
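// --- Illustrative sketch (hypothetical helper, not the LLVM implementation) ---
// What "repeated in each 128-bit lane" means for a v16f32 mask, simplified to
// the single-input case: with 4 floats per 128-bit lane, every lane must apply
// the same 4-element pattern at its own offset, and undef (-1) matches
// anything. The real is128BitLaneRepeatedShuffleMask also handles two inputs.
#include <cstddef>
#include <vector>

static bool isRepeatedPer128BitLane(const std::vector<int> &Mask,
                                    std::vector<int> &Repeated) {
  const int EltsPerLane = 4;                   // 4 x f32 per 128-bit lane
  Repeated.assign(EltsPerLane, -1);
  for (size_t i = 0; i != Mask.size(); ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef: no constraint
    if (M / EltsPerLane != int(i) / EltsPerLane)
      return false;                            // crosses a 128-bit lane
    int &R = Repeated[i % EltsPerLane];
    if (R < 0)
      R = M % EltsPerLane;                     // first lane fixes the pattern
    else if (R != M % EltsPerLane)
      return false;                            // lanes disagree
  }
  return true;
}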
19036
19037/// Handle lowering of 8-lane 64-bit integer shuffles.
19038static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19039 const APInt &Zeroable, SDValue V1, SDValue V2,
19040 const X86Subtarget &Subtarget,
19041 SelectionDAG &DAG) {
19042 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19043 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19044 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19045
19046 if (V2.isUndef()) {
19047 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19048 // can use lower latency instructions that will operate on all four
19049 // 128-bit lanes.
19050 SmallVector<int, 2> Repeated128Mask;
19051 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19052 SmallVector<int, 4> PSHUFDMask;
19053 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19054 return DAG.getBitcast(
19055 MVT::v8i64,
19056 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19057 DAG.getBitcast(MVT::v16i32, V1),
19058 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19059 }
19060
19061 SmallVector<int, 4> Repeated256Mask;
19062 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19063 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19064 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19065 }
19066
19067 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19068 V2, Subtarget, DAG))
19069 return Shuf128;
19070
19071 // Try to use shift instructions.
19072 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
19073 Zeroable, Subtarget, DAG))
19074 return Shift;
19075
19076 // Try to use VALIGN.
19077 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19078 Subtarget, DAG))
19079 return Rotate;
19080
19081 // Try to use PALIGNR.
19082 if (Subtarget.hasBWI())
19083 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19084 Subtarget, DAG))
19085 return Rotate;
19086
19087 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19088 return Unpck;
19089
19090 // If we have AVX512F support, we can use VEXPAND.
19091 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19092 DAG, Subtarget))
19093 return V;
19094
19095 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19096 Zeroable, Subtarget, DAG))
19097 return Blend;
19098
19099 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19100}
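// --- Illustrative sketch (hypothetical helper) ---
// How the 2-element per-128-bit-lane v8i64 mask above is narrowed into the
// 4-element v16i32 PSHUFD mask: each 64-bit selection M expands into its two
// 32-bit halves {2*M, 2*M+1}, and undef (-1) expands into two undefs.
#include <vector>

static std::vector<int> narrowMaskEltsBy2(const std::vector<int> &WideMask) {
  std::vector<int> Narrow;
  Narrow.reserve(WideMask.size() * 2);
  for (int M : WideMask) {
    Narrow.push_back(M < 0 ? -1 : 2 * M);
    Narrow.push_back(M < 0 ? -1 : 2 * M + 1);
  }
  return Narrow;
}
// e.g. the per-lane i64 swap {1, 0} narrows to the PSHUFD pattern {2, 3, 0, 1}.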
19101
19102/// Handle lowering of 16-lane 32-bit integer shuffles.
19103static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19104 const APInt &Zeroable, SDValue V1, SDValue V2,
19105 const X86Subtarget &Subtarget,
19106 SelectionDAG &DAG) {
19107 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19108 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19109 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19110
19111 // Whenever we can lower this as a zext, that instruction is strictly faster
19112 // than any alternative. It also allows us to fold memory operands into the
19113 // shuffle in many cases.
19114 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19115 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19116 return ZExt;
19117
19118 // If the shuffle mask is repeated in each 128-bit lane we can use more
19119 // efficient instructions that mirror the shuffles across the four 128-bit
19120 // lanes.
19121 SmallVector<int, 4> RepeatedMask;
19122 bool Is128BitLaneRepeatedShuffle =
19123 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19124 if (Is128BitLaneRepeatedShuffle) {
19125 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19126 if (V2.isUndef())
19127 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19128 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19129
19130 // Use dedicated unpack instructions for masks that match their pattern.
19131 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19132 return V;
19133 }
19134
19135 // Try to use shift instructions.
19136 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
19137 Zeroable, Subtarget, DAG))
19138 return Shift;
19139
19140 // Try to use VALIGN.
19141 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19142 Subtarget, DAG))
19143 return Rotate;
19144
19145 // Try to use byte rotation instructions.
19146 if (Subtarget.hasBWI())
19147 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19148 Subtarget, DAG))
19149 return Rotate;
19150
19151 // Assume that a single SHUFPS is faster than using a permv shuffle.
19152 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19153 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19154 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19155 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19156 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19157 CastV1, CastV2, DAG);
19158 return DAG.getBitcast(MVT::v16i32, ShufPS);
19159 }
19160
19161 // Try to create an in-lane repeating shuffle mask and then shuffle the
19162 // results into the target lanes.
19163 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19164 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19165 return V;
19166
19167 // If we have AVX512F support, we can use VEXPAND.
19168 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19169 DAG, Subtarget))
19170 return V;
19171
19172 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19173 Zeroable, Subtarget, DAG))
19174 return Blend;
19175
19176 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19177}
19178
19179/// Handle lowering of 32-lane 16-bit integer shuffles.
19180static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19181 const APInt &Zeroable, SDValue V1, SDValue V2,
19182 const X86Subtarget &Subtarget,
19183 SelectionDAG &DAG) {
19184 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19185 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19186 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19187 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19188
19189 // Whenever we can lower this as a zext, that instruction is strictly faster
19190 // than any alternative. It also allows us to fold memory operands into the
19191 // shuffle in many cases.
19192 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19193 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19194 return ZExt;
19195
19196 // Use dedicated unpack instructions for masks that match their pattern.
19197 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19198 return V;
19199
19200 // Use dedicated pack instructions for masks that match their pattern.
19201 if (SDValue V =
19202 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19203 return V;
19204
19205 // Try to use shift instructions.
19206 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
19207 Zeroable, Subtarget, DAG))
19208 return Shift;
19209
19210 // Try to use byte rotation instructions.
19211 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19212 Subtarget, DAG))
19213 return Rotate;
19214
19215 if (V2.isUndef()) {
19216 // Try to use bit rotation instructions.
19217 if (SDValue Rotate =
19218 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19219 return Rotate;
19220
19221 SmallVector<int, 8> RepeatedMask;
19222 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19223 // As this is a single-input shuffle, the repeated mask should be
19224 // a strictly valid v8i16 mask that we can pass through to the v8i16
19225 // lowering to handle even the v32 case.
19226 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19227 RepeatedMask, Subtarget, DAG);
19228 }
19229 }
19230
19231 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19232 Zeroable, Subtarget, DAG))
19233 return Blend;
19234
19235 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19236 Zeroable, Subtarget, DAG))
19237 return PSHUFB;
19238
19239 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19240}
19241
19242/// Handle lowering of 64-lane 8-bit integer shuffles.
19243static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19244 const APInt &Zeroable, SDValue V1, SDValue V2,
19245 const X86Subtarget &Subtarget,
19246 SelectionDAG &DAG) {
19247 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19248 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19249 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19250 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19251
19252 // Whenever we can lower this as a zext, that instruction is strictly faster
19253 // than any alternative. It also allows us to fold memory operands into the
19254 // shuffle in many cases.
19255 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19256 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19257 return ZExt;
19258
19259 // Use dedicated unpack instructions for masks that match their pattern.
19260 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19261 return V;
19262
19263 // Use dedicated pack instructions for masks that match their pattern.
19264 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19265 Subtarget))
19266 return V;
19267
19268 // Try to use shift instructions.
19269 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
19270 Zeroable, Subtarget, DAG))
19271 return Shift;
19272
19273 // Try to use byte rotation instructions.
19274 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19275 Subtarget, DAG))
19276 return Rotate;
19277
19278 // Try to use bit rotation instructions.
19279 if (V2.isUndef())
19280 if (SDValue Rotate =
19281 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19282 return Rotate;
19283
19284 // Lower as AND if possible.
19285 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19286 Zeroable, Subtarget, DAG))
19287 return Masked;
19288
19289 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19290 Zeroable, Subtarget, DAG))
19291 return PSHUFB;
19292
19293 // Try to create an in-lane repeating shuffle mask and then shuffle the
19294 // results into the target lanes.
19295 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19296 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19297 return V;
19298
19299 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19300 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19301 return Result;
19302
19303 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19304 Zeroable, Subtarget, DAG))
19305 return Blend;
19306
19307 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19308 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19309 // PALIGNR will be cheaper than the second PSHUFB+OR.
19310 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19311 Mask, Subtarget, DAG))
19312 return V;
19313
19314 // If we can't directly blend but can use PSHUFB, that will be better as it
19315 // can both shuffle and set up the inefficient blend.
19316 bool V1InUse, V2InUse;
19317 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19318 DAG, V1InUse, V2InUse);
19319 }
19320
19321 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19322 // shuffle.
19323 if (!V2.isUndef())
19324 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19325 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19326 return Result;
19327
19328 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19329 if (Subtarget.hasVBMI())
19330 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19331
19332 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
19333}
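// --- Illustrative sketch (hypothetical helper, unary case) ---
// The rotation pattern the PALIGNR-style byte-rotate lowering above looks for:
// a mask is a rotate by R when every defined element i reads from (i + R)
// modulo the width. In practice PALIGNR applies per 128-bit lane; a single
// 16-byte vector is shown here for simplicity.
#include <vector>

static int matchUnaryRotate(const std::vector<int> &Mask) {
  int Size = (int)Mask.size(), Rot = -1;
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef: no constraint
    int R = (M - i + Size) % Size;             // rotation implied by this lane
    if (Rot < 0)
      Rot = R;
    else if (Rot != R)
      return -1;                               // lanes imply different rotations
  }
  return Rot;                                  // -1 when all-undef or no match
}
// e.g. the v16i8 mask {3, 4, 5, ..., 15, 0, 1, 2} matches a rotate by 3.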
19334
19335/// High-level routine to lower various 512-bit x86 vector shuffles.
19336///
19337/// This routine either breaks down the specific type of a 512-bit x86 vector
19338/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19339/// together based on the available instructions.
19340static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19341 MVT VT, SDValue V1, SDValue V2,
19342 const APInt &Zeroable,
19343 const X86Subtarget &Subtarget,
19344 SelectionDAG &DAG) {
19345 assert(Subtarget.hasAVX512() &&
19346        "Cannot lower 512-bit vectors w/ basic ISA!");
19347
19348 // If we have a single input to the zero element, insert that into V1 if we
19349 // can do so cheaply.
19350 int NumElts = Mask.size();
19351 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19352
19353 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19354 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19355 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19356 return Insertion;
19357
19358 // Handle special cases where the lower or upper half is UNDEF.
19359 if (SDValue V =
19360 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19361 return V;
19362
19363 // Check for being able to broadcast a single element.
19364 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19365 Subtarget, DAG))
19366 return Broadcast;
19367
19368 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19369 // Try using bit ops for masking and blending before falling back to
19370 // splitting.
19371 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19372 Subtarget, DAG))
19373 return V;
19374 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19375 return V;
19376
19377 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
19378 }
19379
19380 if (VT == MVT::v32f16) {
19381 V1 = DAG.getBitcast(MVT::v32i16, V1);
19382 V2 = DAG.getBitcast(MVT::v32i16, V2);
19383 return DAG.getBitcast(MVT::v32f16,
19384 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19385 }
19386
19387 // Dispatch to each element type for lowering. If we don't have support for
19388 // specific element type shuffles at 512 bits, immediately split them and
19389 // lower them. Each lowering routine of a given type is allowed to assume that
19390 // the requisite ISA extensions for that element type are available.
19391 switch (VT.SimpleTy) {
19392 case MVT::v8f64:
19393 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19394 case MVT::v16f32:
19395 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19396 case MVT::v8i64:
19397 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19398 case MVT::v16i32:
19399 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19400 case MVT::v32i16:
19401 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19402 case MVT::v64i8:
19403 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19404
19405 default:
19406 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19406)
;
19407 }
19408}
19409
19410static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19411 MVT VT, SDValue V1, SDValue V2,
19412 const X86Subtarget &Subtarget,
19413 SelectionDAG &DAG) {
19414 // Shuffle should be unary.
19415 if (!V2.isUndef())
19416 return SDValue();
19417
19418 int ShiftAmt = -1;
19419 int NumElts = Mask.size();
19420 for (int i = 0; i != NumElts; ++i) {
19421 int M = Mask[i];
19422 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19423        "Unexpected mask index.");
19424 if (M < 0)
19425 continue;
19426
19427 // The first non-undef element determines our shift amount.
19428 if (ShiftAmt < 0) {
19429 ShiftAmt = M - i;
19430 // Need to be shifting right.
19431 if (ShiftAmt <= 0)
19432 return SDValue();
19433 }
19434 // All non-undef elements must shift by the same amount.
19435 if (ShiftAmt != M - i)
19436 return SDValue();
19437 }
19438 assert(ShiftAmt >= 0 && "All undef?");
19439
19440 // Great we found a shift right.
19441 MVT WideVT = VT;
19442 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19443 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19444 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19445 DAG.getUNDEF(WideVT), V1,
19446 DAG.getIntPtrConstant(0, DL));
19447 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19448 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19449 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19450 DAG.getIntPtrConstant(0, DL));
19451}
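// --- Illustrative sketch (hypothetical helper) ---
// The shift-detection loop above in plain C++: a unary mask encodes a KSHIFTR
// when every defined element i reads from i + ShiftAmt for a single positive
// ShiftAmt. Returns -1 where the original gives up (or, unlike the original,
// when the mask is entirely undef).
#include <vector>

static int matchUnaryRightShift(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef: no constraint
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;                        // first defined element fixes it
      if (ShiftAmt <= 0)
        return -1;                             // not shifting right
    } else if (ShiftAmt != M - i) {
      return -1;                               // elements disagree
    }
  }
  return ShiftAmt;
}
// e.g. {2, 3, -1, 5, 6, 7, -1, -1} for a v8i1 shuffle matches ShiftAmt == 2.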
19452
19453// Determine if this shuffle can be implemented with a KSHIFT instruction.
19454// Returns the shift amount if possible or -1 if not. This is a simplified
19455// version of matchShuffleAsShift.
19456static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19457 int MaskOffset, const APInt &Zeroable) {
19458 int Size = Mask.size();
19459
19460 auto CheckZeros = [&](int Shift, bool Left) {
19461 for (int j = 0; j < Shift; ++j)
19462 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19463 return false;
19464
19465 return true;
19466 };
19467
19468 auto MatchShift = [&](int Shift, bool Left) {
19469 unsigned Pos = Left ? Shift : 0;
19470 unsigned Low = Left ? 0 : Shift;
19471 unsigned Len = Size - Shift;
19472 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19473 };
19474
19475 for (int Shift = 1; Shift != Size; ++Shift)
19476 for (bool Left : {true, false})
19477 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19478 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19479 return Shift;
19480 }
19481
19482 return -1;
19483}
19484
19485
19486// Lower vXi1 vector shuffles.
19487 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19488 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19489 // vector, shuffle it, and then truncate back.
19490static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19491 MVT VT, SDValue V1, SDValue V2,
19492 const APInt &Zeroable,
19493 const X86Subtarget &Subtarget,
19494 SelectionDAG &DAG) {
19495 assert(Subtarget.hasAVX512() &&
19496        "Cannot lower 512-bit vectors w/o basic ISA!");
19497
19498 int NumElts = Mask.size();
19499
19500 // Try to recognize shuffles that are just padding a subvector with zeros.
19501 int SubvecElts = 0;
19502 int Src = -1;
19503 for (int i = 0; i != NumElts; ++i) {
19504 if (Mask[i] >= 0) {
19505 // Grab the source from the first valid mask. All subsequent elements need
19506 // to use this same source.
19507 if (Src < 0)
19508 Src = Mask[i] / NumElts;
19509 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19510 break;
19511 }
19512
19513 ++SubvecElts;
19514 }
19515 assert(SubvecElts != NumElts && "Identity shuffle?");
19516
19517 // Clip to a power of 2.
19518 SubvecElts = PowerOf2Floor(SubvecElts);
19519
19520 // Make sure the number of zeroable bits in the top at least covers the bits
19521 // not covered by the subvector.
19522 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
19523 assert(Src >= 0 && "Expected a source!");
19524 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19525 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19526 Src == 0 ? V1 : V2,
19527 DAG.getIntPtrConstant(0, DL));
19528 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19529 DAG.getConstant(0, DL, VT),
19530 Extract, DAG.getIntPtrConstant(0, DL));
19531 }
19532
19533 // Try a simple shift right with undef elements. Later we'll try with zeros.
19534 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19535 DAG))
19536 return Shift;
19537
19538 // Try to match KSHIFTs.
19539 unsigned Offset = 0;
19540 for (SDValue V : { V1, V2 }) {
19541 unsigned Opcode;
19542 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19543 if (ShiftAmt >= 0) {
19544 MVT WideVT = VT;
19545 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19546 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19547 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19548 DAG.getUNDEF(WideVT), V,
19549 DAG.getIntPtrConstant(0, DL));
19550 // Widened right shifts need two shifts to ensure we shift in zeroes.
19551 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19552 int WideElts = WideVT.getVectorNumElements();
19553 // Shift left to put the original vector in the MSBs of the new size.
19554 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19555 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19556 // Increase the shift amount to account for the left shift.
19557 ShiftAmt += WideElts - NumElts;
19558 }
19559
19560 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19561 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19562 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19563 DAG.getIntPtrConstant(0, DL));
19564 }
19565 Offset += NumElts; // Increment for next iteration.
19566 }
19567
19568 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19569 // TODO: What other unary shuffles would benefit from this?
19570 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19571 V1->hasOneUse()) {
19572 SDValue Op0 = V1.getOperand(0);
19573 SDValue Op1 = V1.getOperand(1);
19574 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19575 EVT OpVT = Op0.getValueType();
19576 return DAG.getSetCC(
19577 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19578 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19579 }
19580
19581 MVT ExtVT;
19582 switch (VT.SimpleTy) {
19583 default:
19584 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19584)
;
19585 case MVT::v2i1:
19586 ExtVT = MVT::v2i64;
19587 break;
19588 case MVT::v4i1:
19589 ExtVT = MVT::v4i32;
19590 break;
19591 case MVT::v8i1:
19592 // Take a 512-bit type; there are more shuffles available on KNL. If we have
19593 // VLX, use a 256-bit shuffle.
19594 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19595 break;
19596 case MVT::v16i1:
19597 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19598 // 256-bit operation available.
19599 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19600 break;
19601 case MVT::v32i1:
19602 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19603 // 256-bit operation available.
19604 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19605 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19606 break;
19607 case MVT::v64i1:
19608 // Fall back to scalarization. FIXME: We can do better if the shuffle
19609 // can be partitioned cleanly.
19610 if (!Subtarget.useBWIRegs())
19611 return SDValue();
19612 ExtVT = MVT::v64i8;
19613 break;
19614 }
19615
19616 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19617 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19618
19619 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19620 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19621 int NumElems = VT.getVectorNumElements();
19622 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19623 (Subtarget.hasDQI() && (NumElems < 32)))
19624 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19625 Shuffle, ISD::SETGT);
19626
19627 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19628}
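// --- Illustrative sketch (hypothetical helper, first-operand form) ---
// The "subvector padded with zeros" pattern recognized at the top of
// lower1BitShuffle above: a leading identity run from V1 (undefs allowed),
// clipped to a power of two, with every remaining top element zeroable.
// Returns the subvector length, or 0 if the pattern does not match.
#include <vector>

static int matchZeroPaddedSubvector(const std::vector<int> &Mask,
                                    const std::vector<bool> &Zeroable) {
  int NumElts = (int)Mask.size();
  int Run = 0;
  while (Run != NumElts && (Mask[Run] < 0 || Mask[Run] == Run))
    ++Run;                                     // leading identity/undef run
  if (Run == 0)
    return 0;
  int Subvec = 1;
  while (Subvec * 2 <= Run)
    Subvec *= 2;                               // clip to a power of 2
  for (int i = Subvec; i != NumElts; ++i)
    if (!Zeroable[i])
      return 0;                                // the padding must be zeroable
  return Subvec;
}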
19629
19630/// Helper function that returns true if the shuffle mask should be
19631/// commuted to improve canonicalization.
19632static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19633 int NumElements = Mask.size();
19634
19635 int NumV1Elements = 0, NumV2Elements = 0;
19636 for (int M : Mask)
19637 if (M < 0)
19638 continue;
19639 else if (M < NumElements)
19640 ++NumV1Elements;
19641 else
19642 ++NumV2Elements;
19643
19644 // Commute the shuffle as needed such that more elements come from V1 than
19645 // V2. This allows us to match the shuffle pattern strictly on how many
19646 // elements come from V1 without handling the symmetric cases.
19647 if (NumV2Elements > NumV1Elements)
19648 return true;
19649
19650 assert(NumV1Elements > 0 && "No V1 indices");
19651
19652 if (NumV2Elements == 0)
19653 return false;
19654
19655 // When the number of V1 and V2 elements is the same, try to minimize the
19656 // number of uses of V2 in the low half of the vector. When that is tied,
19657 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19658 // indices for V2. When those are equal, try to ensure that the number of odd
19659 // indices for V1 is lower than the number of odd indices for V2.
19660 if (NumV1Elements == NumV2Elements) {
19661 int LowV1Elements = 0, LowV2Elements = 0;
19662 for (int M : Mask.slice(0, NumElements / 2))
19663 if (M >= NumElements)
19664 ++LowV2Elements;
19665 else if (M >= 0)
19666 ++LowV1Elements;
19667 if (LowV2Elements > LowV1Elements)
19668 return true;
19669 if (LowV2Elements == LowV1Elements) {
19670 int SumV1Indices = 0, SumV2Indices = 0;
19671 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19672 if (Mask[i] >= NumElements)
19673 SumV2Indices += i;
19674 else if (Mask[i] >= 0)
19675 SumV1Indices += i;
19676 if (SumV2Indices < SumV1Indices)
19677 return true;
19678 if (SumV2Indices == SumV1Indices) {
19679 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19680 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19681 if (Mask[i] >= NumElements)
19682 NumV2OddIndices += i % 2;
19683 else if (Mask[i] >= 0)
19684 NumV1OddIndices += i % 2;
19685 if (NumV2OddIndices < NumV1OddIndices)
19686 return true;
19687 }
19688 }
19689 }
19690
19691 return false;
19692}
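// --- Illustrative sketch (hypothetical helpers) ---
// The first and cheapest commute criterion above -- prefer the operand order
// where more elements come from V1 -- plus the mask rewrite the caller then
// performs via ShuffleVectorSDNode::commuteMask.
#include <vector>

static bool shouldCommuteByElementCount(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size(), NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;                                // undef counts for neither
    (M < NumElts ? NumV1 : NumV2) += 1;
  }
  return NumV2 > NumV1;
}

static void commuteMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts;
}
// e.g. {4, 5, 0, 6} commutes (3 of 4 lanes read V2) and becomes {0, 1, 4, 2}.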
19693
19694static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19695 const X86Subtarget &Subtarget) {
19696 if (!Subtarget.hasAVX512())
19697 return false;
19698
19699 MVT VT = V1.getSimpleValueType().getScalarType();
19700 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19701 return false;
19702
19703 // i8 is better widened to i16, because there is PBLENDW for vXi16
19704 // when the vector bit size is 128 or 256.
19705 if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
19706 return false;
19707
19708 auto HasMaskOperation = [&](SDValue V) {
19709 // TODO: Currently we only check a limited set of opcodes. We could probably
19710 // extend this to all binary operations by checking TLI.isBinOp().
19711 switch (V->getOpcode()) {
19712 default:
19713 return false;
19714 case ISD::ADD:
19715 case ISD::SUB:
19716 case ISD::AND:
19717 case ISD::XOR:
19718 break;
19719 }
19720 if (!V->hasOneUse())
19721 return false;
19722
19723 return true;
19724 };
19725
19726 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19727 return true;
19728
19729 return false;
19730}
19731
19732// Forward declaration.
19733static SDValue canonicalizeShuffleMaskWithHorizOp(
19734 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19735 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19736 const X86Subtarget &Subtarget);
19737
19738 /// Top-level lowering for x86 vector shuffles.
19739///
19740/// This handles decomposition, canonicalization, and lowering of all x86
19741/// vector shuffles. Most of the specific lowering strategies are encapsulated
19742/// above in helper routines. The canonicalization attempts to widen shuffles
19743/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19744/// s.t. only one of the two inputs needs to be tested, etc.
19745static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19746 SelectionDAG &DAG) {
19747 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19748 ArrayRef<int> OrigMask = SVOp->getMask();
19749 SDValue V1 = Op.getOperand(0);
19750 SDValue V2 = Op.getOperand(1);
19751 MVT VT = Op.getSimpleValueType();
19752 int NumElements = VT.getVectorNumElements();
19753 SDLoc DL(Op);
19754 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19755
19756 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19757        "Can't lower MMX shuffles");
19758
19759 bool V1IsUndef = V1.isUndef();
19760 bool V2IsUndef = V2.isUndef();
19761 if (V1IsUndef && V2IsUndef)
19762 return DAG.getUNDEF(VT);
19763
19764 // When we create a shuffle node we put the UNDEF node as the second operand,
19765 // but in some cases the first operand may be transformed to UNDEF.
19766 // In that case we should just commute the node.
19767 if (V1IsUndef)
19768 return DAG.getCommutedVectorShuffle(*SVOp);
19769
19770 // Check for non-undef masks pointing at an undef vector and make the masks
19771 // undef as well. This makes it easier to match the shuffle based solely on
19772 // the mask.
19773 if (V2IsUndef &&
19774 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19775 SmallVector<int, 8> NewMask(OrigMask);
19776 for (int &M : NewMask)
19777 if (M >= NumElements)
19778 M = -1;
19779 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19780 }
19781
19782 // Check for illegal shuffle mask element index values.
19783 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19784 (void)MaskUpperLimit;
19785 assert(llvm::all_of(OrigMask,
19786        [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19787        "Out of bounds shuffle index");
19788
19789 // We actually see shuffles that are entirely re-arrangements of a set of
19790 // zero inputs. This mostly happens while decomposing complex shuffles into
19791 // simple ones. Directly lower these as a buildvector of zeros.
19792 APInt KnownUndef, KnownZero;
19793 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19794
19795 APInt Zeroable = KnownUndef | KnownZero;
19796 if (Zeroable.isAllOnes())
19797 return getZeroVector(VT, Subtarget, DAG, DL);
19798
19799 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19800
19801 // Try to collapse shuffles into using a vector type with fewer elements but
19802 // wider element types. We cap this to not form integers or floating point
19803 // elements wider than 64 bits. It does not seem beneficial to form i128
19804 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
19805 SmallVector<int, 16> WidenedMask;
19806 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19807 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
19808 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19809 // Shuffle mask widening should not interfere with a broadcast opportunity
19810 // by obfuscating the operands with bitcasts.
19811 // TODO: Avoid lowering directly from this top-level function: make this
19812 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19813 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19814 Subtarget, DAG))
19815 return Broadcast;
19816
19817 MVT NewEltVT = VT.isFloatingPoint()
19818 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19819 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19820 int NewNumElts = NumElements / 2;
19821 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19822 // Make sure that the new vector type is legal. For example, v2f64 isn't
19823 // legal on SSE1.
19824 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19825 if (V2IsZero) {
19826 // Modify the new Mask to take all zeros from the all-zero vector.
19827 // Choose indices that are blend-friendly.
19828 bool UsedZeroVector = false;
19829 assert(is_contained(WidenedMask, SM_SentinelZero) &&
19830        "V2's non-undef elements are used?!");
19831 for (int i = 0; i != NewNumElts; ++i)
19832 if (WidenedMask[i] == SM_SentinelZero) {
19833 WidenedMask[i] = i + NewNumElts;
19834 UsedZeroVector = true;
19835 }
19836 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19837 // some elements to be undef.
19838 if (UsedZeroVector)
19839 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19840 }
19841 V1 = DAG.getBitcast(NewVT, V1);
19842 V2 = DAG.getBitcast(NewVT, V2);
19843 return DAG.getBitcast(
19844 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19845 }
19846 }
19847
19848 SmallVector<SDValue> Ops = {V1, V2};
19849 SmallVector<int> Mask(OrigMask);
19850
19851 // Canonicalize the shuffle with any horizontal ops inputs.
19852 // NOTE: This may update Ops and Mask.
19853 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19854 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19855 return DAG.getBitcast(VT, HOp);
19856
19857 V1 = DAG.getBitcast(VT, Ops[0]);
19858 V2 = DAG.getBitcast(VT, Ops[1]);
19859 assert(NumElements == (int)Mask.size() &&
19860        "canonicalizeShuffleMaskWithHorizOp "
19861        "shouldn't alter the shuffle mask size");
19862
19863 // Commute the shuffle if it will improve canonicalization.
19864 if (canonicalizeShuffleMaskWithCommute(Mask)) {
19865 ShuffleVectorSDNode::commuteMask(Mask);
19866 std::swap(V1, V2);
19867 }
19868
19869 // For each vector width, delegate to a specialized lowering routine.
19870 if (VT.is128BitVector())
19871 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19872
19873 if (VT.is256BitVector())
19874 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19875
19876 if (VT.is512BitVector())
19877 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19878
19879 if (Is1BitVector)
19880 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19881
19882 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19882)
;
19883}
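// --- Illustrative sketch (hypothetical helper, single-input form) ---
// The element-widening step attempted in lowerVECTOR_SHUFFLE above: adjacent
// mask elements {2*k, 2*k+1} collapse into one wide element k, and a pair
// with one undef adopts its defined half. The real canWidenShuffleElements
// additionally folds zeroable elements; that part is omitted here.
#include <cassert>
#include <vector>

static bool widenUnaryMaskBy2(const std::vector<int> &Mask,
                              std::vector<int> &Widened) {
  assert(Mask.size() % 2 == 0 && "expected an even-sized mask");
  Widened.clear();
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Widened.push_back(-1);                   // fully undef pair
      continue;
    }
    if (Lo >= 0 && Lo % 2 != 0)
      return false;                            // low half is misaligned
    if (Hi >= 0 && Hi % 2 != 1)
      return false;                            // high half is misaligned
    if (Lo >= 0 && Hi >= 0 && Hi != Lo + 1)
      return false;                            // pair splits a wide element
    Widened.push_back((Lo >= 0 ? Lo : Hi) / 2);
  }
  return true;
}
// e.g. the v8i32 mask {2,3, 6,7, -1,-1, 0,1} widens to the v4i64 mask {1, 3, -1, 0}.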
19884
19885/// Try to lower a VSELECT instruction to a vector shuffle.
19886static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19887 const X86Subtarget &Subtarget,
19888 SelectionDAG &DAG) {
19889 SDValue Cond = Op.getOperand(0);
19890 SDValue LHS = Op.getOperand(1);
19891 SDValue RHS = Op.getOperand(2);
19892 MVT VT = Op.getSimpleValueType();
19893
19894 // Only non-legal VSELECTs reach this lowering, convert those into generic
19895 // shuffles and re-use the shuffle lowering path for blends.
19896 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19897 SmallVector<int, 32> Mask;
19898 if (createShuffleMaskFromVSELECT(Mask, Cond))
19899 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19900 }
19901
19902 return SDValue();
19903}
19904
19905SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19906 SDValue Cond = Op.getOperand(0);
19907 SDValue LHS = Op.getOperand(1);
19908 SDValue RHS = Op.getOperand(2);
19909
19910 SDLoc dl(Op);
19911 MVT VT = Op.getSimpleValueType();
19912 if (isSoftFP16(VT)) {
19913 MVT NVT = VT.changeVectorElementTypeToInteger();
19914 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
19915 DAG.getBitcast(NVT, LHS),
19916 DAG.getBitcast(NVT, RHS)));
19917 }
19918
19919 // A vselect where all conditions and data are constants can be optimized into
19920 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19921 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19922 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19923 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19924 return SDValue();
19925
19926 // Try to lower this to a blend-style vector shuffle. This can handle all
19927 // constant condition cases.
19928 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19929 return BlendOp;
19930
19931 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19932 // with patterns on the mask registers on AVX-512.
19933 MVT CondVT = Cond.getSimpleValueType();
19934 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19935 if (CondEltSize == 1)
19936 return Op;
19937
19938 // Variable blends are only legal from SSE4.1 onward.
19939 if (!Subtarget.hasSSE41())
19940 return SDValue();
19941
19942 unsigned EltSize = VT.getScalarSizeInBits();
19943 unsigned NumElts = VT.getVectorNumElements();
19944
19945 // Expand v32i16/v64i8 without BWI.
19946 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19947 return SDValue();
19948
19949 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19950 // into an i1 condition so that we can use the mask-based 512-bit blend
19951 // instructions.
19952 if (VT.getSizeInBits() == 512) {
19953 // Build a mask by testing the condition against zero.
19954 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19955 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19956 DAG.getConstant(0, dl, CondVT),
19957 ISD::SETNE);
19958 // Now return a new VSELECT using the mask.
19959 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19960 }
19961
19962 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19963 if (CondEltSize != EltSize) {
19964 // If we don't have a sign splat, rely on the expansion.
19965 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19966 return SDValue();
19967
19968 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19969 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19970 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19971 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19972 }
19973
19974 // Only some types will be legal on some subtargets. If we can emit a legal
19975 // VSELECT-matching blend, return Op, but if we need to expand, return
19976 // a null value.
19977 switch (VT.SimpleTy) {
19978 default:
19979 // Most of the vector types have blends past SSE4.1.
19980 return Op;
19981
19982 case MVT::v32i8:
19983 // The byte blends for AVX vectors were introduced only in AVX2.
19984 if (Subtarget.hasAVX2())
19985 return Op;
19986
19987 return SDValue();
19988
19989 case MVT::v8i16:
19990 case MVT::v16i16: {
19991 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19992 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19993 Cond = DAG.getBitcast(CastVT, Cond);
19994 LHS = DAG.getBitcast(CastVT, LHS);
19995 RHS = DAG.getBitcast(CastVT, RHS);
19996 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19997 return DAG.getBitcast(VT, Select);
19998 }
19999 }
20000}
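// --- Illustrative sketch (hypothetical helper) ---
// How a constant VSELECT condition becomes the blend-style shuffle mask used
// by the lowering path above: lane i reads LHS when the condition is true
// (mask element i) and RHS otherwise (mask element i + NumElts). The real
// createShuffleMaskFromVSELECT also handles undef condition elements.
#include <vector>

static std::vector<int> blendMaskFromConstantCond(const std::vector<bool> &Cond) {
  int NumElts = (int)Cond.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;
  return Mask;
}
// e.g. a v4i32 condition {true, false, false, true} yields {0, 5, 6, 3}, which
// the shuffle lowering can match as a single immediate blend.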
20001
20002static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20003 MVT VT = Op.getSimpleValueType();
20004 SDValue Vec = Op.getOperand(0);
20005 SDValue Idx = Op.getOperand(1);
20006 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20007 SDLoc dl(Op);
20008
20009 if (!Vec.getSimpleValueType().is128BitVector())
20010 return SDValue();
20011
20012 if (VT.getSizeInBits() == 8) {
20013 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20014 // we're going to zero extend the register or fold the store.
20015 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20016 !X86::mayFoldIntoStore(Op))
20017 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20018 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20019 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20020
20021 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20022 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20023 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20024 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20025 }
20026
20027 if (VT == MVT::f32) {
20028 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20029 // the result back to FR32 register. It's only worth matching if the
20030 // result has a single use which is a store or a bitcast to i32. And in
20031 // the case of a store, it's not worth it if the index is a constant 0,
20032 // because a MOVSSmr can be used instead, which is smaller and faster.
20033 if (!Op.hasOneUse())
20034 return SDValue();
20035 SDNode *User = *Op.getNode()->use_begin();
20036 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20037 (User->getOpcode() != ISD::BITCAST ||
20038 User->getValueType(0) != MVT::i32))
20039 return SDValue();
20040 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20041 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20042 return DAG.getBitcast(MVT::f32, Extract);
20043 }
20044
20045 if (VT == MVT::i32 || VT == MVT::i64)
20046 return Op;
20047
20048 return SDValue();
20049}
20050
20051/// Extract one bit from mask vector, like v16i1 or v8i1.
20052/// AVX-512 feature.
20053static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20054 const X86Subtarget &Subtarget) {
20055 SDValue Vec = Op.getOperand(0);
20056 SDLoc dl(Vec);
20057 MVT VecVT = Vec.getSimpleValueType();
20058 SDValue Idx = Op.getOperand(1);
20059 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20060 MVT EltVT = Op.getSimpleValueType();
20061
20062 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20063        "Unexpected vector type in ExtractBitFromMaskVector");
20064
20065 // A variable index can't be handled in mask registers,
20066 // so extend the vector to VR512/VR128.
20067 if (!IdxC) {
20068 unsigned NumElts = VecVT.getVectorNumElements();
20069 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20070 // than extending to 128/256-bit.
20071 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20072 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20073 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20074 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20075 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20076 }
20077
20078 unsigned IdxVal = IdxC->getZExtValue();
20079 if (IdxVal == 0) // the operation is legal
20080 return Op;
20081
20082 // Extend to natively supported kshift.
20083 unsigned NumElems = VecVT.getVectorNumElements();
20084 MVT WideVecVT = VecVT;
20085 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20086 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20087 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20088 DAG.getUNDEF(WideVecVT), Vec,
20089 DAG.getIntPtrConstant(0, dl));
20090 }
20091
20092 // Use kshiftr instruction to move to the lower element.
20093 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20094 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20095
20096 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20097 DAG.getIntPtrConstant(0, dl));
20098}
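// --- Illustrative analogy (not the DAG code above) ---
// The KSHIFTR + extract-element-0 sequence in ExtractBitFromMaskVector is the
// vector-mask form of reading one bit of a scalar: shift the register down by
// the index and take the lowest bit.
#include <cstdint>

static bool extractMaskBit(uint16_t KReg, unsigned IdxVal) {
  return ((KReg >> IdxVal) & 1u) != 0;         // KSHIFTR, then read lane 0
}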
20099
20100SDValue
20101X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20102 SelectionDAG &DAG) const {
20103 SDLoc dl(Op);
20104 SDValue Vec = Op.getOperand(0);
20105 MVT VecVT = Vec.getSimpleValueType();
20106 SDValue Idx = Op.getOperand(1);
20107 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20108
20109 if (VecVT.getVectorElementType() == MVT::i1)
20110 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20111
20112 if (!IdxC) {
20113 // It's more profitable to go through memory (1 cycle throughput)
20114 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20115 // The IACA tool was used to get the performance estimate
20116 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20117 //
20118 // example : extractelement <16 x i8> %a, i32 %i
20119 //
20120 // Block Throughput: 3.00 Cycles
20121 // Throughput Bottleneck: Port5
20122 //
20123 // | Num Of | Ports pressure in cycles | |
20124 // | Uops | 0 - DV | 5 | 6 | 7 | |
20125 // ---------------------------------------------
20126 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20127 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20128 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20129 // Total Num Of Uops: 4
20130 //
20131 //
20132 // Block Throughput: 1.00 Cycles
20133 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20134 //
20135 // | | Ports pressure in cycles | |
20136 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20137 // ---------------------------------------------------------
20138 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20139 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20140 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20141 // Total Num Of Uops: 4
20142
20143 return SDValue();
20144 }
20145
20146 unsigned IdxVal = IdxC->getZExtValue();
20147
20148 // If this is a 256-bit or 512-bit vector result, first extract the
20149 // containing 128-bit vector and then extract the element from it.
20150 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20151 // Get the 128-bit vector.
20152 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20153 MVT EltVT = VecVT.getVectorElementType();
20154
20155 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20156 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20157
20158 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20159 // this can be done with a mask.
20160 IdxVal &= ElemsPerChunk - 1;
20161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20162 DAG.getIntPtrConstant(IdxVal, dl));
20163 }
20164
20165 assert(VecVT.is128BitVector() && "Unexpected vector length");
20166
20167 MVT VT = Op.getSimpleValueType();
20168
20169 if (VT == MVT::i16) {
20170 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20171 // we're going to zero extend the register or fold the store (SSE41 only).
20172 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20173 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20174 if (Subtarget.hasFP16())
20175 return Op;
20176
20177 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20178 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20179 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20180 }
20181
20182 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20183 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20184 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20185 }
20186
20187 if (Subtarget.hasSSE41())
20188 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20189 return Res;
20190
20191 // TODO: We only extract a single element from v16i8, so we can probably
20192 // afford to be more aggressive here before falling back to the default
20193 // approach of spilling to the stack.
20194 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20195 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20196 int DWordIdx = IdxVal / 4;
20197 if (DWordIdx == 0) {
20198 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20199 DAG.getBitcast(MVT::v4i32, Vec),
20200 DAG.getIntPtrConstant(DWordIdx, dl));
20201 int ShiftVal = (IdxVal % 4) * 8;
20202 if (ShiftVal != 0)
20203 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20204 DAG.getConstant(ShiftVal, dl, MVT::i8));
20205 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20206 }
20207
20208 int WordIdx = IdxVal / 2;
20209 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20210 DAG.getBitcast(MVT::v8i16, Vec),
20211 DAG.getIntPtrConstant(WordIdx, dl));
20212 int ShiftVal = (IdxVal % 2) * 8;
20213 if (ShiftVal != 0)
20214 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20215 DAG.getConstant(ShiftVal, dl, MVT::i8));
20216 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20217 }
20218
20219 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20220 if (IdxVal == 0)
20221 return Op;
20222
20223 // Shuffle the element to the lowest element, then movss or movsh.
20224 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20225 Mask[0] = static_cast<int>(IdxVal);
20226 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20227 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20228 DAG.getIntPtrConstant(0, dl));
20229 }
20230
20231 if (VT.getSizeInBits() == 64) {
20232 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20233 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20234 // to match extract_elt for f64.
20235 if (IdxVal == 0)
20236 return Op;
20237
20238 // UNPCKHPD the element to the lowest double word, then movsd.
20239 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
20240 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20241 int Mask[2] = { 1, -1 };
20242 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20243 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20244 DAG.getIntPtrConstant(0, dl));
20245 }
20246
20247 return SDValue();
20248}
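
The v16i8 path above avoids a stack spill by extracting a whole i32 (or i16)
lane and then shifting the wanted byte down. Below is a standalone sketch of
that scalar arithmetic (a hypothetical helper, not LLVM code); it also shows
why the earlier "IdxVal &= ElemsPerChunk - 1" is a valid replacement for a
modulo by a power of two:

#include <cassert>
#include <cstdint>

// Extract byte 'Idx' (0-15) of a 128-bit value that is only readable in
// 32-bit lanes: pick the containing dword, shift the byte down, truncate.
static uint8_t extractByteFromDWords(const uint32_t Lanes[4], unsigned Idx) {
  assert(Idx < 16 && "index out of range");
  unsigned DWordIdx = Idx / 4;        // which 32-bit lane holds the byte
  unsigned ShiftVal = (Idx % 4) * 8;  // bit offset of the byte inside that lane
  // Since 4 is a power of two, 'Idx % 4' equals 'Idx & 3', mirroring the
  // "IdxVal &= ElemsPerChunk - 1" masking used in the lowering above.
  return static_cast<uint8_t>(Lanes[DWordIdx] >> ShiftVal);
}

The lane numbering assumed here is little-endian, matching the x86 layout the
lowering relies on.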
20249
20250/// Insert one bit into a mask vector, like v16i1 or v8i1.
20251/// AVX-512 feature.
20252static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20253 const X86Subtarget &Subtarget) {
20254 SDLoc dl(Op);
20255 SDValue Vec = Op.getOperand(0);
20256 SDValue Elt = Op.getOperand(1);
20257 SDValue Idx = Op.getOperand(2);
20258 MVT VecVT = Vec.getSimpleValueType();
20259
20260 if (!isa<ConstantSDNode>(Idx)) {
20261 // Non constant index. Extend source and destination,
20262 // insert element and then truncate the result.
20263 unsigned NumElts = VecVT.getVectorNumElements();
20264 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20265 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20266 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20267 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20268 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20269 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20270 }
20271
20272 // Copy into a k-register, extract to v1i1 and insert_subvector.
20273 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20274 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20275}
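
On an integer view of a k-mask, inserting a single bit at a constant index is
a clear-then-set. The scalar sketch below (a hypothetical helper, not LLVM
code) mirrors what copying the bit into a k-register and inserting the v1i1
subvector accomplishes for a v16i1 value:

#include <cstdint>

// Set or clear bit 'Idx' (0-15) of a 16-lane predicate mask, the scalar
// analogue of inserting one element into a v16i1 mask vector.
static uint16_t insertMaskBit(uint16_t Mask, unsigned Idx, bool Bit) {
  uint16_t Cleared = Mask & static_cast<uint16_t>(~(1u << Idx));
  return Cleared | static_cast<uint16_t>(static_cast<uint16_t>(Bit) << Idx);
}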
20276
20277SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20278 SelectionDAG &DAG) const {
20279 MVT VT = Op.getSimpleValueType();
20280 MVT EltVT = VT.getVectorElementType();
20281 unsigned NumElts = VT.getVectorNumElements();
20282 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20283
20284 if (EltVT == MVT::i1)
20285 return InsertBitToMaskVector(Op, DAG, Subtarget);
20286
20287 SDLoc dl(Op);
20288 SDValue N0 = Op.getOperand(0);
20289 SDValue N1 = Op.getOperand(1);
20290 SDValue N2 = Op.getOperand(2);
20291 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20292
20293 if (!N2C) {
20294 // For variable insertion indices we're usually better off spilling to the
20295 // stack, but AVX512 can use a variable compare+select by comparing against
20296 // all possible vector indices, and FP insertion has less gpr->simd traffic.
20297 if (!(Subtarget.hasBWI() ||
20298 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20299 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20300 return SDValue();
20301
20302 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20303 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20304 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20305 return SDValue();
20306
20307 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20308 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20309 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20310
20311 SmallVector<SDValue, 16> RawIndices;
20312 for (unsigned I = 0; I != NumElts; ++I)
20313 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20314 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20315
20316 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20317 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20318 ISD::CondCode::SETEQ);
20319 }
20320
20321 if (N2C->getAPIntValue().uge(NumElts))
20322 return SDValue();
20323 uint64_t IdxVal = N2C->getZExtValue();
20324
20325 bool IsZeroElt = X86::isZeroNode(N1);
20326 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20327
20328 if (IsZeroElt || IsAllOnesElt) {
20329 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20330 // We don't deal with i8 0 since it appears to be handled elsewhere.
20331 if (IsAllOnesElt &&
20332 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20333 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20334 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20335 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20336 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20337 CstVectorElts[IdxVal] = OnesCst;
20338 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20339 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20340 }
20341 // See if we can do this more efficiently with a blend shuffle with a
20342 // rematerializable vector.
20343 if (Subtarget.hasSSE41() &&
20344 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20345 SmallVector<int, 8> BlendMask;
20346 for (unsigned i = 0; i != NumElts; ++i)
20347 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20348 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20349 : getOnesVector(VT, DAG, dl);
20350 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20351 }
20352 }
20353
20354 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20355 // into that, and then insert the subvector back into the result.
20356 if (VT.is256BitVector() || VT.is512BitVector()) {
20357 // With a 256-bit vector, we can insert into the zero element efficiently
20358 // using a blend if we have AVX or AVX2 and the right data type.
20359 if (VT.is256BitVector() && IdxVal == 0) {
20360 // TODO: It is worthwhile to cast integer to floating point and back
20361 // and incur a domain crossing penalty if that's what we'll end up
20362 // doing anyway after extracting to a 128-bit vector.
20363 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20364 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20365 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20366 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20367 DAG.getTargetConstant(1, dl, MVT::i8));
20368 }
20369 }
20370
20371 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20372 assert(isPowerOf2_32(NumEltsIn128) &&
20373 "Vectors will always have power-of-two number of elements.");
20374
20375 // If we are not inserting into the low 128-bit vector chunk,
20376 // then prefer the broadcast+blend sequence.
20377 // FIXME: relax the profitability check iff all N1 uses are insertions.
20378 if (IdxVal >= NumEltsIn128 &&
20379 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20380 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20381 X86::mayFoldLoad(N1, Subtarget)))) {
20382 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20383 SmallVector<int, 8> BlendMask;
20384 for (unsigned i = 0; i != NumElts; ++i)
20385 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20386 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20387 }
20388
20389 // Get the desired 128-bit vector chunk.
20390 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20391
20392 // Insert the element into the desired chunk.
20393 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20394 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20395
20396 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20397 DAG.getIntPtrConstant(IdxIn128, dl));
20398
20399 // Insert the changed part back into the bigger vector
20400 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20401 }
20402 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20403
20404 // This will be just movw/movd/movq/movsh/movss/movsd.
20405 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20406 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20407 EltVT == MVT::f16 || EltVT == MVT::i64) {
20408 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20409 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20410 }
20411
20412 // We can't directly insert an i8 or i16 into a vector, so zero extend
20413 // it to i32 first.
20414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20415 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20417 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20418 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20419 return DAG.getBitcast(VT, N1);
20420 }
20421 }
20422
20423 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20424 // argument. SSE41 is required for pinsrb.
20425 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20426 unsigned Opc;
20427 if (VT == MVT::v8i16) {
20428 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20429 Opc = X86ISD::PINSRW;
20430 } else {
20431 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20432 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20433 Opc = X86ISD::PINSRB;
20434 }
20435
20436 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20437 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20438 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20439 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20440 }
20441
20442 if (Subtarget.hasSSE41()) {
20443 if (EltVT == MVT::f32) {
20444 // Bits [7:6] of the constant are the source select. This will always be
20445 // zero here. The DAG Combiner may combine an extract_elt index into
20446 // these bits. For example (insert (extract, 3), 2) could be matched by
20447 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20448 // Bits [5:4] of the constant are the destination select. This is the
20449 // value of the incoming immediate.
20450 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20451 // combine either bitwise AND or insert of float 0.0 to set these bits.
20452
20453 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20454 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20455 // If this is an insertion of 32-bits into the low 32-bits of
20456 // a vector, we prefer to generate a blend with immediate rather
20457 // than an insertps. Blends are simpler operations in hardware and so
20458 // will always have equal or better performance than insertps.
20459 // But if optimizing for size and there's a load folding opportunity,
20460 // generate insertps because blendps does not have a 32-bit memory
20461 // operand form.
20462 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20463 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20464 DAG.getTargetConstant(1, dl, MVT::i8));
20465 }
20466 // Create this as a scalar-to-vector.
20467 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20468 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20469 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20470 }
20471
20472 // PINSR* works with constant index.
20473 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20474 return Op;
20475 }
20476
20477 return SDValue();
20478}
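
For a variable insertion index the code above builds
"select (splat(Idx) == {0,1,2,...}) ? splat(Elt) : Vec". A scalar model of the
same idea (a hypothetical helper, not LLVM code):

#include <cstddef>

// Lane-wise compare+select: every lane keeps its old value except the one
// whose position equals Idx, which takes the new element.
template <typename T, size_t N>
static void insertViaSelect(T (&Vec)[N], size_t Idx, T Elt) {
  for (size_t I = 0; I != N; ++I)
    Vec[I] = (I == Idx) ? Elt : Vec[I];
}

For example, insertViaSelect(V, i, x) on a float V[8] models the AVX512 path
taken for an 8 x float vector with a non-constant index i.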
20479
20480static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20481 SelectionDAG &DAG) {
20482 SDLoc dl(Op);
20483 MVT OpVT = Op.getSimpleValueType();
20484
20485 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
20486 // further combines.
20487 if (X86::isZeroNode(Op.getOperand(0)))
20488 return getZeroVector(OpVT, Subtarget, DAG, dl);
20489
20490 // If this is a wider-than-128-bit vector result, first insert into a
20491 // 128-bit vector and then insert that into the full-width vector.
20492 if (!OpVT.is128BitVector()) {
20493 // Insert into a 128-bit vector.
20494 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20495 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20496 OpVT.getVectorNumElements() / SizeFactor);
20497
20498 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20499
20500 // Insert the 128-bit vector.
20501 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20502 }
20503 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20504 "Expected an SSE type!");
20505
20506 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20507 // tblgen.
20508 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20509 return Op;
20510
20511 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20512 return DAG.getBitcast(
20513 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20514}
20515
20516// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20517// simple superregister reference or explicit instructions to insert
20518// the upper bits of a vector.
20519static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20520 SelectionDAG &DAG) {
20521 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20522
20523 return insert1BitVector(Op, DAG, Subtarget);
20524}
20525
20526static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20527 SelectionDAG &DAG) {
20528 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20529 "Only vXi1 extract_subvectors need custom lowering");
20530
20531 SDLoc dl(Op);
20532 SDValue Vec = Op.getOperand(0);
20533 uint64_t IdxVal = Op.getConstantOperandVal(1);
20534
20535 if (IdxVal == 0) // the operation is legal
20536 return Op;
20537
20538 MVT VecVT = Vec.getSimpleValueType();
20539 unsigned NumElems = VecVT.getVectorNumElements();
20540
20541 // Extend to natively supported kshift.
20542 MVT WideVecVT = VecVT;
20543 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20544 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20545 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20546 DAG.getUNDEF(WideVecVT), Vec,
20547 DAG.getIntPtrConstant(0, dl));
20548 }
20549
20550 // Shift to the LSB.
20551 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20552 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20553
20554 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20555 DAG.getIntPtrConstant(0, dl));
20556}
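
Viewed as an integer mask, the KSHIFTR plus low-subvector extraction above is
a right shift followed by keeping the low bits. A scalar sketch (a
hypothetical helper, not LLVM code):

#include <cstdint>

// Extract 'Width' consecutive mask bits starting at bit 'Idx' of a 16-lane
// predicate mask: shift the wanted bits down, then keep only the low bits.
static uint16_t extractSubMask(uint16_t Mask, unsigned Idx, unsigned Width) {
  uint16_t LowBits = static_cast<uint16_t>((1u << Width) - 1);
  return static_cast<uint16_t>((Mask >> Idx) & LowBits);
}

For instance, extractSubMask(M, 8, 4) corresponds to extracting the v4i1
subvector that starts at element 8 of a v16i1 mask.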
20557
20558// Returns the appropriate wrapper opcode for a global reference.
20559unsigned X86TargetLowering::getGlobalWrapperKind(
20560 const GlobalValue *GV, const unsigned char OpFlags) const {
20561 // References to absolute symbols are never PC-relative.
20562 if (GV && GV->isAbsoluteSymbolRef())
20563 return X86ISD::Wrapper;
20564
20565 CodeModel::Model M = getTargetMachine().getCodeModel();
20566 if (Subtarget.isPICStyleRIPRel() &&
20567 (M == CodeModel::Small || M == CodeModel::Kernel))
20568 return X86ISD::WrapperRIP;
20569
20570 // In the medium model, functions can always be referenced RIP-relatively,
20571 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20572 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20573 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20574 return X86ISD::WrapperRIP;
20575
20576 // GOTPCREL references must always use RIP.
20577 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20578 return X86ISD::WrapperRIP;
20579
20580 return X86ISD::Wrapper;
20581}
20582
20583// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20584// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
20585// one of the above-mentioned nodes. It has to be wrapped because otherwise
20586// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20587// be used to form an addressing mode. These wrapped nodes will be selected
20588// into MOV32ri.
20589SDValue
20590X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20591 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20592
20593 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20594 // global base reg.
20595 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20596
20597 auto PtrVT = getPointerTy(DAG.getDataLayout());
20598 SDValue Result = DAG.getTargetConstantPool(
20599 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20600 SDLoc DL(CP);
20601 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20602 // With PIC, the address is actually $g + Offset.
20603 if (OpFlag) {
20604 Result =
20605 DAG.getNode(ISD::ADD, DL, PtrVT,
20606 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20607 }
20608
20609 return Result;
20610}
20611
20612SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20613 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20614
20615 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20616 // global base reg.
20617 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20618
20619 auto PtrVT = getPointerTy(DAG.getDataLayout());
20620 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20621 SDLoc DL(JT);
20622 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20623
20624 // With PIC, the address is actually $g + Offset.
20625 if (OpFlag)
20626 Result =
20627 DAG.getNode(ISD::ADD, DL, PtrVT,
20628 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20629
20630 return Result;
20631}
20632
20633SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20634 SelectionDAG &DAG) const {
20635 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20636}
20637
20638SDValue
20639X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20640 // Create the TargetBlockAddressAddress node.
20641 unsigned char OpFlags =
20642 Subtarget.classifyBlockAddressReference();
20643 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20644 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20645 SDLoc dl(Op);
20646 auto PtrVT = getPointerTy(DAG.getDataLayout());
20647 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20648 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20649
20650 // With PIC, the address is actually $g + Offset.
20651 if (isGlobalRelativeToPICBase(OpFlags)) {
20652 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20653 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20654 }
20655
20656 return Result;
20657}
20658
20659/// Creates target global address or external symbol nodes for calls or
20660/// other uses.
20661SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20662 bool ForCall) const {
20663 // Unpack the global address or external symbol.
20664 const SDLoc &dl = SDLoc(Op);
20665 const GlobalValue *GV = nullptr;
20666 int64_t Offset = 0;
20667 const char *ExternalSym = nullptr;
20668 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20669 GV = G->getGlobal();
20670 Offset = G->getOffset();
20671 } else {
20672 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20673 ExternalSym = ES->getSymbol();
20674 }
20675
20676 // Calculate some flags for address lowering.
20677 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20678 unsigned char OpFlags;
20679 if (ForCall)
20680 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20681 else
20682 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20683 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20684 bool NeedsLoad = isGlobalStubReference(OpFlags);
20685
20686 CodeModel::Model M = DAG.getTarget().getCodeModel();
20687 auto PtrVT = getPointerTy(DAG.getDataLayout());
20688 SDValue Result;
20689
20690 if (GV) {
20691 // Create a target global address if this is a global. If possible, fold the
20692 // offset into the global address reference. Otherwise, ADD it on later.
20693 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20694 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20695 // relocation will compute to a negative value, which is invalid.
20696 int64_t GlobalOffset = 0;
20697 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20698 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20699 std::swap(GlobalOffset, Offset);
20700 }
20701 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20702 } else {
20703 // If this is not a global address, this must be an external symbol.
20704 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20705 }
20706
20707 // If this is a direct call, avoid the wrapper if we don't need to do any
20708 // loads or adds. This allows SDAG ISel to match direct calls.
20709 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20710 return Result;
20711
20712 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20713
20714 // With PIC, the address is actually $g + Offset.
20715 if (HasPICReg) {
20716 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20717 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20718 }
20719
20720 // For globals that require a load from a stub to get the address, emit the
20721 // load.
20722 if (NeedsLoad)
20723 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20724 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20725
20726 // If there was a non-zero offset that we didn't fold, create an explicit
20727 // addition for it.
20728 if (Offset != 0)
20729 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20730 DAG.getConstant(Offset, dl, PtrVT));
20731
20732 return Result;
20733}
20734
20735SDValue
20736X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20737 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20738}
20739
20740static SDValue
20741GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20742 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20743 unsigned char OperandFlags, bool LocalDynamic = false) {
20744 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20745 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20746 SDLoc dl(GA);
20747 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20748 GA->getValueType(0),
20749 GA->getOffset(),
20750 OperandFlags);
20751
20752 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20753 : X86ISD::TLSADDR;
20754
20755 if (InFlag) {
20756 SDValue Ops[] = { Chain, TGA, *InFlag };
20757 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20758 } else {
20759 SDValue Ops[] = { Chain, TGA };
20760 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20761 }
20762
20763 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
20764 MFI.setAdjustsStack(true);
20765 MFI.setHasCalls(true);
20766
20767 SDValue Flag = Chain.getValue(1);
20768 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20769}
20770
20771// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20772static SDValue
20773LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20774 const EVT PtrVT) {
20775 SDValue InFlag;
20776 SDLoc dl(GA); // ? function entry point might be better
20777 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20778 DAG.getNode(X86ISD::GlobalBaseReg,
20779 SDLoc(), PtrVT), InFlag);
20780 InFlag = Chain.getValue(1);
20781
20782 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20783}
20784
20785// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20786static SDValue
20787LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20788 const EVT PtrVT) {
20789 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20790 X86::RAX, X86II::MO_TLSGD);
20791}
20792
20793// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20794static SDValue
20795LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20796 const EVT PtrVT) {
20797 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20798 X86::EAX, X86II::MO_TLSGD);
20799}
20800
20801static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20802 SelectionDAG &DAG, const EVT PtrVT,
20803 bool Is64Bit, bool Is64BitLP64) {
20804 SDLoc dl(GA);
20805
20806 // Get the start address of the TLS block for this module.
20807 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20808 .getInfo<X86MachineFunctionInfo>();
20809 MFI->incNumLocalDynamicTLSAccesses();
20810
20811 SDValue Base;
20812 if (Is64Bit) {
20813 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20814 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20815 X86II::MO_TLSLD, /*LocalDynamic=*/true);
20816 } else {
20817 SDValue InFlag;
20818 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20819 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20820 InFlag = Chain.getValue(1);
20821 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20822 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20823 }
20824
20825 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20826 // of Base.
20827
20828 // Build x@dtpoff.
20829 unsigned char OperandFlags = X86II::MO_DTPOFF;
20830 unsigned WrapperKind = X86ISD::Wrapper;
20831 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20832 GA->getValueType(0),
20833 GA->getOffset(), OperandFlags);
20834 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20835
20836 // Add x@dtpoff with the base.
20837 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20838}
20839
20840// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20841static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20842 const EVT PtrVT, TLSModel::Model model,
20843 bool is64Bit, bool isPIC) {
20844 SDLoc dl(GA);
20845
20846 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20847 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20848 is64Bit ? 257 : 256));
20849
20850 SDValue ThreadPointer =
20851 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20852 MachinePointerInfo(Ptr));
20853
20854 unsigned char OperandFlags = 0;
20855 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20856 // initialexec.
20857 unsigned WrapperKind = X86ISD::Wrapper;
20858 if (model == TLSModel::LocalExec) {
20859 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20860 } else if (model == TLSModel::InitialExec) {
20861 if (is64Bit) {
20862 OperandFlags = X86II::MO_GOTTPOFF;
20863 WrapperKind = X86ISD::WrapperRIP;
20864 } else {
20865 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20866 }
20867 } else {
20868 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20868)
;
20869 }
20870
20871 // emit "addl x@ntpoff,%eax" (local exec)
20872 // or "addl x@indntpoff,%eax" (initial exec)
20873 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20874 SDValue TGA =
20875 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20876 GA->getOffset(), OperandFlags);
20877 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20878
20879 if (model == TLSModel::InitialExec) {
20880 if (isPIC && !is64Bit) {
20881 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20882 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20883 Offset);
20884 }
20885
20886 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20887 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20888 }
20889
20890 // The address of the thread local variable is the add of the thread
20891 // pointer with the offset of the variable.
20892 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20893}
20894
20895SDValue
20896X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20897
20898 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20899
20900 if (DAG.getTarget().useEmulatedTLS())
20901 return LowerToTLSEmulatedModel(GA, DAG);
20902
20903 const GlobalValue *GV = GA->getGlobal();
20904 auto PtrVT = getPointerTy(DAG.getDataLayout());
20905 bool PositionIndependent = isPositionIndependent();
20906
20907 if (Subtarget.isTargetELF()) {
20908 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20909 switch (model) {
20910 case TLSModel::GeneralDynamic:
20911 if (Subtarget.is64Bit()) {
20912 if (Subtarget.isTarget64BitLP64())
20913 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20914 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20915 }
20916 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20917 case TLSModel::LocalDynamic:
20918 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20919 Subtarget.isTarget64BitLP64());
20920 case TLSModel::InitialExec:
20921 case TLSModel::LocalExec:
20922 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20923 PositionIndependent);
20924 }
20925 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20925)
;
20926 }
20927
20928 if (Subtarget.isTargetDarwin()) {
20929 // Darwin only has one model of TLS. Lower to that.
20930 unsigned char OpFlag = 0;
20931 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
20932 X86ISD::WrapperRIP : X86ISD::Wrapper;
20933
20934 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20935 // global base reg.
20936 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20937 if (PIC32)
20938 OpFlag = X86II::MO_TLVP_PIC_BASE;
20939 else
20940 OpFlag = X86II::MO_TLVP;
20941 SDLoc DL(Op);
20942 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20943 GA->getValueType(0),
20944 GA->getOffset(), OpFlag);
20945 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20946
20947 // With PIC32, the address is actually $g + Offset.
20948 if (PIC32)
20949 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20950 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20951 Offset);
20952
20953 // Lowering the machine ISD will make sure everything is in the right
20954 // location.
20955 SDValue Chain = DAG.getEntryNode();
20956 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20957 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20958 SDValue Args[] = { Chain, Offset };
20959 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20960 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
20961
20962 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
20963 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20964 MFI.setAdjustsStack(true);
20965
20966 // And our return value (tls address) is in the standard call return value
20967 // location.
20968 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20969 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20970 }
20971
20972 if (Subtarget.isOSWindows()) {
20973 // Just use the implicit TLS architecture.
20974 // We need to generate something similar to:
20975 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20976 // ; from TEB
20977 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20978 // mov rcx, qword [rdx+rcx*8]
20979 // mov eax, .tls$:tlsvar
20980 // [rax+rcx] contains the address
20981 // Windows 64bit: gs:0x58
20982 // Windows 32bit: fs:__tls_array
20983
20984 SDLoc dl(GA);
20985 SDValue Chain = DAG.getEntryNode();
20986
20987 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20988 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20989 // use its literal value of 0x2C.
20990 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
20991 ? Type::getInt8PtrTy(*DAG.getContext(),
20992 256)
20993 : Type::getInt32PtrTy(*DAG.getContext(),
20994 257));
20995
20996 SDValue TlsArray = Subtarget.is64Bit()
20997 ? DAG.getIntPtrConstant(0x58, dl)
20998 : (Subtarget.isTargetWindowsGNU()
20999 ? DAG.getIntPtrConstant(0x2C, dl)
21000 : DAG.getExternalSymbol("_tls_array", PtrVT));
21001
21002 SDValue ThreadPointer =
21003 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21004
21005 SDValue res;
21006 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21007 res = ThreadPointer;
21008 } else {
21009 // Load the _tls_index variable
21010 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21011 if (Subtarget.is64Bit())
21012 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21013 MachinePointerInfo(), MVT::i32);
21014 else
21015 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21016
21017 const DataLayout &DL = DAG.getDataLayout();
21018 SDValue Scale =
21019 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21020 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21021
21022 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21023 }
21024
21025 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21026
21027 // Get the offset of start of .tls section
21028 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21029 GA->getValueType(0),
21030 GA->getOffset(), X86II::MO_SECREL);
21031 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21032
21033 // The address of the thread local variable is the add of the thread
21034 // pointer with the offset of the variable.
21035 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21036 }
21037
21038 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 21038)
;
21039}
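
For the Windows branch above, the address is the TEB's thread-local-storage
array slot for this module plus the variable's section-relative offset. A
standalone sketch of that arithmetic (a hypothetical helper, not LLVM or
Windows API code; the gs:[0x58] / fs:__tls_array locations come from the
comment above):

#include <cstdint>

// Scalar model of the implicit-TLS address computation:
//   slot = ThreadLocalStoragePointer[_tls_index]
//   addr = slot + var@SECREL (offset of the variable within the .tls section)
static std::uintptr_t winTlsAddress(const std::uintptr_t *TlsSlots,
                                    std::uint32_t TlsIndex,
                                    std::uintptr_t SecRelOffset) {
  return TlsSlots[TlsIndex] + SecRelOffset;
}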
21040
21041/// Lower SRA_PARTS and friends, which return two i32 values
21042/// and take a 2 x i32 value to shift plus a shift amount.
21043/// TODO: Can this be moved to general expansion code?
21044static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21045 SDValue Lo, Hi;
21046 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21047 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21048}
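
expandShiftParts splits a double-width shift into operations on the two
halves. As a reference for what SRA_PARTS computes, here is a scalar sketch of
a 64-bit arithmetic right shift held as two 32-bit parts (a hypothetical
helper, not the exact expansion LLVM emits; it assumes '>>' on a negative int
is an arithmetic shift, which holds on the targets discussed here):

#include <cstdint>

// Arithmetic right shift of the 64-bit value {Hi,Lo} by Amt (0..63),
// producing the two 32-bit result parts.
static void sra64Parts(uint32_t Lo, int32_t Hi, unsigned Amt,
                       uint32_t &OutLo, int32_t &OutHi) {
  if (Amt == 0) {                 // avoid shifting a 32-bit value by 32 below
    OutLo = Lo;
    OutHi = Hi;
  } else if (Amt < 32) {
    OutLo = (Lo >> Amt) | (static_cast<uint32_t>(Hi) << (32 - Amt));
    OutHi = Hi >> Amt;            // sign bits shift in from the top
  } else {
    OutLo = static_cast<uint32_t>(Hi >> (Amt - 32));
    OutHi = Hi >> 31;             // all copies of the sign bit
  }
}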
21049
21050// Try to use a packed vector operation to handle i64 on 32-bit targets when
21051// AVX512DQ is enabled.
21052static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21053 const X86Subtarget &Subtarget) {
21054 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21055 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21056 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21057 Op.getOpcode() == ISD::UINT_TO_FP) &&
21058 "Unexpected opcode!");
21059 bool IsStrict = Op->isStrictFPOpcode();
21060 unsigned OpNo = IsStrict ? 1 : 0;
21061 SDValue Src = Op.getOperand(OpNo);
21062 MVT SrcVT = Src.getSimpleValueType();
21063 MVT VT = Op.getSimpleValueType();
21064
21065 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21066 (VT != MVT::f32 && VT != MVT::f64))
21067 return SDValue();
21068
21069 // Pack the i64 into a vector, do the operation and extract.
21070
21071 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21072 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21073 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21074 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21075
21076 SDLoc dl(Op);
21077 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21078 if (IsStrict) {
21079 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21080 {Op.getOperand(0), InVec});
21081 SDValue Chain = CvtVec.getValue(1);
21082 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21083 DAG.getIntPtrConstant(0, dl));
21084 return DAG.getMergeValues({Value, Chain}, dl);
21085 }
21086
21087 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21088
21089 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21090 DAG.getIntPtrConstant(0, dl));
21091}
21092
21093// Try to use a packed vector operation to handle i64 on 32-bit targets.
21094static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21095 const X86Subtarget &Subtarget) {
21096 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21097 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21098 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21099 Op.getOpcode() == ISD::UINT_TO_FP) &&
21100 "Unexpected opcode!");
21101 bool IsStrict = Op->isStrictFPOpcode();
21102 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21103 MVT SrcVT = Src.getSimpleValueType();
21104 MVT VT = Op.getSimpleValueType();
21105
21106 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21107 return SDValue();
21108
21109 // Pack the i64 into a vector, do the operation and extract.
21110
21111 assert(Subtarget.hasFP16() && "Expected FP16");
21112
21113 SDLoc dl(Op);
21114 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21115 if (IsStrict) {
21116 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21117 {Op.getOperand(0), InVec});
21118 SDValue Chain = CvtVec.getValue(1);
21119 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21120 DAG.getIntPtrConstant(0, dl));
21121 return DAG.getMergeValues({Value, Chain}, dl);
21122 }
21123
21124 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21125
21126 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21127 DAG.getIntPtrConstant(0, dl));
21128}
21129
21130static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21131 const X86Subtarget &Subtarget) {
21132 switch (Opcode) {
21133 case ISD::SINT_TO_FP:
21134 // TODO: Handle wider types with AVX/AVX512.
21135 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21136 return false;
21137 // CVTDQ2PS or (V)CVTDQ2PD
21138 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21139
21140 case ISD::UINT_TO_FP:
21141 // TODO: Handle wider types and i64 elements.
21142 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21143 return false;
21144 // VCVTUDQ2PS or VCVTUDQ2PD
21145 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21146
21147 default:
21148 return false;
21149 }
21150}
21151
21152/// Given a scalar cast operation that is extracted from a vector, try to
21153/// vectorize the cast op followed by extraction. This will avoid an expensive
21154/// round-trip between XMM and GPR.
21155static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21156 const X86Subtarget &Subtarget) {
21157 // TODO: This could be enhanced to handle smaller integer types by peeking
21158 // through an extend.
21159 SDValue Extract = Cast.getOperand(0);
21160 MVT DestVT = Cast.getSimpleValueType();
21161 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21162 !isa<ConstantSDNode>(Extract.getOperand(1)))
21163 return SDValue();
21164
21165 // See if we have a 128-bit vector cast op for this type of cast.
21166 SDValue VecOp = Extract.getOperand(0);
21167 MVT FromVT = VecOp.getSimpleValueType();
21168 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21169 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21170 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21171 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21172 return SDValue();
21173
21174 // If we are extracting from a non-zero element, first shuffle the source
21175 // vector to allow extracting from element zero.
21176 SDLoc DL(Cast);
21177 if (!isNullConstant(Extract.getOperand(1))) {
21178 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21179 Mask[0] = Extract.getConstantOperandVal(1);
21180 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21181 }
21182 // If the source vector is wider than 128-bits, extract the low part. Do not
21183 // create an unnecessarily wide vector cast op.
21184 if (FromVT != Vec128VT)
21185 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21186
21187 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21188 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21189 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21190 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21191 DAG.getIntPtrConstant(0, DL));
21192}
21193
21194/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21195/// try to vectorize the cast ops. This will avoid an expensive round-trip
21196/// between XMM and GPR.
21197static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21198 const X86Subtarget &Subtarget) {
21199 // TODO: Allow FP_TO_UINT.
21200 SDValue CastToInt = CastToFP.getOperand(0);
21201 MVT VT = CastToFP.getSimpleValueType();
21202 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21203 return SDValue();
21204
21205 MVT IntVT = CastToInt.getSimpleValueType();
21206 SDValue X = CastToInt.getOperand(0);
21207 MVT SrcVT = X.getSimpleValueType();
21208 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21209 return SDValue();
21210
21211 // See if we have 128-bit vector cast instructions for this type of cast.
21212 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21213 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21214 IntVT != MVT::i32)
21215 return SDValue();
21216
21217 unsigned SrcSize = SrcVT.getSizeInBits();
21218 unsigned IntSize = IntVT.getSizeInBits();
21219 unsigned VTSize = VT.getSizeInBits();
21220 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21221 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21222 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21223
21224 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21225 unsigned ToIntOpcode =
21226 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21227 unsigned ToFPOpcode =
21228 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21229
21230 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21231 //
21232 // We are not defining the high elements (for example, by zeroing them) because
21233 // that could nullify any performance advantage that we hoped to gain from
21234 // this vector op hack. We do not expect any adverse effects (like denorm
21235 // penalties) with cast ops.
21236 SDLoc DL(CastToFP);
21237 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21238 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21239 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21240 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21242}
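
The float -> int -> float round trip that this lowering vectorizes can be
written directly with the SSE2 conversion intrinsics it targets (cvttps2dq and
cvtdq2ps). A minimal sketch that keeps the value in an XMM register the whole
time:

#include <emmintrin.h>  // SSE2: _mm_cvttps_epi32 / _mm_cvtepi32_ps

// (float)(int)X without bouncing through a GPR: truncating float->i32
// conversion followed by i32->float, all in lane 0 of an XMM register.
// Unlike the lowering above, _mm_set_ss zeroes the upper lanes; that is
// harmless here and keeps the example self-contained.
static float truncateViaVector(float X) {
  __m128 V = _mm_set_ss(X);         // scalar into lane 0
  __m128i I = _mm_cvttps_epi32(V);  // cvttps2dq: truncate to int32
  __m128 R = _mm_cvtepi32_ps(I);    // cvtdq2ps: back to float
  return _mm_cvtss_f32(R);          // read lane 0
}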
21243
21244static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21245 const X86Subtarget &Subtarget) {
21246 SDLoc DL(Op);
21247 bool IsStrict = Op->isStrictFPOpcode();
21248 MVT VT = Op->getSimpleValueType(0);
21249 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21250
21251 if (Subtarget.hasDQI()) {
21252 assert(!Subtarget.hasVLX() && "Unexpected features");
21253
21254 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21255         Src.getSimpleValueType() == MVT::v4i64) &&
21256        "Unsupported custom type");
21257
21258 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21259 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21260        "Unexpected VT!");
21261 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21262
21263 // Need to concat with zero vector for strict fp to avoid spurious
21264 // exceptions.
21265 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21266 : DAG.getUNDEF(MVT::v8i64);
21267 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21268 DAG.getIntPtrConstant(0, DL));
21269 SDValue Res, Chain;
21270 if (IsStrict) {
21271 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21272 {Op->getOperand(0), Src});
21273 Chain = Res.getValue(1);
21274 } else {
21275 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21276 }
21277
21278 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21279 DAG.getIntPtrConstant(0, DL));
21280
21281 if (IsStrict)
21282 return DAG.getMergeValues({Res, Chain}, DL);
21283 return Res;
21284 }
21285
21286 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21287 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21288 if (VT != MVT::v4f32 || IsSigned)
21289 return SDValue();
21290
21291 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21292 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21293 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21294 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21295 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21296 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21297 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21298 SmallVector<SDValue, 4> SignCvts(4);
21299 SmallVector<SDValue, 4> Chains(4);
21300 for (int i = 0; i != 4; ++i) {
21301 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21302 DAG.getIntPtrConstant(i, DL));
21303 if (IsStrict) {
21304 SignCvts[i] =
21305 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21306 {Op.getOperand(0), Elt});
21307 Chains[i] = SignCvts[i].getValue(1);
21308 } else {
21309 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21310 }
21311 }
21312 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21313
21314 SDValue Slow, Chain;
21315 if (IsStrict) {
21316 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21317 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21318 {Chain, SignCvt, SignCvt});
21319 Chain = Slow.getValue(1);
21320 } else {
21321 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21322 }
21323
21324 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21325 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21326
21327 if (IsStrict)
21328 return DAG.getMergeValues({Cvt, Chain}, DL);
21329
21330 return Cvt;
21331}
21332
21333static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21334 bool IsStrict = Op->isStrictFPOpcode();
21335 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21336 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21337 MVT VT = Op.getSimpleValueType();
21338 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21339 SDLoc dl(Op);
21340
21341 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21342 if (IsStrict)
21343 return DAG.getNode(
21344 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21345 {Chain,
21346 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21347 Rnd});
21348 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21349 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21350}
21351
21352static bool isLegalConversion(MVT VT, bool IsSigned,
21353 const X86Subtarget &Subtarget) {
21354 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21355 return true;
21356 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21357 return true;
21358 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21359 return true;
21360 if (Subtarget.useAVX512Regs()) {
21361 if (VT == MVT::v16i32)
21362 return true;
21363 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21364 return true;
21365 }
21366 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21367 (VT == MVT::v2i64 || VT == MVT::v4i64))
21368 return true;
21369 return false;
21370}
21371
21372SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21373 SelectionDAG &DAG) const {
21374 bool IsStrict = Op->isStrictFPOpcode();
21375 unsigned OpNo = IsStrict ? 1 : 0;
21376 SDValue Src = Op.getOperand(OpNo);
21377 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21378 MVT SrcVT = Src.getSimpleValueType();
21379 MVT VT = Op.getSimpleValueType();
21380 SDLoc dl(Op);
21381
21382 if (isSoftFP16(VT))
21383 return promoteXINT_TO_FP(Op, DAG);
21384 else if (isLegalConversion(SrcVT, true, Subtarget))
21385 return Op;
21386
21387 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21388 return LowerWin64_INT128_TO_FP(Op, DAG);
21389
21390 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21391 return Extract;
21392
21393 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21394 return R;
21395
21396 if (SrcVT.isVector()) {
21397 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21398 // Note: Since v2f64 is a legal type, we don't need to zero extend the
21399 // source for strict FP.
21400 if (IsStrict)
21401 return DAG.getNode(
21402 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21403 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21404 DAG.getUNDEF(SrcVT))});
21405 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21406 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21407 DAG.getUNDEF(SrcVT)));
21408 }
21409 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21410 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21411
21412 return SDValue();
21413 }
21414
21415 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21416        "Unknown SINT_TO_FP to lower!");
21417
21418 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21419
21420 // These are really Legal; return the operand so the caller accepts it as
21421 // Legal.
21422 if (SrcVT == MVT::i32 && UseSSEReg)
21423 return Op;
21424 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21425 return Op;
21426
21427 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21428 return V;
21429 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21430 return V;
21431
21432 // SSE doesn't have an i16 conversion so we need to promote.
21433 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21434 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21435 if (IsStrict)
21436 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21437 {Chain, Ext});
21438
21439 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21440 }
21441
21442 if (VT == MVT::f128 || !Subtarget.hasX87())
21443 return SDValue();
21444
21445 SDValue ValueToStore = Src;
21446 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21447 // Bitcasting to f64 here allows us to do a single 64-bit store from
21448 // an SSE register, avoiding the store forwarding penalty that would come
21449 // with two 32-bit stores.
21450 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21451
21452 unsigned Size = SrcVT.getStoreSize();
21453 Align Alignment(Size);
21454 MachineFunction &MF = DAG.getMachineFunction();
21455 auto PtrVT = getPointerTy(MF.getDataLayout());
21456 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21457 MachinePointerInfo MPI =
21458 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21459 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21460 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21461 std::pair<SDValue, SDValue> Tmp =
21462 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21463
21464 if (IsStrict)
21465 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21466
21467 return Tmp.first;
21468}
21469
21470std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21471 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21472 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21473 // Build the FILD
21474 SDVTList Tys;
21475 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21476 if (useSSE)
21477 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21478 else
21479 Tys = DAG.getVTList(DstVT, MVT::Other);
21480
21481 SDValue FILDOps[] = {Chain, Pointer};
21482 SDValue Result =
21483 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21484 Alignment, MachineMemOperand::MOLoad);
21485 Chain = Result.getValue(1);
21486
21487 if (useSSE) {
21488 MachineFunction &MF = DAG.getMachineFunction();
21489 unsigned SSFISize = DstVT.getStoreSize();
21490 int SSFI =
21491 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21492 auto PtrVT = getPointerTy(MF.getDataLayout());
21493 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21494 Tys = DAG.getVTList(MVT::Other);
21495 SDValue FSTOps[] = {Chain, Result, StackSlot};
21496 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21497 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21498 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21499
21500 Chain =
21501 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21502 Result = DAG.getLoad(
21503 DstVT, DL, Chain, StackSlot,
21504 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21505 Chain = Result.getValue(1);
21506 }
21507
21508 return { Result, Chain };
21509}
21510
21511/// Horizontal vector math instructions may be slower than normal math with
21512/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21513/// implementation, and likely shuffle complexity of the alternate sequence.
21514static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21515 const X86Subtarget &Subtarget) {
21516 bool IsOptimizingSize = DAG.shouldOptForSize();
21517 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21518 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21519}
21520
21521/// 64-bit unsigned integer to double expansion.
21522static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21523 const X86Subtarget &Subtarget) {
21524 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21525 // when converting 0 with rounding toward negative infinity. The caller will
21526 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
21527 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21528 // This algorithm is not obvious. Here is what we're trying to output:
21529 /*
21530 movq %rax, %xmm0
21531 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21532 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21533 #ifdef __SSE3__
21534 haddpd %xmm0, %xmm0
21535 #else
21536 pshufd $0x4e, %xmm0, %xmm1
21537 addpd %xmm1, %xmm0
21538 #endif
21539 */
21540
21541 SDLoc dl(Op);
21542 LLVMContext *Context = DAG.getContext();
21543
21544 // Build some magic constants.
21545 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21546 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21547 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21548 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21549
21550 SmallVector<Constant*,2> CV1;
21551 CV1.push_back(
21552 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21553 APInt(64, 0x4330000000000000ULL))));
21554 CV1.push_back(
21555 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21556 APInt(64, 0x4530000000000000ULL))));
21557 Constant *C1 = ConstantVector::get(CV1);
21558 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21559
21560 // Load the 64-bit value into an XMM register.
21561 SDValue XR1 =
21562 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21563 SDValue CLod0 = DAG.getLoad(
21564 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21565 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21566 SDValue Unpck1 =
21567 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21568
21569 SDValue CLod1 = DAG.getLoad(
21570 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21571 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21572 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21573 // TODO: Are there any fast-math-flags to propagate here?
21574 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21575 SDValue Result;
21576
21577 if (Subtarget.hasSSE3() &&
21578 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21579 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21580 } else {
21581 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21582 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21583 }
21584 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21585 DAG.getIntPtrConstant(0, dl));
21586 return Result;
21587}
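// Illustrative sketch (hypothetical scalar model, not part of the original
// source) of the magic-constant sequence above: reinterpreted as doubles,
// 0x43300000xxxxxxxx is 2^52 + lo32 and 0x45300000xxxxxxxx is
// 2^84 + hi32 * 2^32, so subtracting 2^52 and 2^84 and summing reproduces the
// u64 value with a single final rounding (subpd followed by haddpd/addpd).
#include <cstdint>
#include <cstring>

static double U64ToDoubleSketch(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL);
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52); // hi32 * 2^32 + lo32, rounded once
}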
21588
21589/// 32-bit unsigned integer to float expansion.
21590static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21591 const X86Subtarget &Subtarget) {
21592 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21593 SDLoc dl(Op);
21594 // FP constant to bias correct the final result.
21595 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
21596 MVT::f64);
21597
21598 // Load the 32-bit value into an XMM register.
21599 SDValue Load =
21600 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21601
21602 // Zero out the upper parts of the register.
21603 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21604
21605 // Or the load with the bias.
21606 SDValue Or = DAG.getNode(
21607 ISD::OR, dl, MVT::v2i64,
21608 DAG.getBitcast(MVT::v2i64, Load),
21609 DAG.getBitcast(MVT::v2i64,
21610 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21611 Or =
21612 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21613 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21614
21615 if (Op.getNode()->isStrictFPOpcode()) {
21616 // Subtract the bias.
21617 // TODO: Are there any fast-math-flags to propagate here?
21618 SDValue Chain = Op.getOperand(0);
21619 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21620 {Chain, Or, Bias});
21621
21622 if (Op.getValueType() == Sub.getValueType())
21623 return Sub;
21624
21625 // Handle final rounding.
21626 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21627 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21628
21629 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21630 }
21631
21632 // Subtract the bias.
21633 // TODO: Are there any fast-math-flags to propagate here?
21634 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21635
21636 // Handle final rounding.
21637 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21638}
21639
21640static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21641 const X86Subtarget &Subtarget,
21642 const SDLoc &DL) {
21643 if (Op.getSimpleValueType() != MVT::v2f64)
21644 return SDValue();
21645
21646 bool IsStrict = Op->isStrictFPOpcode();
21647
21648 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21649 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21650
21651 if (Subtarget.hasAVX512()) {
21652 if (!Subtarget.hasVLX()) {
21653 // Let generic type legalization widen this.
21654 if (!IsStrict)
21655 return SDValue();
21656 // Otherwise pad the integer input with 0s and widen the operation.
21657 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21658 DAG.getConstant(0, DL, MVT::v2i32));
21659 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21660 {Op.getOperand(0), N0});
21661 SDValue Chain = Res.getValue(1);
21662 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21663 DAG.getIntPtrConstant(0, DL));
21664 return DAG.getMergeValues({Res, Chain}, DL);
21665 }
21666
21667 // Legalize to v4i32 type.
21668 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21669 DAG.getUNDEF(MVT::v2i32));
21670 if (IsStrict)
21671 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21672 {Op.getOperand(0), N0});
21673 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21674 }
21675
21676 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21677 // This gives us the floating point equivalent of 2^52 + the i32 integer
21678 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21679 // point leaving just our i32 integers in double format.
21680 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21681 SDValue VBias =
21682 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
21683 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21684 DAG.getBitcast(MVT::v2i64, VBias));
21685 Or = DAG.getBitcast(MVT::v2f64, Or);
21686
21687 if (IsStrict)
21688 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21689 {Op.getOperand(0), Or, VBias});
21690 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21691}
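// Illustrative sketch (hypothetical, not part of the original source): the
// non-AVX512 path above at the intrinsics level. Zero-extending each u32 into
// the low mantissa bits of 2^52 (bit pattern 0x4330000000000000) gives
// 2^52 + x per lane, so subtracting 2^52 leaves the exact value as a double.
#include <emmintrin.h> // SSE2 intrinsics (assumed for this sketch only)

static __m128d U32x2ToF64x2Sketch(__m128i V) { // two u32 in the low 64 bits
  const __m128d Bias = _mm_set1_pd(0x1.0p52);  // bits: 0x4330000000000000
  __m128i ZExt = _mm_unpacklo_epi32(V, _mm_setzero_si128()); // zero-extend
  __m128d Or = _mm_or_pd(_mm_castsi128_pd(ZExt), Bias);
  return _mm_sub_pd(Or, Bias);                 // exact per-lane result
}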
21692
21693static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21694 const X86Subtarget &Subtarget) {
21695 SDLoc DL(Op);
21696 bool IsStrict = Op->isStrictFPOpcode();
21697 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21698 MVT VecIntVT = V.getSimpleValueType();
21699 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21700        "Unsupported custom type");
21701
21702 if (Subtarget.hasAVX512()) {
21703 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21704 assert(!Subtarget.hasVLX() && "Unexpected features");
21705 MVT VT = Op->getSimpleValueType(0);
21706
21707 // v8i32->v8f64 is legal with AVX512 so just return it.
21708 if (VT == MVT::v8f64)
21709 return Op;
21710
21711 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21712        "Unexpected VT!");
21713 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21714 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21715 // Need to concat with zero vector for strict fp to avoid spurious
21716 // exceptions.
21717 SDValue Tmp =
21718 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21719 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21720 DAG.getIntPtrConstant(0, DL));
21721 SDValue Res, Chain;
21722 if (IsStrict) {
21723 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21724 {Op->getOperand(0), V});
21725 Chain = Res.getValue(1);
21726 } else {
21727 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21728 }
21729
21730 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21731 DAG.getIntPtrConstant(0, DL));
21732
21733 if (IsStrict)
21734 return DAG.getMergeValues({Res, Chain}, DL);
21735 return Res;
21736 }
21737
21738 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21739 Op->getSimpleValueType(0) == MVT::v4f64) {
21740 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21741 Constant *Bias = ConstantFP::get(
21742 *DAG.getContext(),
21743 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21744 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21745 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21746 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21747 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21748 SDValue VBias = DAG.getMemIntrinsicNode(
21749 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21750 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21751 MachineMemOperand::MOLoad);
21752
21753 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21754 DAG.getBitcast(MVT::v4i64, VBias));
21755 Or = DAG.getBitcast(MVT::v4f64, Or);
21756
21757 if (IsStrict)
21758 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21759 {Op.getOperand(0), Or, VBias});
21760 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21761 }
21762
21763 // The algorithm is the following:
21764 // #ifdef __SSE4_1__
21765 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21766 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21767 // (uint4) 0x53000000, 0xaa);
21768 // #else
21769 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21770 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21771 // #endif
21772 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21773 // return (float4) lo + fhi;
21774
21775 bool Is128 = VecIntVT == MVT::v4i32;
21776 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21777 // If we convert to something other than the supported type, e.g., to v4f64,
21778 // abort early.
21779 if (VecFloatVT != Op->getSimpleValueType(0))
21780 return SDValue();
21781
21782 // In the #ifdef/#else code, we have in common:
21783 // - The vector of constants:
21784 // -- 0x4b000000
21785 // -- 0x53000000
21786 // - A shift:
21787 // -- v >> 16
21788
21789 // Create the splat vector for 0x4b000000.
21790 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21791 // Create the splat vector for 0x53000000.
21792 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21793
21794 // Create the right shift.
21795 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21796 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21797
21798 SDValue Low, High;
21799 if (Subtarget.hasSSE41()) {
21800 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21801 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21802 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21803 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21804 // Low will be bitcasted right away, so do not bother bitcasting back to its
21805 // original type.
21806 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21807 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21808 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21809 // (uint4) 0x53000000, 0xaa);
21810 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21811 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21812 // High will be bitcasted right away, so do not bother bitcasting back to
21813 // its original type.
21814 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21815 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21816 } else {
21817 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21818 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21819 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21820 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21821
21822 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21823 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21824 }
21825
21826 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21827 SDValue VecCstFSub = DAG.getConstantFP(
21828 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21829
21830 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21831 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21832 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21833 // enabled. See PR24512.
21834 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21835 // TODO: Are there any fast-math-flags to propagate here?
21836 // (float4) lo;
21837 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21838 // return (float4) lo + fhi;
21839 if (IsStrict) {
21840 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21841 {Op.getOperand(0), HighBitcast, VecCstFSub});
21842 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21843 {FHigh.getValue(1), LowBitcast, FHigh});
21844 }
21845
21846 SDValue FHigh =
21847 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21848 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21849}
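// Illustrative sketch (hypothetical scalar model of one lane, not part of the
// original source) of the SSE2/SSE4.1 algorithm above: 0x4b000000 is the bit
// pattern of 2^23 and 0x53000000 of 2^39, so "lo" is 2^23 + low16 and "hi" is
// 2^39 + high16 * 2^16. Subtracting (2^39 + 2^23), i.e. 0x53000080, from "hi"
// and adding "lo" reproduces the u32 with a single final rounding.
#include <cstdint>
#include <cstring>

static float U32ToFloatSketch(uint32_t V) {
  uint32_t LoBits = (V & 0xffffu) | 0x4b000000u; // 2^23 + low16
  uint32_t HiBits = (V >> 16) | 0x53000000u;     // 2^39 + high16 * 2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);      // exact subtraction
  return Lo + FHi;                               // rounded once, like addps
}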
21850
21851static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21852 const X86Subtarget &Subtarget) {
21853 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21854 SDValue N0 = Op.getOperand(OpNo);
21855 MVT SrcVT = N0.getSimpleValueType();
21856 SDLoc dl(Op);
21857
21858 switch (SrcVT.SimpleTy) {
21859 default:
21860 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21861 case MVT::v2i32:
21862 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21863 case MVT::v4i32:
21864 case MVT::v8i32:
21865 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21866 case MVT::v2i64:
21867 case MVT::v4i64:
21868 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21869 }
21870}
21871
21872SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21873 SelectionDAG &DAG) const {
21874 bool IsStrict = Op->isStrictFPOpcode();
21875 unsigned OpNo = IsStrict ? 1 : 0;
21876 SDValue Src = Op.getOperand(OpNo);
21877 SDLoc dl(Op);
21878 auto PtrVT = getPointerTy(DAG.getDataLayout());
21879 MVT SrcVT = Src.getSimpleValueType();
21880 MVT DstVT = Op->getSimpleValueType(0);
21881 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21882
21883 // Bail out when we don't have native conversion instructions.
21884 if (DstVT == MVT::f128)
21885 return SDValue();
21886
21887 if (isSoftFP16(DstVT))
21888 return promoteXINT_TO_FP(Op, DAG);
21889 else if (isLegalConversion(SrcVT, false, Subtarget))
21890 return Op;
21891
21892 if (DstVT.isVector())
21893 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21894
21895 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21896 return LowerWin64_INT128_TO_FP(Op, DAG);
21897
21898 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21899 return Extract;
21900
21901 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21902 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21903 // Conversions from unsigned i32 to f32/f64 are legal,
21904 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21905 return Op;
21906 }
21907
21908 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21909 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21910 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21911 if (IsStrict)
21912 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21913 {Chain, Src});
21914 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21915 }
21916
21917 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21918 return V;
21919 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21920 return V;
21921
21922 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21923 // infinity. It produces -0.0, so disable under strictfp.
21924 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21925 !IsStrict)
21926 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21927 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21928 // negative infinity, so disable it under strictfp and use FILD instead.
21929 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21930 !IsStrict)
21931 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
21932 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21933 (DstVT == MVT::f32 || DstVT == MVT::f64))
21934 return SDValue();
21935
21936 // Make a 64-bit buffer, and use it to build an FILD.
21937 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21938 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21939 Align SlotAlign(8);
21940 MachinePointerInfo MPI =
21941 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21942 if (SrcVT == MVT::i32) {
21943 SDValue OffsetSlot =
21944 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
21945 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21946 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21947 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21948 std::pair<SDValue, SDValue> Tmp =
21949 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21950 if (IsStrict)
21951 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21952
21953 return Tmp.first;
21954 }
21955
21956 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21957 SDValue ValueToStore = Src;
21958 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21959 // Bitcasting to f64 here allows us to do a single 64-bit store from
21960 // an SSE register, avoiding the store forwarding penalty that would come
21961 // with two 32-bit stores.
21962 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21963 }
21964 SDValue Store =
21965 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21966 // For i64 source, we need to add the appropriate power of 2 if the input
21967 // was negative. We must be careful to do the computation in x87 extended
21968 // precision, not in SSE.
21969 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21970 SDValue Ops[] = { Store, StackSlot };
21971 SDValue Fild =
21972 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21973 SlotAlign, MachineMemOperand::MOLoad);
21974 Chain = Fild.getValue(1);
21975
21976
21977 // Check whether the sign bit is set.
21978 SDValue SignSet = DAG.getSetCC(
21979 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21980 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21981
21982 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21983 APInt FF(64, 0x5F80000000000000ULL);
21984 SDValue FudgePtr = DAG.getConstantPool(
21985 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21986 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21987
21988 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21989 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21990 SDValue Four = DAG.getIntPtrConstant(4, dl);
21991 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21992 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21993
21994 // Load the value out, extending it from f32 to f80.
21995 SDValue Fudge = DAG.getExtLoad(
21996 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21997 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21998 CPAlignment);
21999 Chain = Fudge.getValue(1);
22000 // Extend everything to 80 bits to force it to be done on x87.
22001 // TODO: Are there any fast-math-flags to propagate here?
22002 if (IsStrict) {
22003 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
22004 {Chain, Fild, Fudge});
22005 // STRICT_FP_ROUND can't handle equal types.
22006 if (DstVT == MVT::f80)
22007 return Add;
22008 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22009 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22010 }
22011 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
22012 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22013 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22014}
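// Illustrative sketch (hypothetical scalar model, not part of the original
// source) of the FILD + "fudge" tail above: FILD interprets the 64-bit
// pattern as signed, so when the sign bit was set, the constant-pool value
// 0x5F800000 (2^64 as an f32, extended to f80) is added back before the final
// rounding to the destination type. long double stands in for the x87 format.
#include <cstdint>

static long double U64ToFPSketch(uint64_t X) {
  long double Fild = (long double)(int64_t)X;              // what FILD produces
  long double Fudge = ((int64_t)X < 0) ? 0x1.0p64L : 0.0L;
  return Fild + Fudge;                                     // then FP_ROUND to dst
}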
22015
22016// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22017// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22018// just return an SDValue().
22019// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22020// to i16, i32 or i64, and we lower it to a legal sequence and return the
22021// result.
22022SDValue
22023X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22024 bool IsSigned, SDValue &Chain) const {
22025 bool IsStrict = Op->isStrictFPOpcode();
22026 SDLoc DL(Op);
22027
22028 EVT DstTy = Op.getValueType();
22029 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22030 EVT TheVT = Value.getValueType();
22031 auto PtrVT = getPointerTy(DAG.getDataLayout());
22032
22033 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22034 // f16 must be promoted before using the lowering in this routine.
22035 // fp128 does not use this lowering.
22036 return SDValue();
22037 }
22038
22039 // If using FIST to compute an unsigned i64, we'll need some fixup
22040 // to handle values above the maximum signed i64. A FIST is always
22041 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22042 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22043
22044 // FIXME: This does not generate an invalid exception if the input does not
22045 // fit in i32. PR44019
22046 if (!IsSigned && DstTy != MVT::i64) {
22047 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22048 // The low 32 bits of the fist result will have the correct uint32 result.
22049 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22050 DstTy = MVT::i64;
22051 }
22052
22053 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22054        DstTy.getSimpleVT() >= MVT::i16 &&
22055        "Unknown FP_TO_INT to lower!");
22056
22057 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22058 // stack slot.
22059 MachineFunction &MF = DAG.getMachineFunction();
22060 unsigned MemSize = DstTy.getStoreSize();
22061 int SSFI =
22062 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22063 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22064
22065 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22066
22067 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22068
22069 if (UnsignedFixup) {
22070 //
22071 // Conversion to unsigned i64 is implemented with a select,
22072 // depending on whether the source value fits in the range
22073 // of a signed i64. Let Thresh be the FP equivalent of
22074 // 0x8000000000000000ULL.
22075 //
22076 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22077 //   FltOfs = (Value >= Thresh) ? Thresh : 0;
22078 // FistSrc = (Value - FltOfs);
22079 // Fist-to-mem64 FistSrc
22080 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22081 // to XOR'ing the high 32 bits with Adjust.
22082 //
22083 // Being a power of 2, Thresh is exactly representable in all FP formats.
22084 // For X87 we'd like to use the smallest FP type for this constant, but
22085 // for DAG type consistency we have to match the FP operand type.
22086
22087 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22088 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22089 bool LosesInfo = false;
22090 if (TheVT == MVT::f64)
22091 // The rounding mode is irrelevant as the conversion should be exact.
22092 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22093 &LosesInfo);
22094 else if (TheVT == MVT::f80)
22095 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22096 APFloat::rmNearestTiesToEven, &LosesInfo);
22097
22098 assert(Status == APFloat::opOK && !LosesInfo &&
22099        "FP conversion should have been exact");
22100
22101 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22102
22103 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22104 *DAG.getContext(), TheVT);
22105 SDValue Cmp;
22106 if (IsStrict) {
22107 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22108 /*IsSignaling*/ true);
22109 Chain = Cmp.getValue(1);
22110 } else {
22111 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22112 }
22113
22114 // Our preferred lowering of
22115 //
22116 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22117 //
22118 // is
22119 //
22120 // (Value >= Thresh) << 63
22121 //
22122 // but since we can get here after LegalOperations, DAGCombine might do the
22123 // wrong thing if we create a select. So, directly create the preferred
22124 // version.
22125 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22126 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22127 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22128
22129 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22130 DAG.getConstantFP(0.0, DL, TheVT));
22131
22132 if (IsStrict) {
22133 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22134 { Chain, Value, FltOfs });
22135 Chain = Value.getValue(1);
22136 } else
22137 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22138 }
22139
22140 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22141
22142 // FIXME: This causes a redundant load/store if the SSE-class value is already
22143 // in memory, such as if it is on the call stack.
22144 if (isScalarFPTypeInSSEReg(TheVT)) {
22145 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22146 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22147 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22148 SDValue Ops[] = { Chain, StackSlot };
22149
22150 unsigned FLDSize = TheVT.getStoreSize();
22151 assert(FLDSize <= MemSize && "Stack slot not big enough");
22152 MachineMemOperand *MMO = MF.getMachineMemOperand(
22153 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22154 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22155 Chain = Value.getValue(1);
22156 }
22157
22158 // Build the FP_TO_INT*_IN_MEM
22159 MachineMemOperand *MMO = MF.getMachineMemOperand(
22160 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22161 SDValue Ops[] = { Chain, Value, StackSlot };
22162 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22163 DAG.getVTList(MVT::Other),
22164 Ops, DstTy, MMO);
22165
22166 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22167 Chain = Res.getValue(1);
22168
22169 // If we need an unsigned fixup, XOR the result with adjust.
22170 if (UnsignedFixup)
22171 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22172
22173 return Res;
22174}
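// Illustrative sketch (hypothetical scalar model, not part of the original
// source) of the UnsignedFixup path above: values at or above 2^63 are
// shifted into signed range before the FIST-style conversion, and the 2^63
// offset is restored afterwards by XOR-ing the sign bit into the result.
#include <cstdint>

static uint64_t FPToU64Sketch(double Value) {
  const double Thresh = 0x1.0p63;                    // exactly representable
  uint64_t Adjust = (Value >= Thresh) ? (1ULL << 63) : 0;
  double FltOfs = (Value >= Thresh) ? Thresh : 0.0;
  int64_t Fist = (int64_t)(Value - FltOfs);          // stands in for FISTP64
  return (uint64_t)Fist ^ Adjust;                    // re-add 2^63 if needed
}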
22175
22176static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22177 const X86Subtarget &Subtarget) {
22178 MVT VT = Op.getSimpleValueType();
22179 SDValue In = Op.getOperand(0);
22180 MVT InVT = In.getSimpleValueType();
22181 SDLoc dl(Op);
22182 unsigned Opc = Op.getOpcode();
22183
22184 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22185 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22186        "Unexpected extension opcode");
22187 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22188        "Expected same number of elements");
22189 assert((VT.getVectorElementType() == MVT::i16 ||
22190         VT.getVectorElementType() == MVT::i32 ||
22191         VT.getVectorElementType() == MVT::i64) &&
22192        "Unexpected element type");
22193 assert((InVT.getVectorElementType() == MVT::i8 ||
22194         InVT.getVectorElementType() == MVT::i16 ||
22195         InVT.getVectorElementType() == MVT::i32) &&
22196        "Unexpected element type");
22197
22198 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22199
22200 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22201 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22202 return splitVectorIntUnary(Op, DAG);
22203 }
22204
22205 if (Subtarget.hasInt256())
22206 return Op;
22207
22208 // Optimize vectors in AVX mode:
22209 //
22210 // v8i16 -> v8i32
22211 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22212 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22213 // Concat upper and lower parts.
22214 //
22215 // v4i32 -> v4i64
22216 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22217 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22218 // Concat upper and lower parts.
22219 //
22220 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22221 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22222
22223 // Short-circuit if we can determine that each 128-bit half is the same value.
22224 // Otherwise, this is difficult to match and optimize.
22225 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22226 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22227 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22228
22229 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22230 SDValue Undef = DAG.getUNDEF(InVT);
22231 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22232 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22233 OpHi = DAG.getBitcast(HalfVT, OpHi);
22234
22235 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22236}
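// Illustrative sketch (hypothetical, not part of the original source) of the
// AVX1 strategy in the comment above for v8i16 -> v8i32 zero extension:
// pmovzxwd widens the low four elements, punpckhwd-with-zero widens the high
// four, and the two 128-bit halves are concatenated into a 256-bit result.
#include <immintrin.h> // AVX + SSE4.1 intrinsics (assumed for this sketch)

static __m256i ZExtV8I16ToV8I32Sketch(__m128i In) {
  __m128i Lo = _mm_cvtepu16_epi32(In);                      // vpmovzxwd
  __m128i Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128()); // vpunpckhwd, zero
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}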
22237
22238// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22239static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22240 const SDLoc &dl, SelectionDAG &DAG) {
22241 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22242 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22243 DAG.getIntPtrConstant(0, dl));
22244 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22245 DAG.getIntPtrConstant(8, dl));
22246 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22247 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22248 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22249 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22250}
22251
22252static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22253 const X86Subtarget &Subtarget,
22254 SelectionDAG &DAG) {
22255 MVT VT = Op->getSimpleValueType(0);
22256 SDValue In = Op->getOperand(0);
22257 MVT InVT = In.getSimpleValueType();
22258 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22259 SDLoc DL(Op);
22260 unsigned NumElts = VT.getVectorNumElements();
22261
22262 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22263 // avoids a constant pool load.
22264 if (VT.getVectorElementType() != MVT::i8) {
22265 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22266 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22267 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22268 }
22269
22270 // Extend VT if BWI is not supported.
22271 MVT ExtVT = VT;
22272 if (!Subtarget.hasBWI()) {
22273 // If v16i32 is to be avoided, we'll need to split and concatenate.
22274 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22275 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22276
22277 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22278 }
22279
22280 // Widen to 512-bits if VLX is not supported.
22281 MVT WideVT = ExtVT;
22282 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22283 NumElts *= 512 / ExtVT.getSizeInBits();
22284 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22285 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22286 In, DAG.getIntPtrConstant(0, DL));
22287 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22288 NumElts);
22289 }
22290
22291 SDValue One = DAG.getConstant(1, DL, WideVT);
22292 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22293
22294 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22295
22296 // Truncate if we had to extend above.
22297 if (VT != ExtVT) {
22298 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22299 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22300 }
22301
22302 // Extract back to 128/256-bit if we widened.
22303 if (WideVT != VT)
22304 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22305 DAG.getIntPtrConstant(0, DL));
22306
22307 return SelectedVal;
22308}
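// Illustrative sketch (hypothetical scalar model of one lane, not part of the
// original source) of the sign_extend + srl trick in the comment above: a
// 1-bit mask element becomes 0 or 1 without loading a constant vector of
// ones from the constant pool.
#include <cstdint>

static uint32_t ZExtMaskBitSketch(bool M) {
  int32_t SExt = M ? -1 : 0;        // sign_extend i1 -> i32 (all ones or zero)
  return (uint32_t)SExt >> 31;      // srl by (bit width - 1) leaves 0 or 1
}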
22309
22310static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22311 SelectionDAG &DAG) {
22312 SDValue In = Op.getOperand(0);
22313 MVT SVT = In.getSimpleValueType();
22314
22315 if (SVT.getVectorElementType() == MVT::i1)
22316 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22317
22318 assert(Subtarget.hasAVX() && "Expected AVX support");
22319 return LowerAVXExtend(Op, DAG, Subtarget);
22320}
22321
22322/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22323/// It makes use of the fact that vectors with enough leading sign/zero bits
22324/// prevent the PACKSS/PACKUS from saturating the results.
22325/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22326/// within each 128-bit lane.
22327static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22328 const SDLoc &DL, SelectionDAG &DAG,
22329 const X86Subtarget &Subtarget) {
22330 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22331 "Unexpected PACK opcode");
22332 assert(DstVT.isVector() && "VT not a vector?");
22333
22334 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22335 if (!Subtarget.hasSSE2())
22336 return SDValue();
22337
22338 EVT SrcVT = In.getValueType();
22339
22340 // No truncation required, we might get here due to recursive calls.
22341 if (SrcVT == DstVT)
22342 return In;
22343
22344 // We only support vector truncation to 64bits or greater from a
22345 // 128bits or greater source.
22346 unsigned DstSizeInBits = DstVT.getSizeInBits();
22347 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22348 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22349 return SDValue();
22350
22351 unsigned NumElems = SrcVT.getVectorNumElements();
22352 if (!isPowerOf2_32(NumElems))
22353 return SDValue();
22354
22355 LLVMContext &Ctx = *DAG.getContext();
22356 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22357 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22358
22359 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22360
22361 // Pack to the largest type possible:
22362 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22363 EVT InVT = MVT::i16, OutVT = MVT::i8;
22364 if (SrcVT.getScalarSizeInBits() > 16 &&
22365 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22366 InVT = MVT::i32;
22367 OutVT = MVT::i16;
22368 }
22369
22370 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22371 if (SrcVT.is128BitVector()) {
22372 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22373 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22374 In = DAG.getBitcast(InVT, In);
22375 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22376 Res = extractSubVector(Res, 0, DAG, DL, 64);
22377 return DAG.getBitcast(DstVT, Res);
22378 }
22379
22380 // Split lower/upper subvectors.
22381 SDValue Lo, Hi;
22382 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22383
22384 unsigned SubSizeInBits = SrcSizeInBits / 2;
22385 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22386 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22387
22388 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22389 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22390 Lo = DAG.getBitcast(InVT, Lo);
22391 Hi = DAG.getBitcast(InVT, Hi);
22392 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22393 return DAG.getBitcast(DstVT, Res);
22394 }
22395
22396 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22397 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22398 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22399 Lo = DAG.getBitcast(InVT, Lo);
22400 Hi = DAG.getBitcast(InVT, Hi);
22401 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22402
22403 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22404 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22405 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22406 SmallVector<int, 64> Mask;
22407 int Scale = 64 / OutVT.getScalarSizeInBits();
22408 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22409 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22410
22411 if (DstVT.is256BitVector())
22412 return DAG.getBitcast(DstVT, Res);
22413
22414 // If 512bit -> 128bit truncate another stage.
22415 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22416 Res = DAG.getBitcast(PackedVT, Res);
22417 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22418 }
22419
22420 // Recursively pack lower/upper subvectors, concat result and pack again.
22421 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22422 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22423 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22424 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22425
22426 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22427 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22428 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22429}
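// Worked shape of the recursion (informal): truncating v16i32 -> v16i8 with
// PACKSS first packs the two v8i32 halves into v16i16 (per-lane PACKSSDW plus
// the {0,2,1,3} lane fix-up shuffle), then recurses to pack v16i16 -> v16i8
// with PACKSSWB. Callers must already guarantee enough leading sign/zero bits
// so that neither pack saturates.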
22430
22431static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22432 const X86Subtarget &Subtarget) {
22433
22434 SDLoc DL(Op);
22435 MVT VT = Op.getSimpleValueType();
22436 SDValue In = Op.getOperand(0);
22437 MVT InVT = In.getSimpleValueType();
22438
22439 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22440
22441 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22442 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22443 if (InVT.getScalarSizeInBits() <= 16) {
22444 if (Subtarget.hasBWI()) {
22445 // legal, will go to VPMOVB2M, VPMOVW2M
22446 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22447 // We need to shift to get the lsb into sign position.
22448 // Shifting packed bytes is not supported natively, so bitcast to words.
22449 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22450 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22451 DAG.getBitcast(ExtVT, In),
22452 DAG.getConstant(ShiftInx, DL, ExtVT));
22453 In = DAG.getBitcast(InVT, In);
22454 }
22455 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22456 In, ISD::SETGT);
22457 }
22458 // Use TESTD/Q, extended vector to packed dword/qword.
22459 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22460 "Unexpected vector type.");
22461 unsigned NumElts = InVT.getVectorNumElements();
22462 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22463 // We need to change to a wider element type that we have support for.
22464 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22465 // For 16 element vectors we extend to v16i32 unless we are explicitly
22466 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22467 // we need to split into two 8 element vectors which we can extend to v8i32,
22468 // truncate and concat the results. There's an additional complication if
22469 // the original type is v16i8. In that case we can't split the v16i8
22470 // directly, so we need to shuffle high elements to low and use
22471 // sign_extend_vector_inreg.
22472 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22473 SDValue Lo, Hi;
22474 if (InVT == MVT::v16i8) {
22475 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22476 Hi = DAG.getVectorShuffle(
22477 InVT, DL, In, In,
22478 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22479 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22480 } else {
22481 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22482 Lo = extract128BitVector(In, 0, DAG, DL);
22483 Hi = extract128BitVector(In, 8, DAG, DL);
22484 }
22485 // We're split now, just emit two truncates and a concat. The two
22486 // truncates will trigger legalization to come back to this function.
22487 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22488 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22489 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22490 }
22491 // We either have 8 elements or we're allowed to use 512-bit vectors.
22492 // If we have VLX, we want to use the narrowest vector that can get the
22493 // job done so we use vXi32.
22494 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22495 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22496 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22497 InVT = ExtVT;
22498 ShiftInx = InVT.getScalarSizeInBits() - 1;
22499 }
22500
22501 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22502 // We need to shift to get the lsb into sign position.
22503 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22504 DAG.getConstant(ShiftInx, DL, InVT));
22505 }
22506 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22507 if (Subtarget.hasDQI())
22508 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22509 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22510}
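// Informal example: truncating v8i32 -> v8i1 with AVX512DQ shifts the low bit
// into the sign position (shl by 31) unless ComputeNumSignBits already proves a
// sign-splat, then emits setcc(0, In, setgt), which isel matches to vpmovd2m.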
22511
22512SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22513 SDLoc DL(Op);
22514 MVT VT = Op.getSimpleValueType();
22515 SDValue In = Op.getOperand(0);
22516 MVT InVT = In.getSimpleValueType();
22517 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22518
22519 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22520 "Invalid TRUNCATE operation");
22521
22522 // If we're called by the type legalizer, handle a few cases.
22523 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22524 if (!TLI.isTypeLegal(InVT)) {
22525 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22526 VT.is128BitVector()) {
22527 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22528 "Unexpected subtarget!");
22529 // The default behavior is to truncate one step, concatenate, and then
22530 // truncate the remainder. We'd rather produce two 64-bit results and
22531 // concatenate those.
22532 SDValue Lo, Hi;
22533 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22534
22535 EVT LoVT, HiVT;
22536 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22537
22538 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22539 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22540 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22541 }
22542
22543 // Otherwise let default legalization handle it.
22544 return SDValue();
22545 }
22546
22547 if (VT.getVectorElementType() == MVT::i1)
22548 return LowerTruncateVecI1(Op, DAG, Subtarget);
22549
22550 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22551 if (Subtarget.hasAVX512()) {
22552 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22553 assert(VT == MVT::v32i8 && "Unexpected VT!");
22554 return splitVectorIntUnary(Op, DAG);
22555 }
22556
22557 // word to byte only under BWI. Otherwise we have to promote to v16i32
22558 // and then truncate that. But we should only do that if we haven't been
22559 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22560 // handled by isel patterns.
22561 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22562 Subtarget.canExtendTo512DQ())
22563 return Op;
22564 }
22565
22566 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22567 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22568
22569 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22570 // that extend all the way to the packed/truncated value.
22571 // Pre-SSE41 we can only use PACKUSWB.
22572 KnownBits Known = DAG.computeKnownBits(In);
22573 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22574 if (SDValue V =
22575 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22576 return V;
22577
22578 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22579 // extend all the way to the packed/truncated value.
22580 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22581 if (SDValue V =
22582 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22583 return V;
22584
22585 // Handle truncation of V256 to V128 using shuffles.
22586 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22587
22588 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22589 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22590 if (Subtarget.hasInt256()) {
22591 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22592 In = DAG.getBitcast(MVT::v8i32, In);
22593 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22595 DAG.getIntPtrConstant(0, DL));
22596 }
22597
22598 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22599 DAG.getIntPtrConstant(0, DL));
22600 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22601 DAG.getIntPtrConstant(2, DL));
22602 static const int ShufMask[] = {0, 2, 4, 6};
22603 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22604 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22605 }
22606
22607 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22608 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22609 if (Subtarget.hasInt256()) {
22610 // The PSHUFB mask:
22611 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22612 -1, -1, -1, -1, -1, -1, -1, -1,
22613 16, 17, 20, 21, 24, 25, 28, 29,
22614 -1, -1, -1, -1, -1, -1, -1, -1 };
22615 In = DAG.getBitcast(MVT::v32i8, In);
22616 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22617 In = DAG.getBitcast(MVT::v4i64, In);
22618
22619 static const int ShufMask2[] = {0, 2, -1, -1};
22620 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22621 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22622 DAG.getIntPtrConstant(0, DL));
22623 return DAG.getBitcast(MVT::v8i16, In);
22624 }
22625
22626 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22627 DAG.getIntPtrConstant(0, DL));
22628 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22629 DAG.getIntPtrConstant(4, DL));
22630
22631 // The PSHUFB mask:
22632 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22633
22634 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22635 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22636
22637 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22638 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22639
22640 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22641 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22642
22643 // The MOVLHPS Mask:
22644 static const int ShufMask2[] = {0, 1, 4, 5};
22645 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22646 return DAG.getBitcast(MVT::v8i16, res);
22647 }
22648
22649 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22650 // Use an AND to zero the upper bits for PACKUS.
22651 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22652
22653 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22654 DAG.getIntPtrConstant(0, DL));
22655 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22656 DAG.getIntPtrConstant(8, DL));
22657 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22658 }
22659
22660 llvm_unreachable("All 256->128 cases should have been handled above!");
22661}
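// Shuffle fallback example (informal): when neither PACKUS nor PACKSS applies,
// v4i64 -> v4i32 on AVX2 is done by bitcasting to v8i32, shuffling with mask
// {0,2,4,6} (VPERMD) and extracting the low 128 bits; pre-AVX2 the input is
// split into two v2i64 halves whose even i32 elements are merged by one shuffle.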
22662
22663// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22664// behaves on out of range inputs to generate optimized conversions.
22665static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22666 SelectionDAG &DAG,
22667 const X86Subtarget &Subtarget) {
22668 MVT SrcVT = Src.getSimpleValueType();
22669 unsigned DstBits = VT.getScalarSizeInBits();
22670 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22671
22672 // Calculate the converted result for values in the range 0 to
22673 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22674 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22675 SDValue Big =
22676 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22677 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22678 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22679
22680 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22681 // and only if the value was out of range. So we can use that
22682 // as our indicator to use "Big" instead of "Small".
22683 //
22684 // Use "Small" if "IsOverflown" has all bits cleared
22685 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22686
22687 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22688 // use the slightly slower blendv select instead.
22689 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22690 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22691 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22692 }
22693
22694 SDValue IsOverflown =
22695 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22696 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22697 return DAG.getNode(ISD::OR, dl, VT, Small,
22698 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22699}
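// Worked example (informal) of the signsplat trick above, for float 3.0e9:
//   Small = cvttps2si(3.0e9)         = 0x80000000   (out of range -> INDVAL)
//   Big   = cvttps2si(3.0e9 - 2^31)  = 0x32D05E00   (852516352)
//   IsOverflown = Small >>arith 31   = 0xFFFFFFFF
//   Small | (Big & IsOverflown)      = 0xB2D05E00   = 3000000000
// In-range inputs leave the sign bit of Small clear, so IsOverflown is 0 and
// Small is returned unchanged.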
22700
22701SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22702 bool IsStrict = Op->isStrictFPOpcode();
22703 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22704 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22705 MVT VT = Op->getSimpleValueType(0);
22706 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22707 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22708 MVT SrcVT = Src.getSimpleValueType();
22709 SDLoc dl(Op);
22710
22711 SDValue Res;
22712 if (isSoftFP16(SrcVT)) {
22713 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22714 if (IsStrict)
22715 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22716 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22717 {NVT, MVT::Other}, {Chain, Src})});
22718 return DAG.getNode(Op.getOpcode(), dl, VT,
22719 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22720 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22721 return Op;
22722 }
22723
22724 if (VT.isVector()) {
22725 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22726 MVT ResVT = MVT::v4i32;
22727 MVT TruncVT = MVT::v4i1;
22728 unsigned Opc;
22729 if (IsStrict)
22730 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22731 else
22732 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22733
22734 if (!IsSigned && !Subtarget.hasVLX()) {
22735 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22736 // Widen to 512-bits.
22737 ResVT = MVT::v8i32;
22738 TruncVT = MVT::v8i1;
22739 Opc = Op.getOpcode();
22740 // Need to concat with zero vector for strict fp to avoid spurious
22741 // exceptions.
22742 // TODO: Should we just do this for non-strict as well?
22743 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22744 : DAG.getUNDEF(MVT::v8f64);
22745 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22746 DAG.getIntPtrConstant(0, dl));
22747 }
22748 if (IsStrict) {
22749 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22750 Chain = Res.getValue(1);
22751 } else {
22752 Res = DAG.getNode(Opc, dl, ResVT, Src);
22753 }
22754
22755 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22756 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22757 DAG.getIntPtrConstant(0, dl));
22758 if (IsStrict)
22759 return DAG.getMergeValues({Res, Chain}, dl);
22760 return Res;
22761 }
22762
22763 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
22764 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
22765 return Op;
22766
22767 MVT ResVT = VT;
22768 MVT EleVT = VT.getVectorElementType();
22769 if (EleVT != MVT::i64)
22770 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
22771
22772 if (SrcVT != MVT::v8f16) {
22773 SDValue Tmp =
22774 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
22775 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
22776 Ops[0] = Src;
22777 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
22778 }
22779
22780 if (IsStrict) {
22781 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22782 : X86ISD::STRICT_CVTTP2UI,
22783 dl, {ResVT, MVT::Other}, {Chain, Src});
22784 Chain = Res.getValue(1);
22785 } else {
22786 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22787 ResVT, Src);
22788 }
22789
22790 // TODO: Need to add exception check code for strict FP.
22791 if (EleVT.getSizeInBits() < 16) {
22792 ResVT = MVT::getVectorVT(EleVT, 8);
22793 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22794 }
22795
22796 if (ResVT != VT)
22797 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22798 DAG.getIntPtrConstant(0, dl));
22799
22800 if (IsStrict)
22801 return DAG.getMergeValues({Res, Chain}, dl);
22802 return Res;
22803 }
22804
22805 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
22806 if (IsStrict) {
22807 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22808 : ISD::STRICT_FP_TO_UINT,
22809 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
22810 Chain = Res.getValue(1);
22811 } else {
22812 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22813 MVT::v8i32, Src);
22814 }
22815
22816 // TODO: Need to add exception check code for strict FP.
22817 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
22818
22819 if (IsStrict)
22820 return DAG.getMergeValues({Res, Chain}, dl);
22821 return Res;
22822 }
22823
22824 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22825 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22826 assert(!IsSigned && "Expected unsigned conversion!");
22827 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22828 return Op;
22829 }
22830
22831 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22832 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22833 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22834 Subtarget.useAVX512Regs()) {
22835 assert(!IsSigned && "Expected unsigned conversion!");
22836 assert(!Subtarget.hasVLX() && "Unexpected features!");
22837 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22838 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22839 // Need to concat with zero vector for strict fp to avoid spurious
22840 // exceptions.
22841 // TODO: Should we just do this for non-strict as well?
22842 SDValue Tmp =
22843 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22844 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22845 DAG.getIntPtrConstant(0, dl));
22846
22847 if (IsStrict) {
22848 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22849 {Chain, Src});
22850 Chain = Res.getValue(1);
22851 } else {
22852 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22853 }
22854
22855 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22856 DAG.getIntPtrConstant(0, dl));
22857
22858 if (IsStrict)
22859 return DAG.getMergeValues({Res, Chain}, dl);
22860 return Res;
22861 }
22862
22863 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22864 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22865 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22866 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22867 assert(!Subtarget.hasVLX() && "Unexpected features!");
22868 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22869 // Need to concat with zero vector for strict fp to avoid spurious
22870 // exceptions.
22871 // TODO: Should we just do this for non-strict as well?
22872 SDValue Tmp =
22873 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22874 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22875 DAG.getIntPtrConstant(0, dl));
22876
22877 if (IsStrict) {
22878 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22879 {Chain, Src});
22880 Chain = Res.getValue(1);
22881 } else {
22882 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22883 }
22884
22885 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22886 DAG.getIntPtrConstant(0, dl));
22887
22888 if (IsStrict)
22889 return DAG.getMergeValues({Res, Chain}, dl);
22890 return Res;
22891 }
22892
22893 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22894 if (!Subtarget.hasVLX()) {
22895 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
22896 // legalizer and then widened again by vector op legalization.
22897 if (!IsStrict)
22898 return SDValue();
22899
22900 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22901 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22902 {Src, Zero, Zero, Zero});
22903 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22904 {Chain, Tmp});
22905 SDValue Chain = Tmp.getValue(1);
22906 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22907 DAG.getIntPtrConstant(0, dl));
22908 return DAG.getMergeValues({Tmp, Chain}, dl);
22909 }
22910
22911 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22912 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22913 DAG.getUNDEF(MVT::v2f32));
22914 if (IsStrict) {
22915 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
22916 : X86ISD::STRICT_CVTTP2UI;
22917 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
22918 }
22919 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22920 return DAG.getNode(Opc, dl, VT, Tmp);
22921 }
22922
22923 // Generate optimized instructions for pre AVX512 unsigned conversions from
22924 // vXf32 to vXi32.
22925 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
22926 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
22927 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
22928 assert(!IsSigned && "Expected unsigned conversion!");
22929 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
22930 }
22931
22932 return SDValue();
22933 }
22934
22935 assert(!VT.isVector());
22936
22937 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
22938
22939 if (!IsSigned && UseSSEReg) {
22940 // Conversions from f32/f64 with AVX512 should be legal.
22941 if (Subtarget.hasAVX512())
22942 return Op;
22943
22944 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
22945 // behaves on out of range inputs to generate optimized conversions.
22946 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
22947 (VT == MVT::i64 && Subtarget.is64Bit()))) {
22948 unsigned DstBits = VT.getScalarSizeInBits();
22949 APInt UIntLimit = APInt::getSignMask(DstBits);
22950 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
22951 DAG.getConstant(UIntLimit, dl, VT));
22952 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
22953
22954 // Calculate the converted result for values in the range:
22955 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22956 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
22957 SDValue Small =
22958 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
22959 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
22960 SDValue Big = DAG.getNode(
22961 X86ISD::CVTTS2SI, dl, VT,
22962 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
22963 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
22964
22965 // The "CVTTS2SI" instruction conveniently sets the sign bit if
22966 // and only if the value was out of range. So we can use that
22967 // as our indicator to use "Big" instead of "Small".
22968 //
22969 // Use "Small" if "IsOverflown" has all bits cleared
22970 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22971 SDValue IsOverflown = DAG.getNode(
22972 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
22973 return DAG.getNode(ISD::OR, dl, VT, Small,
22974 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22975 }
22976
22977 // Use default expansion for i64.
22978 if (VT == MVT::i64)
22979 return SDValue();
22980
22981 assert(VT == MVT::i32 && "Unexpected VT!");
22982
22983 // Promote i32 to i64 and use a signed operation on 64-bit targets.
22984 // FIXME: This does not generate an invalid exception if the input does not
22985 // fit in i32. PR44019
22986 if (Subtarget.is64Bit()) {
22987 if (IsStrict) {
22988 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
22989 {Chain, Src});
22990 Chain = Res.getValue(1);
22991 } else
22992 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
22993
22994 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22995 if (IsStrict)
22996 return DAG.getMergeValues({Res, Chain}, dl);
22997 return Res;
22998 }
22999
23000 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23001 // use fisttp which will be handled later.
23002 if (!Subtarget.hasSSE3())
23003 return SDValue();
23004 }
23005
23006 // Promote i16 to i32 if we can use an SSE operation or the type is f128.
23007 // FIXME: This does not generate an invalid exception if the input does not
23008 // fit in i16. PR44019
23009 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23010 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23011 if (IsStrict) {
23012 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23013 {Chain, Src});
23014 Chain = Res.getValue(1);
23015 } else
23016 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23017
23018 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23019 if (IsStrict)
23020 return DAG.getMergeValues({Res, Chain}, dl);
23021 return Res;
23022 }
23023
23024 // If this is a FP_TO_SINT using SSEReg we're done.
23025 if (UseSSEReg && IsSigned)
23026 return Op;
23027
23028 // fp128 needs to use a libcall.
23029 if (SrcVT == MVT::f128) {
23030 RTLIB::Libcall LC;
23031 if (IsSigned)
23032 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23033 else
23034 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23035
23036 MakeLibCallOptions CallOptions;
23037 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23038 SDLoc(Op), Chain);
23039
23040 if (IsStrict)
23041 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23042
23043 return Tmp.first;
23044 }
23045
23046 // Fall back to X87.
23047 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23048 if (IsStrict)
23049 return DAG.getMergeValues({V, Chain}, dl);
23050 return V;
23051 }
23052
23053 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
23054}
23055
23056SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23057 SelectionDAG &DAG) const {
23058 SDValue Src = Op.getOperand(0);
23059 MVT SrcVT = Src.getSimpleValueType();
23060
23061 if (SrcVT == MVT::f16)
23062 return SDValue();
23063
23064 // If the source is in an SSE register, the node is Legal.
23065 if (isScalarFPTypeInSSEReg(SrcVT))
23066 return Op;
23067
23068 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23069}
23070
23071SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23072 SelectionDAG &DAG) const {
23073 EVT DstVT = N->getValueType(0);
23074 SDValue Src = N->getOperand(0);
23075 EVT SrcVT = Src.getValueType();
23076
23077 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23078 // f16 must be promoted before using the lowering in this routine.
23079 // fp128 does not use this lowering.
23080 return SDValue();
23081 }
23082
23083 SDLoc DL(N);
23084 SDValue Chain = DAG.getEntryNode();
23085
23086 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23087
23088 // If we're converting from SSE, the stack slot needs to hold both types.
23089 // Otherwise it only needs to hold the DstVT.
23090 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23091 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23092 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23093 MachinePointerInfo MPI =
23094 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23095
23096 if (UseSSE) {
23097 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23098 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23099 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23100 SDValue Ops[] = { Chain, StackPtr };
23101
23102 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23103 /*Align*/ std::nullopt,
23104 MachineMemOperand::MOLoad);
23105 Chain = Src.getValue(1);
23106 }
23107
23108 SDValue StoreOps[] = { Chain, Src, StackPtr };
23109 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23110 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23111 MachineMemOperand::MOStore);
23112
23113 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23114}
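// Rough shape of the SSE case above (informal), e.g. f64 -> i64 llrint: the
// value is spilled to the stack slot, reloaded onto the x87 stack with FLD,
// converted and stored with FIST using the current rounding mode, and the i64
// result is loaded back from the same slot.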
23115
23116SDValue
23117X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23118 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23119 // but making use of X86 specifics to produce better instruction sequences.
23120 SDNode *Node = Op.getNode();
23121 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23122 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23123 SDLoc dl(SDValue(Node, 0));
23124 SDValue Src = Node->getOperand(0);
23125
23126 // There are three types involved here: SrcVT is the source floating point
23127 // type, DstVT is the type of the result, and TmpVT is the result of the
23128 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23129 // DstVT).
23130 EVT SrcVT = Src.getValueType();
23131 EVT DstVT = Node->getValueType(0);
23132 EVT TmpVT = DstVT;
23133
23134 // This code is only for floats and doubles. Fall back to generic code for
23135 // anything else.
23136 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23137 return SDValue();
23138
23139 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23140 unsigned SatWidth = SatVT.getScalarSizeInBits();
23141 unsigned DstWidth = DstVT.getScalarSizeInBits();
23142 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23143 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23144 "Expected saturation width smaller than result width");
23145
23146 // Promote result of FP_TO_*INT to at least 32 bits.
23147 if (TmpWidth < 32) {
23148 TmpVT = MVT::i32;
23149 TmpWidth = 32;
23150 }
23151
23152 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23153 // us to use a native signed conversion instead.
23154 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23155 TmpVT = MVT::i64;
23156 TmpWidth = 64;
23157 }
23158
23159 // If the saturation width is smaller than the size of the temporary result,
23160 // we can always use signed conversion, which is native.
23161 if (SatWidth < TmpWidth)
23162 FpToIntOpcode = ISD::FP_TO_SINT;
23163
23164 // Determine minimum and maximum integer values and their corresponding
23165 // floating-point values.
23166 APInt MinInt, MaxInt;
23167 if (IsSigned) {
23168 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23169 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23170 } else {
23171 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23172 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23173 }
23174
23175 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23176 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23177
23178 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23179 MinInt, IsSigned, APFloat::rmTowardZero);
23180 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23181 MaxInt, IsSigned, APFloat::rmTowardZero);
23182 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23183 && !(MaxStatus & APFloat::opStatus::opInexact);
23184
23185 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23186 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23187
23188 // If the integer bounds are exactly representable as floats, emit a
23189 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23190 if (AreExactFloatBounds) {
23191 if (DstVT != TmpVT) {
23192 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23193 SDValue MinClamped = DAG.getNode(
23194 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23195 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23196 SDValue BothClamped = DAG.getNode(
23197 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23198 // Convert clamped value to integer.
23199 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23200
23201 // NaN will become INDVAL, with the top bit set and the rest zero.
23202 // Truncation will discard the top bit, resulting in zero.
23203 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23204 }
23205
23206 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23207 SDValue MinClamped = DAG.getNode(
23208 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23209 // Clamp by MaxFloat from above. NaN cannot occur.
23210 SDValue BothClamped = DAG.getNode(
23211 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23212 // Convert clamped value to integer.
23213 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23214
23215 if (!IsSigned) {
23216 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23217 // which is zero.
23218 return FpToInt;
23219 }
23220
23221 // Otherwise, select zero if Src is NaN.
23222 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23223 return DAG.getSelectCC(
23224 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23225 }
23226
23227 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23228 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23229
23230 // Result of direct conversion, which may be selected away.
23231 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23232
23233 if (DstVT != TmpVT) {
23234 // NaN will become INDVAL, with the top bit set and the rest zero.
23235 // Truncation will discard the top bit, resulting in zero.
23236 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23237 }
23238
23239 SDValue Select = FpToInt;
23240 // For signed conversions where we saturate to the same size as the
23241 // result type of the fptoi instructions, INDVAL coincides with integer
23242 // minimum, so we don't need to explicitly check it.
23243 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23244 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23245 // MinInt if Src is NaN.
23246 Select = DAG.getSelectCC(
23247 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23248 }
23249
23250 // If Src OGT MaxFloat, select MaxInt.
23251 Select = DAG.getSelectCC(
23252 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23253
23254 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23255 // is already zero. The promoted case was already handled above.
23256 if (!IsSigned || DstVT != TmpVT) {
23257 return Select;
23258 }
23259
23260 // Otherwise, select 0 if Src is NaN.
23261 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23262 return DAG.getSelectCC(
23263 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23264}
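// Saturation example (informal): llvm.fptosi.sat.i8.f32(300.0) promotes the
// conversion to i32, clamps the source with FMAX(-128.0, x) then FMIN(127.0, x)
// so cvttss2si cannot overflow, converts, and truncates i32 -> i8, giving 127;
// a NaN source survives the clamps, converts to INDVAL (0x80000000) and
// truncates to 0.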
23265
23266SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23267 bool IsStrict = Op->isStrictFPOpcode();
23268
23269 SDLoc DL(Op);
23270 MVT VT = Op.getSimpleValueType();
23271 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23272 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23273 MVT SVT = In.getSimpleValueType();
23274
23275 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23276 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23277 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23278 !Subtarget.getTargetTriple().isOSDarwin()))
23279 return SDValue();
23280
23281 if (SVT == MVT::f16) {
23282 if (Subtarget.hasFP16())
23283 return Op;
23284
23285 if (VT != MVT::f32) {
23286 if (IsStrict)
23287 return DAG.getNode(
23288 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23289 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23290 {MVT::f32, MVT::Other}, {Chain, In})});
23291
23292 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23293 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23294 }
23295
23296 if (!Subtarget.hasF16C()) {
23297 if (!Subtarget.getTargetTriple().isOSDarwin())
23298 return SDValue();
23299
23300 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23301
23302 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23303 TargetLowering::CallLoweringInfo CLI(DAG);
23304 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23305
23306 In = DAG.getBitcast(MVT::i16, In);
23307 TargetLowering::ArgListTy Args;
23308 TargetLowering::ArgListEntry Entry;
23309 Entry.Node = In;
23310 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23311 Entry.IsSExt = false;
23312 Entry.IsZExt = true;
23313 Args.push_back(Entry);
23314
23315 SDValue Callee = DAG.getExternalSymbol(
23316 getLibcallName(RTLIB::FPEXT_F16_F32),
23317 getPointerTy(DAG.getDataLayout()));
23318 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23319 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23320 std::move(Args));
23321
23322 SDValue Res;
23323 std::tie(Res,Chain) = LowerCallTo(CLI);
23324 if (IsStrict)
23325 Res = DAG.getMergeValues({Res, Chain}, DL);
23326
23327 return Res;
23328 }
23329
23330 In = DAG.getBitcast(MVT::i16, In);
23331 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23332 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23333 DAG.getIntPtrConstant(0, DL));
23334 SDValue Res;
23335 if (IsStrict) {
23336 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23337 {Chain, In});
23338 Chain = Res.getValue(1);
23339 } else {
23340 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23341 DAG.getTargetConstant(4, DL, MVT::i32));
23342 }
23343 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23344 DAG.getIntPtrConstant(0, DL));
23345 if (IsStrict)
23346 return DAG.getMergeValues({Res, Chain}, DL);
23347 return Res;
23348 }
23349
23350 if (!SVT.isVector())
23351 return Op;
23352
23353 if (SVT.getVectorElementType() == MVT::f16) {
23354 assert(Subtarget.hasF16C() && "Unexpected features!");
23355 if (SVT == MVT::v2f16)
23356 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23357 DAG.getUNDEF(MVT::v2f16));
23358 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23359 DAG.getUNDEF(MVT::v4f16));
23360 if (IsStrict)
23361 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23362 {Op->getOperand(0), Res});
23363 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23364 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23365 return Op;
23366 }
23367
23368 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23369
23370 SDValue Res =
23371 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23372 if (IsStrict)
23373 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23374 {Op->getOperand(0), Res});
23375 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23376}
23377
23378SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23379 bool IsStrict = Op->isStrictFPOpcode();
23380
23381 SDLoc DL(Op);
23382 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23383 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23384 MVT VT = Op.getSimpleValueType();
23385 MVT SVT = In.getSimpleValueType();
23386
23387 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23388 return SDValue();
23389
23390 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23391 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23392 if (!Subtarget.getTargetTriple().isOSDarwin())
23393 return SDValue();
23394
23395 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23396 TargetLowering::CallLoweringInfo CLI(DAG);
23397 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23398
23399 TargetLowering::ArgListTy Args;
23400 TargetLowering::ArgListEntry Entry;
23401 Entry.Node = In;
23402 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23403 Entry.IsSExt = false;
23404 Entry.IsZExt = true;
23405 Args.push_back(Entry);
23406
23407 SDValue Callee = DAG.getExternalSymbol(
23408 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23409 : RTLIB::FPROUND_F32_F16),
23410 getPointerTy(DAG.getDataLayout()));
23411 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23412 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23413 std::move(Args));
23414
23415 SDValue Res;
23416 std::tie(Res, Chain) = LowerCallTo(CLI);
23417
23418 Res = DAG.getBitcast(MVT::f16, Res);
23419
23420 if (IsStrict)
23421 Res = DAG.getMergeValues({Res, Chain}, DL);
23422
23423 return Res;
23424 }
23425
23426 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23427 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23428 return SDValue();
23429
23430 if (VT.isVector())
23431 return Op;
23432
23433 SDValue Res;
23434 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23435 MVT::i32);
23436 if (IsStrict) {
23437 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23438 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23439 DAG.getIntPtrConstant(0, DL));
23440 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23441 {Chain, Res, Rnd});
23442 Chain = Res.getValue(1);
23443 } else {
23444 // FIXME: Should we use zeros for upper elements for non-strict?
23445 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23446 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23447 }
23448
23449 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23450 DAG.getIntPtrConstant(0, DL));
23451 Res = DAG.getBitcast(MVT::f16, Res);
23452
23453 if (IsStrict)
23454 return DAG.getMergeValues({Res, Chain}, DL);
23455
23456 return Res;
23457 }
23458
23459 return Op;
23460}
23461
23462static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23463 bool IsStrict = Op->isStrictFPOpcode();
23464 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23465 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23466 "Unexpected VT!");
23467
23468 SDLoc dl(Op);
23469 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23470 DAG.getConstant(0, dl, MVT::v8i16), Src,
23471 DAG.getIntPtrConstant(0, dl));
23472
23473 SDValue Chain;
23474 if (IsStrict) {
23475 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23476 {Op.getOperand(0), Res});
23477 Chain = Res.getValue(1);
23478 } else {
23479 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23480 }
23481
23482 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23483 DAG.getIntPtrConstant(0, dl));
23484
23485 if (IsStrict)
23486 return DAG.getMergeValues({Res, Chain}, dl);
23487
23488 return Res;
23489}
23490
23491static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23492 bool IsStrict = Op->isStrictFPOpcode();
23493 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23494 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23495 "Unexpected VT!");
23496
23497 SDLoc dl(Op);
23498 SDValue Res, Chain;
23499 if (IsStrict) {
23500 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23501 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23502 DAG.getIntPtrConstant(0, dl));
23503 Res = DAG.getNode(
23504 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23505 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23506 Chain = Res.getValue(1);
23507 } else {
23508 // FIXME: Should we use zeros for upper elements for non-strict?
23509 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23510 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23511 DAG.getTargetConstant(4, dl, MVT::i32));
23512 }
23513
23514 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23515 DAG.getIntPtrConstant(0, dl));
23516
23517 if (IsStrict)
23518 return DAG.getMergeValues({Res, Chain}, dl);
23519
23520 return Res;
23521}
23522
23523SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23524 SelectionDAG &DAG) const {
23525 SDLoc DL(Op);
23526 MakeLibCallOptions CallOptions;
23527 RTLIB::Libcall LC =
23528 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23529 SDValue Res =
23530 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23531 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23532 DAG.getBitcast(MVT::i32, Res));
23533}
23534
23535/// Depending on uarch and/or optimizing for size, we might prefer to use a
23536/// vector operation in place of the typical scalar operation.
23537static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23538 const X86Subtarget &Subtarget) {
23539 // If both operands have other uses, this is probably not profitable.
23540 SDValue LHS = Op.getOperand(0);
23541 SDValue RHS = Op.getOperand(1);
23542 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23543 return Op;
23544
23545 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23546 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23547 if (IsFP && !Subtarget.hasSSE3())
23548 return Op;
23549 if (!IsFP && !Subtarget.hasSSSE3())
23550 return Op;
23551
23552 // Extract from a common vector.
23553 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23554 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23555 LHS.getOperand(0) != RHS.getOperand(0) ||
23556 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23557 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23558 !shouldUseHorizontalOp(true, DAG, Subtarget))
23559 return Op;
23560
23561 // Allow commuted 'hadd' ops.
23562 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23563 unsigned HOpcode;
23564 switch (Op.getOpcode()) {
23565 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23566 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23567 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23568 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23569 default:
23570 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23571 }
23572 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23573 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23574 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23575 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23576 std::swap(LExtIndex, RExtIndex);
23577
23578 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23579 return Op;
23580
23581 SDValue X = LHS.getOperand(0);
23582 EVT VecVT = X.getValueType();
23583 unsigned BitWidth = VecVT.getSizeInBits();
23584 unsigned NumLanes = BitWidth / 128;
23585 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23586 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23587 "Not expecting illegal vector widths here");
23588
23589 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23590 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23591 SDLoc DL(Op);
23592 if (BitWidth == 256 || BitWidth == 512) {
23593 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23594 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23595 LExtIndex %= NumEltsPerLane;
23596 }
23597
23598 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23599 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23600 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23601 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23602 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23603 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23604 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23605}
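As an aside, a minimal standalone C++ sketch (not part of this file) of what a single-source horizontal add computes; it illustrates why extracting lane LExtIndex/2 of (hadd X, X) reproduces the original scalar add of adjacent elements.

#include <cstdio>

// Scalar model of HADDPS with both sources equal to X: the result lanes are
// {X0+X1, X2+X3, X0+X1, X2+X3}.
static void haddpsSameSource(const float (&X)[4], float (&R)[4]) {
  R[0] = X[0] + X[1];
  R[1] = X[2] + X[3];
  R[2] = X[0] + X[1]; // second source operand is X again
  R[3] = X[2] + X[3];
}

int main() {
  float X[4] = {1, 2, 3, 4}, R[4];
  haddpsSameSource(X, R);
  // add (extractelt X,0), (extractelt X,1) is lane 0; indices 2/3 map to lane 1.
  std::printf("%g %g\n", R[0], R[1]); // 3 7
  return 0;
}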
23606
23607/// Depending on uarch and/or optimizing for size, we might prefer to use a
23608/// vector operation in place of the typical scalar operation.
23609SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23610 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23611 "Only expecting float/double");
23612 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23613}
23614
23615/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23616/// This mode isn't supported in hardware on X86. But as long as we aren't
23617/// compiling with trapping math, we can emulate this with
23618/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23619static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23620 SDValue N0 = Op.getOperand(0);
23621 SDLoc dl(Op);
23622 MVT VT = Op.getSimpleValueType();
23623
23624 // N0 += copysign(nextafter(0.5, 0.0), N0)
23625 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23626 bool Ignored;
23627 APFloat Point5Pred = APFloat(0.5f);
23628 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23629 Point5Pred.next(/*nextDown*/true);
23630
23631 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23632 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23633 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23634
23635 // Truncate the result to remove fraction.
23636 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23637}
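To see why the predecessor of 0.5 is used rather than 0.5 itself, here is a small standalone sketch (plain C float math, not part of this file): adding exactly 0.5 to the largest float below 0.5 would round the sum up to 1.0 under ties-to-even, giving the wrong FROUND result.

#include <cmath>
#include <cstdio>

// Emulation used above: trunc(X + copysign(nextafter(0.5, 0.0), X)).
static float roundTiesAway(float X) {
  float Pred = std::nextafterf(0.5f, 0.0f); // largest float < 0.5
  return std::truncf(X + std::copysignf(Pred, X));
}

int main() {
  float X = std::nextafterf(0.5f, 0.0f); // 0.49999997f, should round to 0
  float Naive = std::truncf(X + std::copysignf(0.5f, X));
  std::printf("naive=%g emulated=%g roundf=%g\n", Naive, roundTiesAway(X),
              std::roundf(X)); // naive=1, the other two are 0
  return 0;
}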
23638
23639/// The only differences between FABS and FNEG are the mask and the logic op.
23640/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23641static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23642 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23643 "Wrong opcode for lowering FABS or FNEG.");
23644
23645 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23646
23647 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23648 // into an FNABS. We'll lower the FABS after that if it is still in use.
23649 if (IsFABS)
23650 for (SDNode *User : Op->uses())
23651 if (User->getOpcode() == ISD::FNEG)
23652 return Op;
23653
23654 SDLoc dl(Op);
23655 MVT VT = Op.getSimpleValueType();
23656
23657 bool IsF128 = (VT == MVT::f128);
23658 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23659 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23660 "Unexpected type in LowerFABSorFNEG");
23661
23662 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23663 // decide if we should generate a 16-byte constant mask when we only need 4 or
23664 // 8 bytes for the scalar case.
23665
23666 // There are no scalar bitwise logical SSE/AVX instructions, so we
23667 // generate a 16-byte vector constant and logic op even for the scalar case.
23668 // Using a 16-byte mask allows folding the load of the mask with
23669 // the logic op, so it can save ~4 bytes of code size.
23670 bool IsFakeVector = !VT.isVector() && !IsF128;
23671 MVT LogicVT = VT;
23672 if (IsFakeVector)
23673 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23674 : (VT == MVT::f32) ? MVT::v4f32
23675 : MVT::v8f16;
23676
23677 unsigned EltBits = VT.getScalarSizeInBits();
23678 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23679 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23680 APInt::getSignMask(EltBits);
23681 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23682 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23683
23684 SDValue Op0 = Op.getOperand(0);
23685 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23686 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23687 IsFNABS ? X86ISD::FOR :
23688 X86ISD::FXOR;
23689 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23690
23691 if (VT.isVector() || IsF128)
23692 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23693
23694 // For the scalar case extend to a 128-bit vector, perform the logic op,
23695 // and extract the scalar result back out.
23696 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23697 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23699 DAG.getIntPtrConstant(0, dl));
23700}
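A standalone scalar sketch (not part of this file) of the masks this lowering materializes: FABS is an AND that clears the sign bit, FNEG is an XOR that flips it.

#include <cstdint>
#include <cstdio>
#include <cstring>

static float fabsBitwise(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu; // APInt::getSignedMaxValue(32), the FABS mask
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

static float fnegBitwise(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // APInt::getSignMask(32), the FNEG mask
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

int main() {
  std::printf("%g %g\n", fabsBitwise(-2.5f), fnegBitwise(-2.5f)); // 2.5 2.5
  return 0;
}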
23701
23702static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23703 SDValue Mag = Op.getOperand(0);
23704 SDValue Sign = Op.getOperand(1);
23705 SDLoc dl(Op);
23706
23707 // If the sign operand is smaller, extend it first.
23708 MVT VT = Op.getSimpleValueType();
23709 if (Sign.getSimpleValueType().bitsLT(VT))
23710 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23711
23712 // And if it is bigger, shrink it first.
23713 if (Sign.getSimpleValueType().bitsGT(VT))
23714 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23715 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23716
23717 // At this point the operands and the result should have the same
23718 // type, and that won't be f80 since that is not custom lowered.
23719 bool IsF128 = (VT == MVT::f128);
23720 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23721 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23722 "Unexpected type in LowerFCOPYSIGN");
23723
23724 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23725
23726 // Perform all scalar logic operations as 16-byte vectors because there are no
23727 // scalar FP logic instructions in SSE.
23728 // TODO: This isn't necessary. If we used scalar types, we might avoid some
23729 // unnecessary splats, but we might miss load folding opportunities. Should
23730 // this decision be based on OptimizeForSize?
23731 bool IsFakeVector = !VT.isVector() && !IsF128;
23732 MVT LogicVT = VT;
23733 if (IsFakeVector)
23734 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23735 : (VT == MVT::f32) ? MVT::v4f32
23736 : MVT::v8f16;
23737
23738 // The mask constants are automatically splatted for vector types.
23739 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23740 SDValue SignMask = DAG.getConstantFP(
23741 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
23742 SDValue MagMask = DAG.getConstantFP(
23743 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
23744
23745 // First, clear all bits but the sign bit from the second operand (sign).
23746 if (IsFakeVector)
23747 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
23748 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
23749
23750 // Next, clear the sign bit from the first operand (magnitude).
23751 // TODO: If we had general constant folding for FP logic ops, this check
23752 // wouldn't be necessary.
23753 SDValue MagBits;
23754 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
23755 APFloat APF = Op0CN->getValueAPF();
23756 APF.clearSign();
23757 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
23758 } else {
23759 // If the magnitude operand wasn't a constant, we need to AND out the sign.
23760 if (IsFakeVector)
23761 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
23762 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
23763 }
23764
23765 // OR the magnitude value with the sign bit.
23766 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
23767 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
23768 DAG.getIntPtrConstant(0, dl));
23769}
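Likewise, a standalone sketch (not part of this file) of the FAND/FAND/FOR sequence emitted above: keep the magnitude bits of the first operand, isolate the sign bit of the second, then OR them together.

#include <cstdint>
#include <cstdio>
#include <cstring>

static float copysignBitwise(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t R = (M & 0x7fffffffu)  // clear the sign bit of the magnitude
             | (S & 0x80000000u); // keep only the sign bit of the sign operand
  float Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}

int main() {
  std::printf("%g %g\n", copysignBitwise(3.0f, -0.0f),
              copysignBitwise(-3.0f, 1.0f)); // -3 3
  return 0;
}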
23770
23771static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
23772 SDValue N0 = Op.getOperand(0);
23773 SDLoc dl(Op);
23774 MVT VT = Op.getSimpleValueType();
23775
23776 MVT OpVT = N0.getSimpleValueType();
23777 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
23778 "Unexpected type for FGETSIGN");
23779
23780 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
23781 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
23782 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
23783 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
23784 Res = DAG.getZExtOrTrunc(Res, dl, VT);
23785 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
23786 return Res;
23787}
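A standalone model (not part of this file) of the MOVMSK-based lowering: MOVMSKPS packs the per-lane sign bits into an integer, so ANDing with 1 extracts the sign of element 0, which is exactly what FGETSIGN asks for.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar model of MOVMSKPS on a v4f32 value.
static unsigned movmskPS(const float (&V)[4]) {
  unsigned Mask = 0;
  for (int I = 0; I != 4; ++I) {
    uint32_t Bits;
    std::memcpy(&Bits, &V[I], sizeof(Bits));
    Mask |= (Bits >> 31) << I;
  }
  return Mask;
}

int main() {
  float V[4] = {-1.0f, 2.0f, -0.0f, 4.0f};
  std::printf("mask=%u fgetsign=%u\n", movmskPS(V), movmskPS(V) & 1u); // 5 1
  return 0;
}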
23788
23789/// Helper for attempting to create a X86ISD::BT node.
23790static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
23791 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
23792 // instruction. Since the shift amount is in-range-or-undefined, we know
23793 // that doing a bittest on the i32 value is ok. We extend to i32 because
23794 // the encoding for the i16 version is larger than the i32 version.
23795 // Also promote i16 to i32 for performance / code size reason.
23796 if (Src.getValueType().getScalarSizeInBits() < 32)
23797 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
23798
23799 // No legal type found, give up.
23800 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
23801 return SDValue();
23802
23803 // See if we can use the 32-bit instruction instead of the 64-bit one for a
23804 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
23805 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
23806 // known to be zero.
23807 if (Src.getValueType() == MVT::i64 &&
23808 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
23809 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
23810
23811 // If the operand types disagree, extend the shift amount to match. Since
23812 // BT ignores high bits (like shifts) we can use anyextend.
23813 if (Src.getValueType() != BitNo.getValueType()) {
23814 // Peek through a mask/modulo operation.
23815 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
23816 // we probably need a better IsDesirableToPromoteOp to handle this as well.
23817 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
23818 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
23819 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
23820 BitNo.getOperand(0)),
23821 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
23822 BitNo.getOperand(1)));
23823 else
23824 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
23825 }
23826
23827 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
23828}
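A standalone sketch (not part of this file) of the BT semantics this helper relies on: the register form takes the bit index modulo the operand width, so the high bits of BitNo are ignored and any_extend of the index is safe.

#include <cstdint>
#include <cstdio>

// What "bt r32, r32" computes into CF.
static bool bt32(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo % 32u)) & 1u;
}

int main() {
  // Index 33 behaves like index 1 because only BitNo mod 32 matters.
  std::printf("%d %d\n", bt32(0x0au, 1), bt32(0x0au, 33)); // 1 1
  return 0;
}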
23829
23830/// Helper for creating a X86ISD::SETCC node.
23831static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
23832 SelectionDAG &DAG) {
23833 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
23834 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
23835}
23836
23837/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
23838/// style scalarized (associative) reduction patterns. Partial reductions
23839/// are supported when the pointer SrcMask is non-null.
23840/// TODO - move this to SelectionDAG?
23841static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
23842 SmallVectorImpl<SDValue> &SrcOps,
23843 SmallVectorImpl<APInt> *SrcMask = nullptr) {
23844 SmallVector<SDValue, 8> Opnds;
23845 DenseMap<SDValue, APInt> SrcOpMap;
23846 EVT VT = MVT::Other;
23847
23848 // Recognize a special case where a vector is cast into a wide integer to
23849 // test all 0s.
23850 assert(Op.getOpcode() == unsigned(BinOp) &&
23851 "Unexpected bit reduction opcode");
23852 Opnds.push_back(Op.getOperand(0));
23853 Opnds.push_back(Op.getOperand(1));
23854
23855 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23856 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
23857 // BFS traverse all BinOp operands.
23858 if (I->getOpcode() == unsigned(BinOp)) {
23859 Opnds.push_back(I->getOperand(0));
23860 Opnds.push_back(I->getOperand(1));
23861 // Re-evaluate the number of nodes to be traversed.
23862 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23863 continue;
23864 }
23865
23866 // Quit if this is not an EXTRACT_VECTOR_ELT.
23867 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23868 return false;
23869
23870 // Quit if there is no constant index.
23871 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23872 if (!Idx)
23873 return false;
23874
23875 SDValue Src = I->getOperand(0);
23876 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23877 if (M == SrcOpMap.end()) {
23878 VT = Src.getValueType();
23879 // Quit if not the same type.
23880 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23881 return false;
23882 unsigned NumElts = VT.getVectorNumElements();
23883 APInt EltCount = APInt::getZero(NumElts);
23884 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23885 SrcOps.push_back(Src);
23886 }
23887
23888 // Quit if element already used.
23889 unsigned CIdx = Idx->getZExtValue();
23890 if (M->second[CIdx])
23891 return false;
23892 M->second.setBit(CIdx);
23893 }
23894
23895 if (SrcMask) {
23896 // Collect the source partial masks.
23897 for (SDValue &SrcOp : SrcOps)
23898 SrcMask->push_back(SrcOpMap[SrcOp]);
23899 } else {
23900 // Quit if not all elements are used.
23901 for (const auto &I : SrcOpMap)
23902 if (!I.second.isAllOnes())
23903 return false;
23904 }
23905
23906 return true;
23907}
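What the matched tree computes, as a standalone sketch (not part of this file): once every lane of one source vector has been OR'd in, the scalarized tree is just an "is any bit of the vector set" test, which the callers below turn into PTEST or MOVMSK+PCMPEQ.

#include <cstdint>
#include <cstdio>

// Scalarized OR-reduction over the elements of a 4 x i32 "vector".
static uint32_t orReduce(const uint32_t (&V)[4]) {
  return V[0] | V[1] | V[2] | V[3];
}

int main() {
  uint32_t AllZero[4] = {0, 0, 0, 0};
  uint32_t OneSet[4] = {0, 8, 0, 0};
  std::printf("%d %d\n", orReduce(AllZero) == 0, orReduce(OneSet) == 0); // 1 0
  return 0;
}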
23908
23909// Helper function for comparing all bits of a vector against zero.
23910static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
23911 const APInt &Mask,
23912 const X86Subtarget &Subtarget,
23913 SelectionDAG &DAG, X86::CondCode &X86CC) {
23914 EVT VT = V.getValueType();
23915 unsigned ScalarSize = VT.getScalarSizeInBits();
23916 if (Mask.getBitWidth() != ScalarSize) {
23917 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23918 return SDValue();
23919 }
23920
23921 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23922 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23923
23924 auto MaskBits = [&](SDValue Src) {
23925 if (Mask.isAllOnes())
23926 return Src;
23927 EVT SrcVT = Src.getValueType();
23928 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23929 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23930 };
23931
23932 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23933 if (VT.getSizeInBits() < 128) {
23934 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23935 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
23936 return SDValue();
23937 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23938 DAG.getBitcast(IntVT, MaskBits(V)),
23939 DAG.getConstant(0, DL, IntVT));
23940 }
23941
23942 // Quit if not splittable to 128/256-bit vector.
23943 if (!isPowerOf2_32(VT.getSizeInBits()))
23944 return SDValue();
23945
23946 // Split down to 128/256-bit vector.
23947 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
23948 while (VT.getSizeInBits() > TestSize) {
23949 auto Split = DAG.SplitVector(V, DL);
23950 VT = Split.first.getValueType();
23951 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23952 }
23953
23954 bool UsePTEST = Subtarget.hasSSE41();
23955 if (UsePTEST) {
23956 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
23957 V = DAG.getBitcast(TestVT, MaskBits(V));
23958 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23959 }
23960
23961 // Without PTEST, a masked v2i64 or-reduction is not faster than
23962 // scalarization.
23963 if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
23964 return SDValue();
23965
23966 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
23967 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
23968 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
23969 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23970 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23971 DAG.getConstant(0xFFFF, DL, MVT::i32));
23972}
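A standalone model (not part of this file) of the pre-SSE4.1 fallback emitted above: PCMPEQB against zero produces 0xFF for each zero byte, MOVMSK collects the byte sign bits, and the vector is all-zero exactly when that mask equals 0xFFFF.

#include <cstdint>
#include <cstdio>

static bool allZeroV16i8(const uint8_t (&V)[16]) {
  unsigned Mask = 0;
  for (int I = 0; I != 16; ++I)
    Mask |= (V[I] == 0 ? 1u : 0u) << I; // PCMPEQB with zero, then MOVMSK
  return Mask == 0xFFFFu;               // CMP against 0xFFFF
}

int main() {
  uint8_t Zero[16] = {};
  uint8_t NonZero[16] = {};
  NonZero[7] = 1;
  std::printf("%d %d\n", allZeroV16i8(Zero), allZeroV16i8(NonZero)); // 1 0
  return 0;
}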
23973
23974// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
23975// CMP(MOVMSK(PCMPEQB(X,0))).
23976static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
23977 const SDLoc &DL,
23978 const X86Subtarget &Subtarget,
23979 SelectionDAG &DAG, SDValue &X86CC) {
23980 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23981
23982 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23983 return SDValue();
23984
23985 // Check whether we're masking/truncating an OR-reduction result, in which
23986 // case track the masked bits.
23987 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23988 switch (Op.getOpcode()) {
23989 case ISD::TRUNCATE: {
23990 SDValue Src = Op.getOperand(0);
23991 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23992 Op.getScalarValueSizeInBits());
23993 Op = Src;
23994 break;
23995 }
23996 case ISD::AND: {
23997 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23998 Mask = Cst->getAPIntValue();
23999 Op = Op.getOperand(0);
24000 }
24001 break;
24002 }
24003 }
24004
24005 SmallVector<SDValue, 8> VecIns;
24006 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
24007 EVT VT = VecIns[0].getValueType();
24008 assert(llvm::all_of(VecIns,
24009 [VT](SDValue V) { return VT == V.getValueType(); }) &&
24010 "Reduction source vector mismatch");
24011
24012 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
24013 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
24014 return SDValue();
24015
24016 // If more than one full vector is evaluated, OR them first before PTEST.
24017 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24018 Slot += 2, e += 1) {
24019 // Each iteration will OR 2 nodes and append the result until there is
24020 // only 1 node left, i.e. the final OR'd value of all vectors.
24021 SDValue LHS = VecIns[Slot];
24022 SDValue RHS = VecIns[Slot + 1];
24023 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
24024 }
24025
24026 X86::CondCode CCode;
24027 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
24028 DAG, CCode)) {
24029 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
24030 return V;
24031 }
24032 }
24033
24034 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24035 ISD::NodeType BinOp;
24036 if (SDValue Match =
24037 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
24038 X86::CondCode CCode;
24039 if (SDValue V =
24040 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
24041 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
24042 return V;
24043 }
24044 }
24045 }
24046
24047 return SDValue();
24048}
24049
24050/// Return true if \c Op has a use that doesn't just read flags.
24051static bool hasNonFlagsUse(SDValue Op) {
24052 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24053 ++UI) {
24054 SDNode *User = *UI;
24055 unsigned UOpNo = UI.getOperandNo();
24056 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24057 // Look past the truncate.
24058 UOpNo = User->use_begin().getOperandNo();
24059 User = *User->use_begin();
24060 }
24061
24062 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24063 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24064 return true;
24065 }
24066 return false;
24067}
24068
24069// Transform to an x86-specific ALU node with flags if there is a chance of
24070// using an RMW op or only the flags are used. Otherwise, leave
24071// the node alone and emit a 'cmp' or 'test' instruction.
24072static bool isProfitableToUseFlagOp(SDValue Op) {
24073 for (SDNode *U : Op->uses())
24074 if (U->getOpcode() != ISD::CopyToReg &&
24075 U->getOpcode() != ISD::SETCC &&
24076 U->getOpcode() != ISD::STORE)
24077 return false;
24078
24079 return true;
24080}
24081
24082/// Emit nodes that will be selected as "test Op0,Op0", or something
24083/// equivalent.
24084static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24085 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24086 // CF and OF aren't always set the way we want. Determine which
24087 // of these we need.
24088 bool NeedCF = false;
24089 bool NeedOF = false;
24090 switch (X86CC) {
24091 default: break;
24092 case X86::COND_A: case X86::COND_AE:
24093 case X86::COND_B: case X86::COND_BE:
24094 NeedCF = true;
24095 break;
24096 case X86::COND_G: case X86::COND_GE:
24097 case X86::COND_L: case X86::COND_LE:
24098 case X86::COND_O: case X86::COND_NO: {
24099 // Check if we really need to set the Overflow flag.
24100 // If NoSignedWrap is present,
24101 // that is not actually needed.
24102 switch (Op->getOpcode()) {
24103 case ISD::ADD:
24104 case ISD::SUB:
24105 case ISD::MUL:
24106 case ISD::SHL:
24107 if (Op.getNode()->getFlags().hasNoSignedWrap())
24108 break;
24109 [[fallthrough]];
24110 default:
24111 NeedOF = true;
24112 break;
24113 }
24114 break;
24115 }
24116 }
24117 // See if we can use the EFLAGS value from the operand instead of
24118 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24119 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24120 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24121 // Emit a CMP with 0, which is the TEST pattern.
24122 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24123 DAG.getConstant(0, dl, Op.getValueType()));
24124 }
24125 unsigned Opcode = 0;
24126 unsigned NumOperands = 0;
24127
24128 SDValue ArithOp = Op;
24129
24130 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24131 // which may be the result of a CAST. We use the variable 'Op', which is the
24132 // non-casted variable when we check for possible users.
24133 switch (ArithOp.getOpcode()) {
24134 case ISD::AND:
24135 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24136 // because a TEST instruction will be better.
24137 if (!hasNonFlagsUse(Op))
24138 break;
24139
24140 [[fallthrough]];
24141 case ISD::ADD:
24142 case ISD::SUB:
24143 case ISD::OR:
24144 case ISD::XOR:
24145 if (!isProfitableToUseFlagOp(Op))
24146 break;
24147
24148 // Otherwise use a regular EFLAGS-setting instruction.
24149 switch (ArithOp.getOpcode()) {
24150 default: llvm_unreachable("unexpected operator!");
24151 case ISD::ADD: Opcode = X86ISD::ADD; break;
24152 case ISD::SUB: Opcode = X86ISD::SUB; break;
24153 case ISD::XOR: Opcode = X86ISD::XOR; break;
24154 case ISD::AND: Opcode = X86ISD::AND; break;
24155 case ISD::OR: Opcode = X86ISD::OR; break;
24156 }
24157
24158 NumOperands = 2;
24159 break;
24160 case X86ISD::ADD:
24161 case X86ISD::SUB:
24162 case X86ISD::OR:
24163 case X86ISD::XOR:
24164 case X86ISD::AND:
24165 return SDValue(Op.getNode(), 1);
24166 case ISD::SSUBO:
24167 case ISD::USUBO: {
24168 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24169 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24170 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24171 Op->getOperand(1)).getValue(1);
24172 }
24173 default:
24174 break;
24175 }
24176
24177 if (Opcode == 0) {
24178 // Emit a CMP with 0, which is the TEST pattern.
24179 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24180 DAG.getConstant(0, dl, Op.getValueType()));
24181 }
24182 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24183 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24184
24185 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24186 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24187 return SDValue(New.getNode(), 1);
24188}
24189
24190/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24191/// equivalent.
24192static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24193 const SDLoc &dl, SelectionDAG &DAG,
24194 const X86Subtarget &Subtarget) {
24195 if (isNullConstant(Op1))
24196 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24197
24198 EVT CmpVT = Op0.getValueType();
24199
24200 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24201 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24202
24203 // Only promote the compare up to I32 if it is a 16 bit operation
24204 // with an immediate. 16 bit immediates are to be avoided.
24205 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24206 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24207 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24208 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24209 // Don't do this if the immediate can fit in 8-bits.
24210 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24211 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24212 unsigned ExtendOp =
24213 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24214 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24215 // For equality comparisons try to use SIGN_EXTEND if the input was
24216 // truncate from something with enough sign bits.
24217 if (Op0.getOpcode() == ISD::TRUNCATE) {
24218 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24219 ExtendOp = ISD::SIGN_EXTEND;
24220 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24221 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24222 ExtendOp = ISD::SIGN_EXTEND;
24223 }
24224 }
24225
24226 CmpVT = MVT::i32;
24227 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24228 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24229 }
24230 }
24231
24232 // Try to shrink i64 compares if the input has enough zero bits.
24233 // FIXME: Do this for non-constant compares for constant on LHS?
24234 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24235 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24236 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24237 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24238 CmpVT = MVT::i32;
24239 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24240 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24241 }
24242
24243 // 0-x == y --> x+y == 0
24244 // 0-x != y --> x+y != 0
24245 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24246 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24247 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24248 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24249 return Add.getValue(1);
24250 }
24251
24252 // x == 0-y --> x+y == 0
24253 // x != 0-y --> x+y != 0
24254 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24255 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24256 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24257 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24258 return Add.getValue(1);
24259 }
24260
24261 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24262 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24263 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24264 return Sub.getValue(1);
24265}
24266
24267/// Check if replacement of SQRT with RSQRT should be disabled.
24268bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24269 EVT VT = Op.getValueType();
24270
24271 // We don't need to replace SQRT with RSQRT for half type.
24272 if (VT.getScalarType() == MVT::f16)
24273 return true;
24274
24275 // We never want to use both SQRT and RSQRT instructions for the same input.
24276 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24277 return false;
24278
24279 if (VT.isVector())
24280 return Subtarget.hasFastVectorFSQRT();
24281 return Subtarget.hasFastScalarFSQRT();
24282}
24283
24284/// The minimum architected relative accuracy is 2^-12. We need one
24285/// Newton-Raphson step to have a good float result (24 bits of precision).
24286SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24287 SelectionDAG &DAG, int Enabled,
24288 int &RefinementSteps,
24289 bool &UseOneConstNR,
24290 bool Reciprocal) const {
24291 SDLoc DL(Op);
24292 EVT VT = Op.getValueType();
24293
24294 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24295 // It is likely not profitable to do this for f64 because a double-precision
24296 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24297 // instructions: convert to single, rsqrtss, convert back to double, refine
24298 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24299 // along with FMA, this could be a throughput win.
24300 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24301 // after legalize types.
24302 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24303 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24304 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24305 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24306 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24307 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24308 RefinementSteps = 1;
24309
24310 UseOneConstNR = false;
24311 // There is no 512-bit FRSQRT, but there is RSQRT14.
24312 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24313 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24314 if (RefinementSteps == 0 && !Reciprocal)
24315 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24316 return Estimate;
24317 }
24318
24319 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24320 Subtarget.hasFP16()) {
24321 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24322 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24323 RefinementSteps = 0;
24324
24325 if (VT == MVT::f16) {
24326 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24327 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24328 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24329 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24330 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24331 }
24332
24333 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24334 }
24335 return SDValue();
24336}
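A standalone sketch (not part of this file) of the refinement the caller applies to the returned estimate: one Newton-Raphson step for 1/sqrt(a) is x1 = x0 * (1.5 - 0.5 * a * x0 * x0), which takes the ~2^-12-accurate hardware estimate to roughly full float precision, hence the RefinementSteps = 1 default.

#include <cmath>
#include <cstdio>

static float refineRsqrt(float A, float Est) {
  return Est * (1.5f - 0.5f * A * Est * Est); // one Newton-Raphson step
}

int main() {
  float A = 2.0f;
  float Est = 0.707f; // stand-in for the RSQRTSS estimate (illustrative value)
  std::printf("%.8f vs %.8f\n", refineRsqrt(A, Est), 1.0f / std::sqrt(A));
  return 0;
}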
24337
24338/// The minimum architected relative accuracy is 2^-12. We need one
24339/// Newton-Raphson step to have a good float result (24 bits of precision).
24340SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24341 int Enabled,
24342 int &RefinementSteps) const {
24343 SDLoc DL(Op);
24344 EVT VT = Op.getValueType();
24345
24346 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24347 // It is likely not profitable to do this for f64 because a double-precision
24348 // reciprocal estimate with refinement on x86 prior to FMA requires
24349 // 15 instructions: convert to single, rcpss, convert back to double, refine
24350 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24351 // along with FMA, this could be a throughput win.
24352
24353 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24354 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24355 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24356 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24357 // Enable estimate codegen with 1 refinement step for vector division.
24358 // Scalar division estimates are disabled because they break too much
24359 // real-world code. These defaults are intended to match GCC behavior.
24360 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24361 return SDValue();
24362
24363 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24364 RefinementSteps = 1;
24365
24366 // There is no 512-bit FRCP, but there is RCP14.
24367 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24368 return DAG.getNode(Opcode, DL, VT, Op);
24369 }
24370
24371 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24372 Subtarget.hasFP16()) {
24373 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24374 RefinementSteps = 0;
24375
24376 if (VT == MVT::f16) {
24377 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24378 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24379 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24380 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24381 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24382 }
24383
24384 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24385 }
24386 return SDValue();
24387}
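The matching sketch for the reciprocal (standalone, not part of this file): one Newton-Raphson step for 1/a is x1 = x0 * (2 - a * x0), again enough to refine the ~2^-12 RCPPS estimate to roughly float precision.

#include <cstdio>

static float refineRecip(float A, float Est) {
  return Est * (2.0f - A * Est); // one Newton-Raphson step
}

int main() {
  float A = 3.0f;
  float Est = 0.333f; // stand-in for the RCPPS estimate (illustrative value)
  std::printf("%.8f vs %.8f\n", refineRecip(A, Est), 1.0f / A);
  return 0;
}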
24388
24389/// If we have at least two divisions that use the same divisor, convert to
24390/// multiplication by a reciprocal. This may need to be adjusted for a given
24391/// CPU if a division's cost is not at least twice the cost of a multiplication.
24392/// This is because we still need one division to calculate the reciprocal and
24393/// then we need two multiplies by that reciprocal as replacements for the
24394/// original divisions.
24395unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24396 return 2;
24397}
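As an illustration (standalone sketch, not part of this file) of the rewrite this threshold enables: with two divisions sharing a divisor, one reciprocal plus two multiplies replaces two divides. The reassociation is only legal under the usual fast-math style constraints, and results may differ in the last ulp.

#include <cstdio>

static void divideBoth(float A, float B, float D, float &X, float &Y) {
  float Recip = 1.0f / D; // the single remaining divide
  X = A * Recip;          // replaces A / D
  Y = B * Recip;          // replaces B / D
}

int main() {
  float X, Y;
  divideBoth(10.0f, 20.0f, 4.0f, X, Y);
  std::printf("%g %g\n", X, Y); // 2.5 5
  return 0;
}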
24398
24399SDValue
24400X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24401 SelectionDAG &DAG,
24402 SmallVectorImpl<SDNode *> &Created) const {
24403 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24404 if (isIntDivCheap(N->getValueType(0), Attr))
24405 return SDValue(N,0); // Lower SDIV as SDIV
24406
24407 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
24408 "Unexpected divisor!");
24409
24410 // Only perform this transform if CMOV is supported otherwise the select
24411 // below will become a branch.
24412 if (!Subtarget.canUseCMOV())
24413 return SDValue();
24414
24415 // fold (sdiv X, pow2)
24416 EVT VT = N->getValueType(0);
24417 // FIXME: Support i8.
24418 if (VT != MVT::i16 && VT != MVT::i32 &&
24419 !(Subtarget.is64Bit() && VT == MVT::i64))
24420 return SDValue();
24421
24422 unsigned Lg2 = Divisor.countTrailingZeros();
24423
24424 // If the divisor is 2 or -2, the default expansion is better.
24425 if (Lg2 == 1)
24426 return SDValue();
24427
24428 SDLoc DL(N);
24429 SDValue N0 = N->getOperand(0);
24430 SDValue Zero = DAG.getConstant(0, DL, VT);
24431 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
24432 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
24433
24434 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
24435 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
24436 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
24437 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
24438
24439 Created.push_back(Cmp.getNode());
24440 Created.push_back(Add.getNode());
24441 Created.push_back(CMov.getNode());
24442
24443 // Divide by pow2.
24444 SDValue SRA =
24445 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
24446
24447 // If we're dividing by a positive value, we're done. Otherwise, we must
24448 // negate the result.
24449 if (Divisor.isNonNegative())
24450 return SRA;
24451
24452 Created.push_back(SRA.getNode());
24453 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
24454}
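A standalone scalar sketch (not part of this file) of the sequence built above: signed division by a power of two rounds toward zero, so negative inputs get (2^Lg2 - 1) added (the CMOV) before the arithmetic shift, and the quotient is negated when the divisor was negative.

#include <cstdio>

static int sdivPow2(int N, unsigned Lg2, bool NegativeDivisor) {
  int Adjusted = N < 0 ? N + ((1 << Lg2) - 1) : N; // SETCC + ADD + CMOV
  int Quotient = Adjusted >> Lg2;                  // SRA
  return NegativeDivisor ? -Quotient : Quotient;   // SUB 0, Quotient if needed
}

int main() {
  // -7 / 4 must be -1 (round toward zero); a bare arithmetic shift gives -2.
  std::printf("%d %d %d\n", sdivPow2(-7, 2, false), -7 / 4, -7 >> 2);
  return 0;
}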
24455
24456/// Result of 'and' is compared against zero. Change to a BT node if possible.
24457/// Returns the BT node and the condition code needed to use it.
24458static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
24459 SelectionDAG &DAG, X86::CondCode &X86CC) {
24460 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
24461 SDValue Op0 = And.getOperand(0);
24462 SDValue Op1 = And.getOperand(1);
24463 if (Op0.getOpcode() == ISD::TRUNCATE)
24464 Op0 = Op0.getOperand(0);
24465 if (Op1.getOpcode() == ISD::TRUNCATE)
24466 Op1 = Op1.getOperand(0);
24467
24468 SDValue Src, BitNo;
24469 if (Op1.getOpcode() == ISD::SHL)
24470 std::swap(Op0, Op1);
24471 if (Op0.getOpcode() == ISD::SHL) {
24472 if (isOneConstant(Op0.getOperand(0))) {
24473 // If we looked past a truncate, check that it's only truncating away
24474 // known zeros.
24475 unsigned BitWidth = Op0.getValueSizeInBits();
24476 unsigned AndBitWidth = And.getValueSizeInBits();
24477 if (BitWidth > AndBitWidth) {
24478 KnownBits Known = DAG.computeKnownBits(Op0);
24479 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
24480 return SDValue();
24481 }
24482 Src = Op1;
24483 BitNo = Op0.getOperand(1);
24484 }
24485 } else if (Op1.getOpcode() == ISD::Constant) {
24486 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
24487 uint64_t AndRHSVal = AndRHS->getZExtValue();
24488 SDValue AndLHS = Op0;
24489
24490 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
24491 Src = AndLHS.getOperand(0);
24492 BitNo = AndLHS.getOperand(1);
24493 } else {
24494 // Use BT if the immediate can't be encoded in a TEST instruction or we
24495 // are optimizing for size and the immediate won't fit in a byte.
24496 bool OptForSize = DAG.shouldOptForSize();
24497 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
24498 isPowerOf2_64(AndRHSVal)) {
24499 Src = AndLHS;
24500 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
24501 Src.getValueType());
24502 }
24503 }
24504 }
24505
24506 // No patterns found, give up.
24507 if (!Src.getNode())
24508 return SDValue();
24509
24510 // Remove any bit flip.
24511 if (isBitwiseNot(Src)) {
24512 Src = Src.getOperand(0);
24513 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
24514 }
24515
24516 // Attempt to create the X86ISD::BT node.
24517 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
24518 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24519 return BT;
24520 }
24521
24522 return SDValue();
24523}
24524
24525// Check if a pre-AVX condition code can be handled by a single FCMP op.
24526static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
24527 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
24528}
24529
24530/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
24531/// CMPs.
24532static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
24533 SDValue &Op1, bool &IsAlwaysSignaling) {
24534 unsigned SSECC;
24535 bool Swap = false;
24536
24537 // SSE Condition code mapping:
24538 // 0 - EQ
24539 // 1 - LT
24540 // 2 - LE
24541 // 3 - UNORD
24542 // 4 - NEQ
24543 // 5 - NLT
24544 // 6 - NLE
24545 // 7 - ORD
24546 switch (SetCCOpcode) {
24547 default: llvm_unreachable("Unexpected SETCC condition");
24548 case ISD::SETOEQ:
24549 case ISD::SETEQ: SSECC = 0; break;
24550 case ISD::SETOGT:
24551 case ISD::SETGT: Swap = true; [[fallthrough]];
24552 case ISD::SETLT:
24553 case ISD::SETOLT: SSECC = 1; break;
24554 case ISD::SETOGE:
24555 case ISD::SETGE: Swap = true; [[fallthrough]];
24556 case ISD::SETLE:
24557 case ISD::SETOLE: SSECC = 2; break;
24558 case ISD::SETUO: SSECC = 3; break;
24559 case ISD::SETUNE:
24560 case ISD::SETNE: SSECC = 4; break;
24561 case ISD::SETULE: Swap = true; [[fallthrough]];
24562 case ISD::SETUGE: SSECC = 5; break;
24563 case ISD::SETULT: Swap = true; [[fallthrough]];
24564 case ISD::SETUGT: SSECC = 6; break;
24565 case ISD::SETO: SSECC = 7; break;
24566 case ISD::SETUEQ: SSECC = 8; break;
24567 case ISD::SETONE: SSECC = 12; break;
24568 }
24569 if (Swap)
24570 std::swap(Op0, Op1);
24571
24572 switch (SetCCOpcode) {
24573 default:
24574 IsAlwaysSignaling = true;
24575 break;
24576 case ISD::SETEQ:
24577 case ISD::SETOEQ:
24578 case ISD::SETUEQ:
24579 case ISD::SETNE:
24580 case ISD::SETONE:
24581 case ISD::SETUNE:
24582 case ISD::SETO:
24583 case ISD::SETUO:
24584 IsAlwaysSignaling = false;
24585 break;
24586 }
24587
24588 return SSECC;
24589}
24590
24591/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
24592/// concatenate the result back.
24593static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
24594 ISD::CondCode Cond, SelectionDAG &DAG,
24595 const SDLoc &dl) {
24596 assert(VT.isInteger() && VT == LHS.getValueType() &&
24597 VT == RHS.getValueType() && "Unsupported VTs!");
24598
24599 SDValue CC = DAG.getCondCode(Cond);
24600
24601 // Extract the LHS Lo/Hi vectors
24602 SDValue LHS1, LHS2;
24603 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
24604
24605 // Extract the RHS Lo/Hi vectors
24606 SDValue RHS1, RHS2;
24607 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
24608
24609 // Issue the operation on the smaller types and concatenate the result back
24610 EVT LoVT, HiVT;
24611 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
24612 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
24613 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
24614 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
24615}
24616
24617static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
24618
24619 SDValue Op0 = Op.getOperand(0);
24620 SDValue Op1 = Op.getOperand(1);
24621 SDValue CC = Op.getOperand(2);
24622 MVT VT = Op.getSimpleValueType();
24623 SDLoc dl(Op);
24624
24625 assert(VT.getVectorElementType() == MVT::i1 &&
24626 "Cannot set masked compare for this operation");
24627
24628 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
24629
24630 // Prefer SETGT over SETLT.
24631 if (SetCCOpcode == ISD::SETLT) {
24632 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
24633 std::swap(Op0, Op1);
24634 }
24635
24636 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
24637}
24638
24639/// Given a buildvector constant, return a new vector constant with each element
24640/// incremented or decremented. If incrementing or decrementing would result in
24641/// unsigned overflow or underflow or this is not a simple vector constant,
24642/// return an empty value.
24643static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
24644 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
24645 if (!BV)
24646 return SDValue();
24647
24648 MVT VT = V.getSimpleValueType();
24649 MVT EltVT = VT.getVectorElementType();
24650 unsigned NumElts = VT.getVectorNumElements();
24651 SmallVector<SDValue, 8> NewVecC;
24652 SDLoc DL(V);
24653 for (unsigned i = 0; i < NumElts; ++i) {
24654 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
24655 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
24656 return SDValue();
24657
24658 // Avoid overflow/underflow.
24659 const APInt &EltC = Elt->getAPIntValue();
24660 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
24661 return SDValue();
24662
24663 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
24664 }
24665
24666 return DAG.getBuildVector(VT, DL, NewVecC);
24667}
24668
24669/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
24670/// Op0 u<= Op1:
24671/// t = psubus Op0, Op1
24672/// pcmpeq t, <0..0>
24673static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
24674 ISD::CondCode Cond, const SDLoc &dl,
24675 const X86Subtarget &Subtarget,
24676 SelectionDAG &DAG) {
24677 if (!Subtarget.hasSSE2())
24678 return SDValue();
24679
24680 MVT VET = VT.getVectorElementType();
24681 if (VET != MVT::i8 && VET != MVT::i16)
24682 return SDValue();
24683
24684 switch (Cond) {
24685 default:
24686 return SDValue();
24687 case ISD::SETULT: {
24688 // If the comparison is against a constant we can turn this into a
24689 // setule. With psubus, setule does not require a swap. This is
24690 // beneficial because the constant in the register is no longer
24691     // clobbered as the destination, so it can be hoisted out of a loop.
24692 // Only do this pre-AVX since vpcmp* is no longer destructive.
24693 if (Subtarget.hasAVX())
24694 return SDValue();
24695 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
24696 if (!ULEOp1)
24697 return SDValue();
24698 Op1 = ULEOp1;
24699 break;
24700 }
24701 case ISD::SETUGT: {
24702 // If the comparison is against a constant, we can turn this into a setuge.
24703 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24704 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24705 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24706 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
24707 if (!UGEOp1)
24708 return SDValue();
24709 Op1 = Op0;
24710 Op0 = UGEOp1;
24711 break;
24712 }
24713 // Psubus is better than flip-sign because it requires no inversion.
24714 case ISD::SETUGE:
24715 std::swap(Op0, Op1);
24716 break;
24717 case ISD::SETULE:
24718 break;
24719 }
24720
24721 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24722 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24723 DAG.getConstant(0, dl, VT));
24724}
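
The PSUBUS lowering rests on a simple identity: for unsigned values, a <= b holds exactly when the saturating difference a -sat b is zero, which is what the USUBSAT + PCMPEQ pair computes per lane. A small standalone sketch, assuming 8-bit lanes; the helper names are hypothetical and the code is not taken from the LLVM tree.

#include <cstdint>

// Illustrative sketch of the PSUBUS/USUBSAT equivalence, 8-bit lanes assumed.
constexpr uint8_t usubsat8(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);
}
constexpr bool uleViaSubus(uint8_t A, uint8_t B) {
  return usubsat8(A, B) == 0;  // a <=u b  <=>  usubsat(a, b) == 0
}
static_assert(uleViaSubus(3, 7) == (3 <= 7), "");
static_assert(uleViaSubus(7, 3) == (7 <= 3), "");
static_assert(uleViaSubus(255, 255), "");
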
24725
24726static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24727 SelectionDAG &DAG) {
24728 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24729 Op.getOpcode() == ISD::STRICT_FSETCCS;
24730 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24731 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24732 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24733 MVT VT = Op->getSimpleValueType(0);
24734 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24735 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
24736 SDLoc dl(Op);
24737
24738 if (isFP) {
24739 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
24740     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
24741 if (isSoftFP16(EltVT, Subtarget))
24742 return SDValue();
24743
24744 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24745 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24746
24747 // If we have a strict compare with a vXi1 result and the input is 128/256
24748 // bits we can't use a masked compare unless we have VLX. If we use a wider
24749 // compare like we do for non-strict, we might trigger spurious exceptions
24750 // from the upper elements. Instead emit a AVX compare and convert to mask.
24751 unsigned Opc;
24752 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24753 (!IsStrict || Subtarget.hasVLX() ||
24754 Op0.getSimpleValueType().is512BitVector())) {
24755#ifndef NDEBUG
24756 unsigned Num = VT.getVectorNumElements();
24757       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
24758#endif
24759 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24760 } else {
24761 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24762 // The SSE/AVX packed FP comparison nodes are defined with a
24763 // floating-point vector result that matches the operand type. This allows
24764 // them to work with an SSE1 target (integer vector types are not legal).
24765 VT = Op0.getSimpleValueType();
24766 }
24767
24768 SDValue Cmp;
24769 bool IsAlwaysSignaling;
24770 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24771 if (!Subtarget.hasAVX()) {
24772 // TODO: We could use following steps to handle a quiet compare with
24773 // signaling encodings.
24774 // 1. Get ordered masks from a quiet ISD::SETO
24775 // 2. Use the masks to mask potential unordered elements in operand A, B
24776 // 3. Get the compare results of masked A, B
24777 // 4. Calculating final result using the mask and result from 3
24778 // But currently, we just fall back to scalar operations.
24779 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24780 return SDValue();
24781
24782 // Insert an extra signaling instruction to raise exception.
24783 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24784 SDValue SignalCmp = DAG.getNode(
24785 Opc, dl, {VT, MVT::Other},
24786 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24787 // FIXME: It seems we need to update the flags of all new strict nodes.
24788 // Otherwise, mayRaiseFPException in MI will return false due to
24789 // NoFPExcept = false by default. However, I didn't find it in other
24790 // patches.
24791 SignalCmp->setFlags(Op->getFlags());
24792 Chain = SignalCmp.getValue(1);
24793 }
24794
24795 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24796 // emit two comparisons and a logic op to tie them together.
24797 if (!cheapX86FSETCC_SSE(Cond)) {
24798 // LLVM predicate is SETUEQ or SETONE.
24799 unsigned CC0, CC1;
24800 unsigned CombineOpc;
24801 if (Cond == ISD::SETUEQ) {
24802 CC0 = 3; // UNORD
24803 CC1 = 0; // EQ
24804 CombineOpc = X86ISD::FOR;
24805 } else {
24806         assert(Cond == ISD::SETONE);
24807 CC0 = 7; // ORD
24808 CC1 = 4; // NEQ
24809 CombineOpc = X86ISD::FAND;
24810 }
24811
24812 SDValue Cmp0, Cmp1;
24813 if (IsStrict) {
24814 Cmp0 = DAG.getNode(
24815 Opc, dl, {VT, MVT::Other},
24816 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24817 Cmp1 = DAG.getNode(
24818 Opc, dl, {VT, MVT::Other},
24819 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24820 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24821 Cmp1.getValue(1));
24822 } else {
24823 Cmp0 = DAG.getNode(
24824 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24825 Cmp1 = DAG.getNode(
24826 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24827 }
24828 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24829 } else {
24830 if (IsStrict) {
24831 Cmp = DAG.getNode(
24832 Opc, dl, {VT, MVT::Other},
24833 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24834 Chain = Cmp.getValue(1);
24835 } else
24836 Cmp = DAG.getNode(
24837 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24838 }
24839 } else {
24840 // Handle all other FP comparisons here.
24841 if (IsStrict) {
24842 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24843 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24844 Cmp = DAG.getNode(
24845 Opc, dl, {VT, MVT::Other},
24846 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24847 Chain = Cmp.getValue(1);
24848 } else
24849 Cmp = DAG.getNode(
24850 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24851 }
24852
24853 if (VT.getFixedSizeInBits() >
24854 Op.getSimpleValueType().getFixedSizeInBits()) {
24855 // We emitted a compare with an XMM/YMM result. Finish converting to a
24856 // mask register using a vptestm.
24857 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24858 Cmp = DAG.getBitcast(CastVT, Cmp);
24859 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24860 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24861 } else {
24862 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24863 // the result type of SETCC. The bitcast is expected to be optimized
24864 // away during combining/isel.
24865 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24866 }
24867
24868 if (IsStrict)
24869 return DAG.getMergeValues({Cmp, Chain}, dl);
24870
24871 return Cmp;
24872 }
24873
24874   assert(!IsStrict && "Strict SETCC only handles FP operands.");
24875
24876 MVT VTOp0 = Op0.getSimpleValueType();
24877 (void)VTOp0;
24878   assert(VTOp0 == Op1.getSimpleValueType() &&
24879          "Expected operands with same type!");
24880   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24881          "Invalid number of packed elements for source and destination!");
24882
24883 // The non-AVX512 code below works under the assumption that source and
24884 // destination types are the same.
24885   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24886          "Value types for source and destination must be the same!");
24887
24888 // The result is boolean, but operands are int/float
24889 if (VT.getVectorElementType() == MVT::i1) {
24890 // In AVX-512 architecture setcc returns mask with i1 elements,
24891 // But there is no compare instruction for i8 and i16 elements in KNL.
24892     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24893            "Unexpected operand type");
24894 return LowerIntVSETCC_AVX512(Op, DAG);
24895 }
24896
24897 // Lower using XOP integer comparisons.
24898 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24899 // Translate compare code to XOP PCOM compare mode.
24900 unsigned CmpMode = 0;
24901 switch (Cond) {
24902     default: llvm_unreachable("Unexpected SETCC condition");
24903 case ISD::SETULT:
24904 case ISD::SETLT: CmpMode = 0x00; break;
24905 case ISD::SETULE:
24906 case ISD::SETLE: CmpMode = 0x01; break;
24907 case ISD::SETUGT:
24908 case ISD::SETGT: CmpMode = 0x02; break;
24909 case ISD::SETUGE:
24910 case ISD::SETGE: CmpMode = 0x03; break;
24911 case ISD::SETEQ: CmpMode = 0x04; break;
24912 case ISD::SETNE: CmpMode = 0x05; break;
24913 }
24914
24915 // Are we comparing unsigned or signed integers?
24916 unsigned Opc =
24917 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24918
24919 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24920 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24921 }
24922
24923 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24924 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24925 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24926 SDValue BC0 = peekThroughBitcasts(Op0);
24927 if (BC0.getOpcode() == ISD::AND) {
24928 APInt UndefElts;
24929 SmallVector<APInt, 64> EltBits;
24930 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
24931 VT.getScalarSizeInBits(), UndefElts,
24932 EltBits, false, false)) {
24933 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
24934 Cond = ISD::SETEQ;
24935 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24936 }
24937 }
24938 }
24939 }
24940
24941 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24942 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24943 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24944 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24945 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24946 unsigned BitWidth = VT.getScalarSizeInBits();
24947 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24948
24949 SDValue Result = Op0.getOperand(0);
24950 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24951 DAG.getConstant(ShiftAmt, dl, VT));
24952 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24953 DAG.getConstant(BitWidth - 1, dl, VT));
24954 return Result;
24955 }
24956 }
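
The transform above exploits that when C has a single bit set, (x & C) == C is just a test of that one bit, which can be materialized as an all-ones/all-zeros lane mask by shifting the bit into the sign position and arithmetic-shifting it back. A scalar sketch, assuming two's-complement arithmetic right shifts (guaranteed from C++20); illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch: mask = sra(shl(x, BW - log2(C) - 1), BW - 1).
constexpr uint32_t maskIfBitSet(uint32_t X, unsigned Log2C) {
  const unsigned BW = 32;
  int32_t Shifted = (int32_t)(X << (BW - Log2C - 1)); // bit -> sign position
  return (uint32_t)(Shifted >> (BW - 1));             // sign -> 0 or ~0
}
static_assert(maskIfBitSet(0b100, 2) == 0xFFFFFFFFu, "");
static_assert(maskIfBitSet(0b011, 2) == 0u, "");
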
24957
24958 // Break 256-bit integer vector compare into smaller ones.
24959 if (VT.is256BitVector() && !Subtarget.hasInt256())
24960 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24961
24962 // Break 512-bit integer vector compare into smaller ones.
24963 // TODO: Try harder to use VPCMPx + VPMOV2x?
24964 if (VT.is512BitVector())
24965 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24966
24967 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24968 // not-of-PCMPEQ:
24969 // X != INT_MIN --> X >s INT_MIN
24970 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24971 // +X != 0 --> +X >s 0
24972 APInt ConstValue;
24973 if (Cond == ISD::SETNE &&
24974 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24975 if (ConstValue.isMinSignedValue())
24976 Cond = ISD::SETGT;
24977 else if (ConstValue.isMaxSignedValue())
24978 Cond = ISD::SETLT;
24979 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24980 Cond = ISD::SETGT;
24981 }
24982
24983 // If both operands are known non-negative, then an unsigned compare is the
24984 // same as a signed compare and there's no need to flip signbits.
24985 // TODO: We could check for more general simplifications here since we're
24986 // computing known bits.
24987 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24988 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24989
24990 // Special case: Use min/max operations for unsigned compares.
24991 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24992 if (ISD::isUnsignedIntSetCC(Cond) &&
24993 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24994 TLI.isOperationLegal(ISD::UMIN, VT)) {
24995 // If we have a constant operand, increment/decrement it and change the
24996 // condition to avoid an invert.
24997 if (Cond == ISD::SETUGT) {
24998 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24999 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
25000 Op1 = UGTOp1;
25001 Cond = ISD::SETUGE;
25002 }
25003 }
25004 if (Cond == ISD::SETULT) {
25005 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25006 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
25007 Op1 = ULTOp1;
25008 Cond = ISD::SETULE;
25009 }
25010 }
25011 bool Invert = false;
25012 unsigned Opc;
25013 switch (Cond) {
25014     default: llvm_unreachable("Unexpected condition code");
25015 case ISD::SETUGT: Invert = true; [[fallthrough]];
25016 case ISD::SETULE: Opc = ISD::UMIN; break;
25017 case ISD::SETULT: Invert = true; [[fallthrough]];
25018 case ISD::SETUGE: Opc = ISD::UMAX; break;
25019 }
25020
25021 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25022 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25023
25024 // If the logical-not of the result is required, perform that now.
25025 if (Invert)
25026 Result = DAG.getNOT(dl, Result, VT);
25027
25028 return Result;
25029 }
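
The min/max rewrite works because a <=u b holds exactly when a equals umin(a, b), and a >=u b exactly when a equals umax(a, b), which maps onto PMINU*/PMAXU* followed by PCMPEQ. A scalar sketch of those identities; illustrative only.

#include <algorithm>
#include <cstdint>

// Illustrative sketch of the UMIN/UMAX + PCMPEQ rewrite.
constexpr bool uleViaMin(uint32_t A, uint32_t B) {
  return A == std::min(A, B);   // a <=u b  <=>  a == umin(a, b)
}
constexpr bool ugeViaMax(uint32_t A, uint32_t B) {
  return A == std::max(A, B);   // a >=u b  <=>  a == umax(a, b)
}
static_assert(uleViaMin(2, 9) && !uleViaMin(9, 2) && uleViaMin(5, 5), "");
static_assert(ugeViaMax(9, 2) && !ugeViaMax(2, 9) && ugeViaMax(5, 5), "");
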
25030
25031 // Try to use SUBUS and PCMPEQ.
25032 if (FlipSigns)
25033 if (SDValue V =
25034 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25035 return V;
25036
25037 // We are handling one of the integer comparisons here. Since SSE only has
25038 // GT and EQ comparisons for integer, swapping operands and multiple
25039 // operations may be required for some comparisons.
25040 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25041 : X86ISD::PCMPGT;
25042 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25043 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25044 bool Invert = Cond == ISD::SETNE ||
25045 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25046
25047 if (Swap)
25048 std::swap(Op0, Op1);
25049
25050 // Check that the operation in question is available (most are plain SSE2,
25051 // but PCMPGTQ and PCMPEQQ have different requirements).
25052 if (VT == MVT::v2i64) {
25053 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25054       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25055
25056 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25057 // the odd elements over the even elements.
25058 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25059 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25060 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25061
25062 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25063 static const int MaskHi[] = { 1, 1, 3, 3 };
25064 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25065
25066 return DAG.getBitcast(VT, Result);
25067 }
25068
25069 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25070 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25071 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25072
25073 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25074 static const int MaskHi[] = { 1, 1, 3, 3 };
25075 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25076
25077 return DAG.getBitcast(VT, Result);
25078 }
25079
25080 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25081 // bits of the inputs before performing those operations. The lower
25082 // compare is always unsigned.
25083 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25084 : 0x0000000080000000ULL,
25085 dl, MVT::v2i64);
25086
25087 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25088 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25089
25090 // Cast everything to the right type.
25091 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25092 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25093
25094 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25095 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25096 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25097
25098 // Create masks for only the low parts/high parts of the 64 bit integers.
25099 static const int MaskHi[] = { 1, 1, 3, 3 };
25100 static const int MaskLo[] = { 0, 0, 2, 2 };
25101 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25102 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25103 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25104
25105 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25106 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25107
25108 if (Invert)
25109 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25110
25111 return DAG.getBitcast(VT, Result);
25112 }
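
The PCMPGTQ emulation above follows the identity (hi1 > hi2) | ((hi1 == hi2) & (lo1 >u lo2)), with the sign-bit XORs arranging for the dword compares to have the right signedness. A scalar sketch of the signed 64-bit case, assuming arithmetic shifts of negative values (guaranteed from C++20); illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch: 64-bit signed greater-than from 32-bit halves.
constexpr bool sgt64Via32(int64_t A, int64_t B) {
  int32_t AHi = (int32_t)(A >> 32), BHi = (int32_t)(B >> 32);
  uint32_t ALo = (uint32_t)A, BLo = (uint32_t)B;
  return AHi > BHi || (AHi == BHi && ALo > BLo);
}
static_assert(sgt64Via32(-1, -2) && !sgt64Via32(-2, -1), "");
static_assert(sgt64Via32(0x100000000LL, 0xFFFFFFFFLL), "");
static_assert(!sgt64Via32(5, 5), "");
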
25113
25114 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25115 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25116 // pcmpeqd + pshufd + pand.
25117       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25118
25119 // First cast everything to the right type.
25120 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25121 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25122
25123 // Do the compare.
25124 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25125
25126 // Make sure the lower and upper halves are both all-ones.
25127 static const int Mask[] = { 1, 0, 3, 2 };
25128 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25129 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25130
25131 if (Invert)
25132 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25133
25134 return DAG.getBitcast(VT, Result);
25135 }
25136 }
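
The PCMPEQQ synthesis works because a 64-bit lane compares equal exactly when both of its 32-bit halves do; the PSHUFD with mask {1,0,3,2} swaps the halves of the PCMPEQD result so the PAND combines them. A scalar sketch of the identity; illustrative only.

#include <cstdint>

// Illustrative sketch: 64-bit equality from the two 32-bit half compares.
constexpr bool eq64Via32(uint64_t A, uint64_t B) {
  bool LoEq = (uint32_t)A == (uint32_t)B;
  bool HiEq = (uint32_t)(A >> 32) == (uint32_t)(B >> 32);
  return LoEq && HiEq;  // the vector form swaps halves and ANDs
}
static_assert(eq64Via32(0x1122334455667788ULL, 0x1122334455667788ULL), "");
static_assert(!eq64Via32(0x1122334455667788ULL, 0x1122334455667789ULL), "");
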
25137
25138 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25139 // bits of the inputs before performing those operations.
25140 if (FlipSigns) {
25141 MVT EltVT = VT.getVectorElementType();
25142 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25143 VT);
25144 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25145 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25146 }
25147
25148 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25149
25150 // If the logical-not of the result is required, perform that now.
25151 if (Invert)
25152 Result = DAG.getNOT(dl, Result, VT);
25153
25154 return Result;
25155}
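
The FlipSigns path that closes out LowerVSETCC uses the classic trick that XORing the sign bit turns an unsigned comparison into the signed one SSE provides. A scalar sketch; the out-of-range integer conversions assume two's complement (guaranteed from C++20), and the code is illustrative only.

#include <cstdint>

// Illustrative sketch: a >u b  <=>  (a ^ SignMask) >s (b ^ SignMask).
constexpr bool ugtViaSignedGT(uint32_t A, uint32_t B) {
  int32_t AS = (int32_t)(A ^ 0x80000000u);
  int32_t BS = (int32_t)(B ^ 0x80000000u);
  return AS > BS;
}
static_assert(ugtViaSignedGT(0xFFFFFFFFu, 1u), "");
static_assert(!ugtViaSignedGT(1u, 0xFFFFFFFFu), "");
static_assert(!ugtViaSignedGT(7u, 7u), "");
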
25156
25157// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25158static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25159 const SDLoc &dl, SelectionDAG &DAG,
25160 const X86Subtarget &Subtarget,
25161 SDValue &X86CC) {
25162 // Only support equality comparisons.
25163 if (CC != ISD::SETEQ && CC != ISD::SETNE)
25164 return SDValue();
25165
25166 // Must be a bitcast from vXi1.
25167 if (Op0.getOpcode() != ISD::BITCAST)
25168 return SDValue();
25169
25170 Op0 = Op0.getOperand(0);
25171 MVT VT = Op0.getSimpleValueType();
25172 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25173 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25174 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25175 return SDValue();
25176
25177 X86::CondCode X86Cond;
25178 if (isNullConstant(Op1)) {
25179 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25180 } else if (isAllOnesConstant(Op1)) {
25181 // C flag is set for all ones.
25182 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25183 } else
25184 return SDValue();
25185
25186   // If the input is an AND, we can combine its operands into the KTEST.
25187 bool KTestable = false;
25188 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25189 KTestable = true;
25190 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25191 KTestable = true;
25192 if (!isNullConstant(Op1))
25193 KTestable = false;
25194 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25195 SDValue LHS = Op0.getOperand(0);
25196 SDValue RHS = Op0.getOperand(1);
25197 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25198 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25199 }
25200
25201   // If the input is an OR, we can combine its operands into the KORTEST.
25202 SDValue LHS = Op0;
25203 SDValue RHS = Op0;
25204 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25205 LHS = Op0.getOperand(0);
25206 RHS = Op0.getOperand(1);
25207 }
25208
25209 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25210 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25211}
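
The KORTEST selection relies on the instruction ORing its two mask operands and setting ZF when the result is zero and CF when it is all ones, so both the mask == 0 and mask == all-ones tests collapse to a single flag check. A small model of that flag behaviour, assuming 16-bit masks; illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch of KORTESTW's flag outputs.
struct KFlags { bool ZF, CF; };
constexpr KFlags kortest16(uint16_t K1, uint16_t K2) {
  uint16_t Or = K1 | K2;
  return {Or == 0, Or == 0xFFFF};
}
static_assert(kortest16(0, 0).ZF && !kortest16(0, 0).CF, "");
static_assert(kortest16(0xFF00, 0x00FF).CF, "");
static_assert(!kortest16(0x0001, 0).ZF, "");
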
25212
25213/// Emit flags for the given setcc condition and operands. Also returns the
25214/// corresponding X86 condition code constant in X86CC.
25215SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25216 ISD::CondCode CC, const SDLoc &dl,
25217 SelectionDAG &DAG,
25218 SDValue &X86CC) const {
25219 // Optimize to BT if possible.
25220 // Lower (X & (1 << N)) == 0 to BT(X, N).
25221 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25222 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25223 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
25224 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25225 X86::CondCode X86CondCode;
25226 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25227 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25228 return BT;
25229 }
25230 }
25231
25232   // Try to use PTEST/PMOVMSKB for a tree of ORs compared for equality with 0.
25233 // TODO: We could do AND tree with all 1s as well by using the C flag.
25234 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
25235 if (SDValue CmpZ =
25236 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
25237 return CmpZ;
25238
25239 // Try to lower using KORTEST or KTEST.
25240 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25241 return Test;
25242
25243 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
25244 // these.
25245 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
25246 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25247 // If the input is a setcc, then reuse the input setcc or use a new one with
25248 // the inverted condition.
25249 if (Op0.getOpcode() == X86ISD::SETCC) {
25250 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25251
25252 X86CC = Op0.getOperand(0);
25253 if (Invert) {
25254 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25255 CCode = X86::GetOppositeBranchCondition(CCode);
25256 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
25257 }
25258
25259 return Op0.getOperand(1);
25260 }
25261 }
25262
25263   // Try to use the carry flag from the add in place of a separate CMP for:
25264 // (seteq (add X, -1), -1). Similar for setne.
25265 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25266 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25267 if (isProfitableToUseFlagOp(Op0)) {
25268 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25269
25270 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25271 Op0.getOperand(1));
25272 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25273 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25274 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
25275 return SDValue(New.getNode(), 1);
25276 }
25277 }
25278
25279 X86::CondCode CondCode =
25280 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25281   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25282
25283 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25284 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25285 return EFLAGS;
25286}
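
The flag-reuse case near the end of emitFlagsForSetcc works because adding -1 to X produces a carry out exactly when X != 0, so (X + -1) == -1, i.e. X == 0, can be answered from the carry flag (COND_AE) without a separate CMP. A scalar sketch of that carry behaviour; illustrative only.

#include <cstdint>

// Illustrative sketch: carry out of X + 0xFFFFFFFF is set iff X != 0.
constexpr bool carryOfAddMinusOne(uint32_t X) {
  uint64_t Wide = (uint64_t)X + 0xFFFFFFFFull; // X + (-1) as an unsigned add
  return (Wide >> 32) != 0;                    // the carry bit
}
static_assert(carryOfAddMinusOne(0) == false, "");  // X == 0 -> carry clear (AE)
static_assert(carryOfAddMinusOne(1) == true, "");
static_assert(carryOfAddMinusOne(0xFFFFFFFFu) == true, "");
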
25287
25288SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25289
25290 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25291 Op.getOpcode() == ISD::STRICT_FSETCCS;
25292 MVT VT = Op->getSimpleValueType(0);
25293
25294 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25295
25296   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25297 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25298 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25299 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25300 SDLoc dl(Op);
25301 ISD::CondCode CC =
25302 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25303
25304 if (isSoftFP16(Op0.getValueType()))
25305 return SDValue();
25306
25307 // Handle f128 first, since one possible outcome is a normal integer
25308 // comparison which gets handled by emitFlagsForSetcc.
25309 if (Op0.getValueType() == MVT::f128) {
25310 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25311 Op.getOpcode() == ISD::STRICT_FSETCCS);
25312
25313 // If softenSetCCOperands returned a scalar, use it.
25314 if (!Op1.getNode()) {
25315       assert(Op0.getValueType() == Op.getValueType() &&
25316              "Unexpected setcc expansion!");
25317 if (IsStrict)
25318 return DAG.getMergeValues({Op0, Chain}, dl);
25319 return Op0;
25320 }
25321 }
25322
25323 if (Op0.getSimpleValueType().isInteger()) {
25324     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
25325     // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
25326     // this may translate to fewer uops depending on the uarch implementation. The
25327 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25328 // canonicalize to that CondCode.
25329 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25330 // encoding size - so it must either already be a i8 or i32 immediate, or it
25331     // encoding size - so it must either already be an i8 or i32 immediate, or it
25332 // constant materializations.
25333 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
25334 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25335 const APInt &Op1Val = Op1C->getAPIntValue();
25336 if (!Op1Val.isZero()) {
25337 // Ensure the constant+1 doesn't overflow.
25338 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25339 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25340 APInt Op1ValPlusOne = Op1Val + 1;
25341 if (Op1ValPlusOne.isSignedIntN(32) &&
25342 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25343 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25344 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25345 : ISD::CondCode::SETUGE;
25346 }
25347 }
25348 }
25349 }
25350
25351 SDValue X86CC;
25352 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25353 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25354 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25355 }
25356
25357 // Handle floating point.
25358 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25359 if (CondCode == X86::COND_INVALID)
25360 return SDValue();
25361
25362 SDValue EFLAGS;
25363 if (IsStrict) {
25364 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25365 EFLAGS =
25366 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25367 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25368 Chain = EFLAGS.getValue(1);
25369 } else {
25370 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25371 }
25372
25373 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25374 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25375 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25376}
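
The canonicalization in the integer path of LowerSETCC relies on x > C and x >= C + 1 agreeing whenever C is not the maximum representable value, which is exactly what the isMaxSignedValue/isMaxValue guards check before bumping the constant. A scalar sketch of the signed case; illustrative only, with the helper name chosen for the example.

#include <cstdint>

// Illustrative sketch: x > C  <=>  x >= C + 1, provided C != INT32_MAX.
constexpr bool gtAsGe(int32_t X, int32_t C) {
  return X >= C + 1;
}
static_assert(gtAsGe(5, 4) == (5 > 4), "");
static_assert(gtAsGe(4, 4) == (4 > 4), "");
static_assert(gtAsGe(-7, -3) == (-7 > -3), "");
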
25377
25378SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25379 SDValue LHS = Op.getOperand(0);
25380 SDValue RHS = Op.getOperand(1);
25381 SDValue Carry = Op.getOperand(2);
25382 SDValue Cond = Op.getOperand(3);
25383 SDLoc DL(Op);
25384
25385   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25386 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25387
25388 // Recreate the carry if needed.
25389 EVT CarryVT = Carry.getValueType();
25390 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25391 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25392
25393 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25394 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25395 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25396}
25397
25398// This function returns three things: the arithmetic computation itself
25399// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
25400// flag and the condition code define the case in which the arithmetic
25401// computation overflows.
25402static std::pair<SDValue, SDValue>
25403getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
25404   assert(Op.getResNo() == 0 && "Unexpected result number!");
25405 SDValue Value, Overflow;
25406 SDValue LHS = Op.getOperand(0);
25407 SDValue RHS = Op.getOperand(1);
25408 unsigned BaseOp = 0;
25409 SDLoc DL(Op);
25410 switch (Op.getOpcode()) {
25411   default: llvm_unreachable("Unknown ovf instruction!");
25412 case ISD::SADDO:
25413 BaseOp = X86ISD::ADD;
25414 Cond = X86::COND_O;
25415 break;
25416 case ISD::UADDO:
25417 BaseOp = X86ISD::ADD;
25418 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
25419 break;
25420 case ISD::SSUBO:
25421 BaseOp = X86ISD::SUB;
25422 Cond = X86::COND_O;
25423 break;
25424 case ISD::USUBO:
25425 BaseOp = X86ISD::SUB;
25426 Cond = X86::COND_B;
25427 break;
25428 case ISD::SMULO:
25429 BaseOp = X86ISD::SMUL;
25430 Cond = X86::COND_O;
25431 break;
25432 case ISD::UMULO:
25433 BaseOp = X86ISD::UMUL;
25434 Cond = X86::COND_O;
25435 break;
25436 }
25437
25438 if (BaseOp) {
25439 // Also sets EFLAGS.
25440 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
25441 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
25442 Overflow = Value.getValue(1);
25443 }
25444
25445 return std::make_pair(Value, Overflow);
25446}
25447
25448static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
25449 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
25450 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
25451 // looks for this combo and may remove the "setcc" instruction if the "setcc"
25452 // has only one use.
25453 SDLoc DL(Op);
25454 X86::CondCode Cond;
25455 SDValue Value, Overflow;
25456 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
25457
25458 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
25459   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
25460 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
25461}
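
LowerXALUO hands back the arithmetic result together with an overflow bit derived from EFLAGS, merged as a two-value node. A rough standalone analogue for 32-bit signed addition, using the GCC/Clang __builtin_add_overflow intrinsic rather than anything from SelectionDAG; the helper name is hypothetical and the code is illustrative only.

#include <cstdint>
#include <utility>

// Illustrative sketch: value plus overflow flag, as SADDO produces.
inline std::pair<int32_t, bool> saddo32(int32_t A, int32_t B) {
  int32_t Sum = 0;
  bool Ovf = __builtin_add_overflow(A, B, &Sum);  // GCC/Clang builtin
  return {Sum, Ovf};
}
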
25462
25463/// Return true if opcode is a X86 logical comparison.
25464static bool isX86LogicalCmp(SDValue Op) {
25465 unsigned Opc = Op.getOpcode();
25466 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
25467 Opc == X86ISD::FCMP)
25468 return true;
25469 if (Op.getResNo() == 1 &&
25470 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
25471 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
25472 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
25473 return true;
25474
25475 return false;
25476}
25477
25478static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
25479 if (V.getOpcode() != ISD::TRUNCATE)
25480 return false;
25481
25482 SDValue VOp0 = V.getOperand(0);
25483 unsigned InBits = VOp0.getValueSizeInBits();
25484 unsigned Bits = V.getValueSizeInBits();
25485 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
25486}
25487
25488SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25489 bool AddTest = true;
25490 SDValue Cond = Op.getOperand(0);
25491 SDValue Op1 = Op.getOperand(1);
25492 SDValue Op2 = Op.getOperand(2);
25493 SDLoc DL(Op);
25494 MVT VT = Op1.getSimpleValueType();
25495 SDValue CC;
25496
25497 if (isSoftFP16(VT)) {
25498 MVT NVT = VT.changeTypeToInteger();
25499 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25500 DAG.getBitcast(NVT, Op1),
25501 DAG.getBitcast(NVT, Op2)));
25502 }
25503
25504 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25505 // are available or VBLENDV if AVX is available.
25506 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25507 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25508 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25509 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25510 bool IsAlwaysSignaling;
25511 unsigned SSECC =
25512 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25513 CondOp0, CondOp1, IsAlwaysSignaling);
25514
25515 if (Subtarget.hasAVX512()) {
25516 SDValue Cmp =
25517 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25518 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25519       assert(!VT.isVector() && "Not a scalar type?");
25520 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25521 }
25522
25523 if (SSECC < 8 || Subtarget.hasAVX()) {
25524 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25525 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25526
25527 // If we have AVX, we can use a variable vector select (VBLENDV) instead
25528 // of 3 logic instructions for size savings and potentially speed.
25529 // Unfortunately, there is no scalar form of VBLENDV.
25530
25531 // If either operand is a +0.0 constant, don't try this. We can expect to
25532 // optimize away at least one of the logic instructions later in that
25533 // case, so that sequence would be faster than a variable blend.
25534
25535 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
25536 // uses XMM0 as the selection register. That may need just as many
25537 // instructions as the AND/ANDN/OR sequence due to register moves, so
25538 // don't bother.
25539 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
25540 !isNullFPConstant(Op2)) {
25541 // Convert to vectors, do a VSELECT, and convert back to scalar.
25542 // All of the conversions should be optimized away.
25543 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25544 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25545 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25546 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25547
25548 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25549 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25550
25551 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25552
25553 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
25554 VSel, DAG.getIntPtrConstant(0, DL));
25555 }
25556 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25557 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25558 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25559 }
25560 }
25561
25562 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25563 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25564 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25565 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25566 }
25567
25568 if (Cond.getOpcode() == ISD::SETCC &&
25569 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
25570 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25571 Cond = NewCond;
25572 // If the condition was updated, it's possible that the operands of the
25573 // select were also updated (for example, EmitTest has a RAUW). Refresh
25574 // the local references to the select operands in case they got stale.
25575 Op1 = Op.getOperand(1);
25576 Op2 = Op.getOperand(2);
25577 }
25578 }
25579
25580 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25581 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25582 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25583 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25584 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25585 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25586 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25587 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25588 if (Cond.getOpcode() == X86ISD::SETCC &&
25589 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25590 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25591 SDValue Cmp = Cond.getOperand(1);
25592 SDValue CmpOp0 = Cmp.getOperand(0);
25593 unsigned CondCode = Cond.getConstantOperandVal(0);
25594
25595 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25596 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25597     // handling to keep the CMP with 0. This should be removed by
25598 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25599 // cttz_zero_undef.
25600 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25601 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25602 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25603 };
25604 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25605 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25606 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25607 // Keep Cmp.
25608 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25609 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
25610 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
25611 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
25612
25613 // 'X - 1' sets the carry flag if X == 0.
25614 // '0 - X' sets the carry flag if X != 0.
25615 // Convert the carry flag to a -1/0 mask with sbb:
25616 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
25617 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
25618 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
25619 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
25620 SDValue Sub;
25621 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
25622 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
25623 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
25624 } else {
25625 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
25626 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
25627 }
25628 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25629 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
25630 Sub.getValue(1));
25631 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
25632 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
25633 CmpOp0.getOpcode() == ISD::AND &&
25634 isOneConstant(CmpOp0.getOperand(1))) {
25635 SDValue Src1, Src2;
25636       // true if Op2 is an XOR or OR operator and one of its operands
25637 // is equal to Op1
25638 // ( a , a op b) || ( b , a op b)
25639 auto isOrXorPattern = [&]() {
25640 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
25641 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
25642 Src1 =
25643 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
25644 Src2 = Op1;
25645 return true;
25646 }
25647 return false;
25648 };
25649
25650 if (isOrXorPattern()) {
25651 SDValue Neg;
25652 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
25653         // We need a mask of all zeros or all ones with the same size as the
25654         // other operands.
25655 if (CmpSz > VT.getSizeInBits())
25656 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
25657 else if (CmpSz < VT.getSizeInBits())
25658 Neg = DAG.getNode(ISD::AND, DL, VT,
25659 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
25660 DAG.getConstant(1, DL, VT));
25661 else
25662 Neg = CmpOp0;
25663 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
25664 Neg); // -(and (x, 0x1))
25665 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
25666 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
25667 }
25668 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25669 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25670 ((CondCode == X86::COND_S) || // smin(x, 0)
25671 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25672 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25673 //
25674 // If the comparison is testing for a positive value, we have to invert
25675 // the sign bit mask, so only do that transform if the target has a
25676 // bitwise 'and not' instruction (the invert is free).
25677 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25678 unsigned ShCt = VT.getSizeInBits() - 1;
25679 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25680 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25681 if (CondCode == X86::COND_G)
25682 Shift = DAG.getNOT(DL, Shift, VT);
25683 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25684 }
25685 }
25686
25687 // Look past (and (setcc_carry (cmp ...)), 1).
25688 if (Cond.getOpcode() == ISD::AND &&
25689 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25690 isOneConstant(Cond.getOperand(1)))
25691 Cond = Cond.getOperand(0);
25692
25693 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25694 // setting operand in place of the X86ISD::SETCC.
25695 unsigned CondOpcode = Cond.getOpcode();
25696 if (CondOpcode == X86ISD::SETCC ||
25697 CondOpcode == X86ISD::SETCC_CARRY) {
25698 CC = Cond.getOperand(0);
25699
25700 SDValue Cmp = Cond.getOperand(1);
25701 bool IllegalFPCMov = false;
25702 if (VT.isFloatingPoint() && !VT.isVector() &&
25703 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25704 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25705
25706 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25707 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25708 Cond = Cmp;
25709 AddTest = false;
25710 }
25711 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25712 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25713 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25714 SDValue Value;
25715 X86::CondCode X86Cond;
25716 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25717
25718 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25719 AddTest = false;
25720 }
25721
25722 if (AddTest) {
25723 // Look past the truncate if the high bits are known zero.
25724 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25725 Cond = Cond.getOperand(0);
25726
25727 // We know the result of AND is compared against zero. Try to match
25728 // it to BT.
25729 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25730 X86::CondCode X86CondCode;
25731 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25732 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25733 Cond = BT;
25734 AddTest = false;
25735 }
25736 }
25737 }
25738
25739 if (AddTest) {
25740 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25741 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25742 }
25743
25744 // a < b ? -1 : 0 -> RES = ~setcc_carry
25745 // a < b ? 0 : -1 -> RES = setcc_carry
25746 // a >= b ? -1 : 0 -> RES = setcc_carry
25747 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25748 if (Cond.getOpcode() == X86ISD::SUB) {
25749 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
25750
25751 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25752 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25753 (isNullConstant(Op1) || isNullConstant(Op2))) {
25754 SDValue Res =
25755 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25756 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25757 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25758 return DAG.getNOT(DL, Res, Res.getValueType());
25759 return Res;
25760 }
25761 }
25762
25763 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25764 // widen the cmov and push the truncate through. This avoids introducing a new
25765 // branch during isel and doesn't add any extensions.
25766 if (Op.getValueType() == MVT::i8 &&
25767 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25768 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25769 if (T1.getValueType() == T2.getValueType() &&
25770 // Exclude CopyFromReg to avoid partial register stalls.
25771 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25772 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25773 CC, Cond);
25774 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25775 }
25776 }
25777
25778 // Or finally, promote i8 cmovs if we have CMOV,
25779 // or i16 cmovs if it won't prevent folding a load.
25780 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25781 // legal, but EmitLoweredSelect() can not deal with these extensions
25782 // being inserted between two CMOV's. (in i16 case too TBN)
25783 // https://bugs.llvm.org/show_bug.cgi?id=40974
25784 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25785 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25786 !X86::mayFoldLoad(Op2, Subtarget))) {
25787 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25788 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25789 SDValue Ops[] = { Op2, Op1, CC, Cond };
25790 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25791 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25792 }
25793
25794 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25795 // condition is true.
25796 SDValue Ops[] = { Op2, Op1, CC, Cond };
25797 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
25798}
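
The SBB-based path in LowerSELECT turns (x == 0) ? -1 : y into pure arithmetic: X - 1 borrows exactly when X == 0, SBB sign-extends that borrow into a 0/-1 mask, and an OR with y completes the select without a branch or CMOV. A scalar sketch of that sequence; illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch: (x == 0) ? ~0u : y via borrow -> mask -> OR.
constexpr uint32_t selectAllOnesIfZero(uint32_t X, uint32_t Y) {
  uint32_t Borrow = (X < 1) ? 1u : 0u;  // CF of "X - 1"
  uint32_t Mask = 0u - Borrow;          // SBB: 0 or 0xFFFFFFFF
  return Mask | Y;                      // -1 when X == 0, else Y
}
static_assert(selectAllOnesIfZero(0, 123) == 0xFFFFFFFFu, "");
static_assert(selectAllOnesIfZero(7, 123) == 123u, "");
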
25799
25800static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
25801 const X86Subtarget &Subtarget,
25802 SelectionDAG &DAG) {
25803 MVT VT = Op->getSimpleValueType(0);
25804 SDValue In = Op->getOperand(0);
25805 MVT InVT = In.getSimpleValueType();
25806   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25807 MVT VTElt = VT.getVectorElementType();
25808 SDLoc dl(Op);
25809
25810 unsigned NumElts = VT.getVectorNumElements();
25811
25812 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25813 MVT ExtVT = VT;
25814 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25815 // If v16i32 is to be avoided, we'll need to split and concatenate.
25816 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25817 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25818
25819 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25820 }
25821
25822 // Widen to 512-bits if VLX is not supported.
25823 MVT WideVT = ExtVT;
25824 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25825 NumElts *= 512 / ExtVT.getSizeInBits();
25826 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25827 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
25828 In, DAG.getIntPtrConstant(0, dl));
25829 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25830 }
25831
25832 SDValue V;
25833 MVT WideEltVT = WideVT.getVectorElementType();
25834 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25835 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25836 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25837 } else {
25838 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
25839 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25840 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25841 }
25842
25843 // Truncate if we had to extend i16/i8 above.
25844 if (VT != ExtVT) {
25845 WideVT = MVT::getVectorVT(VTElt, NumElts);
25846 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25847 }
25848
25849 // Extract back to 128/256-bit if we widened.
25850 if (WideVT != VT)
25851 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25852 DAG.getIntPtrConstant(0, dl));
25853
25854 return V;
25855}
25856
25857static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25858 SelectionDAG &DAG) {
25859 SDValue In = Op->getOperand(0);
25860 MVT InVT = In.getSimpleValueType();
25861
25862 if (InVT.getVectorElementType() == MVT::i1)
25863 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25864
25865 assert(Subtarget.hasAVX() && "Expected AVX support");
25866 return LowerAVXExtend(Op, DAG, Subtarget);
25867}
25868
25869// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25870// For sign extend this needs to handle all vector sizes and SSE4.1 and
25871// non-SSE4.1 targets. For zero extend this should only handle inputs of
25872// MVT::v64i8 when BWI is not supported, but AVX512 is.
25873static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25874 const X86Subtarget &Subtarget,
25875 SelectionDAG &DAG) {
25876 SDValue In = Op->getOperand(0);
25877 MVT VT = Op->getSimpleValueType(0);
25878 MVT InVT = In.getSimpleValueType();
25879
25880 MVT SVT = VT.getVectorElementType();
25881 MVT InSVT = InVT.getVectorElementType();
25882 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25883
25884 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25885 return SDValue();
25886 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25887 return SDValue();
25888 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25889 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25890 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25891 return SDValue();
25892
25893 SDLoc dl(Op);
25894 unsigned Opc = Op.getOpcode();
25895 unsigned NumElts = VT.getVectorNumElements();
25896
25897 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25898 // For 512-bit vectors, we need 128-bits or 256-bits.
25899 if (InVT.getSizeInBits() > 128) {
25900 // Input needs to be at least the same number of elements as output, and
25901 // at least 128-bits.
25902 int InSize = InSVT.getSizeInBits() * NumElts;
25903 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25904 InVT = In.getSimpleValueType();
25905 }
25906
25907 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
25908 // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
25909 // instructions still need to be handled here for 256/512-bit results.
25910 if (Subtarget.hasInt256()) {
25911 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25912
25913 if (InVT.getVectorNumElements() != NumElts)
25914 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25915
25916 // FIXME: Apparently we create inreg operations that could be regular
25917 // extends.
25918 unsigned ExtOpc =
25919 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25920 : ISD::ZERO_EXTEND;
25921 return DAG.getNode(ExtOpc, dl, VT, In);
25922 }
25923
25924 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25925 if (Subtarget.hasAVX()) {
25926 assert(VT.is256BitVector() && "256-bit vector expected");
25927 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25928 int HalfNumElts = HalfVT.getVectorNumElements();
25929
25930 unsigned NumSrcElts = InVT.getVectorNumElements();
25931 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25932 for (int i = 0; i != HalfNumElts; ++i)
25933 HiMask[i] = HalfNumElts + i;
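// E.g. for a v16i8 input and v8i32 result, HalfNumElts is 4 and HiMask starts
// with {4, 5, 6, 7}, moving source elements 4-7 into the low lane before they
// are extended.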
25934
25935 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25936 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25937 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25938 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25939 }
25940
25941 // We should only get here for sign extend.
25942 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25943 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25944
25945 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25946 SDValue Curr = In;
25947 SDValue SignExt = Curr;
25948
25949 // As SRAI is only available on i16/i32 types, we expand only up to i32
25950 // and handle i64 separately.
25951 if (InVT != MVT::v4i32) {
25952 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25953
25954 unsigned DestWidth = DestVT.getScalarSizeInBits();
25955 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25956
25957 unsigned InNumElts = InVT.getVectorNumElements();
25958 unsigned DestElts = DestVT.getVectorNumElements();
25959
25960 // Build a shuffle mask that takes each input element and places it in the
25961 // MSBs of the new element size.
25962 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25963 for (unsigned i = 0; i != DestElts; ++i)
25964 Mask[i * Scale + (Scale - 1)] = i;
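// E.g. for v16i8 -> v4i32, Scale is 4 and the mask is
// {-1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3}, placing each source byte
// in the top byte of a 32-bit lane before the arithmetic shift right by 24 below.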
25965
25966 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25967 Curr = DAG.getBitcast(DestVT, Curr);
25968
25969 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25970 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25971 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25972 }
25973
25974 if (VT == MVT::v2i64) {
25975 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25976 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25977 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25978 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
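// The {0, 4, 1, 5} mask interleaves each 32-bit value with its computed sign
// word, so every i64 lane ends up with the value in its low half and the sign
// bits in its high half.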
25979 SignExt = DAG.getBitcast(VT, SignExt);
25980 }
25981
25982 return SignExt;
25983}
25984
25985static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25986 SelectionDAG &DAG) {
25987 MVT VT = Op->getSimpleValueType(0);
25988 SDValue In = Op->getOperand(0);
25989 MVT InVT = In.getSimpleValueType();
25990 SDLoc dl(Op);
25991
25992 if (InVT.getVectorElementType() == MVT::i1)
25993 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25994
25995 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25996 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25997        "Expected same number of elements");
25998 assert((VT.getVectorElementType() == MVT::i16 ||
25999         VT.getVectorElementType() == MVT::i32 ||
26000         VT.getVectorElementType() == MVT::i64) &&
26001        "Unexpected element type");
26002 assert((InVT.getVectorElementType() == MVT::i8 ||
26003         InVT.getVectorElementType() == MVT::i16 ||
26004         InVT.getVectorElementType() == MVT::i32) &&
26005        "Unexpected element type");
26006
26007 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26008 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26009 return splitVectorIntUnary(Op, DAG);
26010 }
26011
26012 if (Subtarget.hasInt256())
26013 return Op;
26014
26015 // Optimize vectors in AVX mode:
26016 // sign extend v8i16 to v8i32 and
26017 // v4i32 to v4i64.
26018 //
26019 // Divide the input vector into two parts;
26020 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26021 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26022 // then concat the vectors back to the original VT.
26023 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26024 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26025
26026 unsigned NumElems = InVT.getVectorNumElements();
26027 SmallVector<int,8> ShufMask(NumElems, -1);
26028 for (unsigned i = 0; i != NumElems/2; ++i)
26029 ShufMask[i] = i + NumElems/2;
26030
26031 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26032 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26033
26034 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26035}
26036
26037/// Change a vector store into a pair of half-size vector stores.
26038static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26039 SDValue StoredVal = Store->getValue();
26040 assert((StoredVal.getValueType().is256BitVector() ||
26041         StoredVal.getValueType().is512BitVector()) &&
26042        "Expecting 256/512-bit op");
26043
26044 // Splitting volatile memory ops is not allowed unless the operation was not
26045 // legal to begin with. Assume the input store is legal (this transform is
26046 // only used for targets with AVX). Note: It is possible that we have an
26047 // illegal type like v2i128, and so we could allow splitting a volatile store
26048 // in that case if that is important.
26049 if (!Store->isSimple())
26050 return SDValue();
26051
26052 SDLoc DL(Store);
26053 SDValue Value0, Value1;
26054 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26055 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26056 SDValue Ptr0 = Store->getBasePtr();
26057 SDValue Ptr1 =
26058 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
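// E.g. a 256-bit store at Ptr becomes two 128-bit stores, at Ptr and at
// Ptr + 16, joined by the TokenFactor below.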
26059 SDValue Ch0 =
26060 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26061 Store->getOriginalAlign(),
26062 Store->getMemOperand()->getFlags());
26063 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26064 Store->getPointerInfo().getWithOffset(HalfOffset),
26065 Store->getOriginalAlign(),
26066 Store->getMemOperand()->getFlags());
26067 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26068}
26069
26070/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26071/// type.
26072static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26073 SelectionDAG &DAG) {
26074 SDValue StoredVal = Store->getValue();
26075 assert(StoreVT.is128BitVector() &&
26076        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26077 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26078
26079 // Splitting volatile memory ops is not allowed unless the operation was not
26080 // legal to begin with. We are assuming the input op is legal (this transform
26081 // is only used for targets with AVX).
26082 if (!Store->isSimple())
26083 return SDValue();
26084
26085 MVT StoreSVT = StoreVT.getScalarType();
26086 unsigned NumElems = StoreVT.getVectorNumElements();
26087 unsigned ScalarSize = StoreSVT.getStoreSize();
26088
26089 SDLoc DL(Store);
26090 SmallVector<SDValue, 4> Stores;
26091 for (unsigned i = 0; i != NumElems; ++i) {
26092 unsigned Offset = i * ScalarSize;
26093 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26094 TypeSize::Fixed(Offset), DL);
26095 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26096 DAG.getIntPtrConstant(i, DL));
26097 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26098 Store->getPointerInfo().getWithOffset(Offset),
26099 Store->getOriginalAlign(),
26100 Store->getMemOperand()->getFlags());
26101 Stores.push_back(Ch);
26102 }
26103 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26104}
26105
26106static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26107 SelectionDAG &DAG) {
26108 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26109 SDLoc dl(St);
26110 SDValue StoredVal = St->getValue();
26111
26112 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26113 if (StoredVal.getValueType().isVector() &&
26114 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26115 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26116 assert(NumElts <= 8 && "Unexpected VT");
26117 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26118 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26119        "Expected AVX512F without AVX512DQI");
26120
26121 // We must pad with zeros to ensure we store zeroes to any unused bits.
26122 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26123 DAG.getUNDEF(MVT::v16i1), StoredVal,
26124 DAG.getIntPtrConstant(0, dl));
26125 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26126 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26127 // Make sure we store zeros in the extra bits.
26128 if (NumElts < 8)
26129 StoredVal = DAG.getZeroExtendInReg(
26130 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
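// E.g. a v4i1 store becomes: insert into v16i1, bitcast to i16, truncate to
// i8, clear bits 4-7, then store a single byte.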
26131
26132 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26133 St->getPointerInfo(), St->getOriginalAlign(),
26134 St->getMemOperand()->getFlags());
26135 }
26136
26137 if (St->isTruncatingStore())
26138 return SDValue();
26139
26140 // If this is a 256-bit store of concatenated ops, we are better off splitting
26141 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26142 // and each half can execute independently. Some cores would split the op into
26143 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26144 MVT StoreVT = StoredVal.getSimpleValueType();
26145 if (StoreVT.is256BitVector() ||
26146 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26147 !Subtarget.hasBWI())) {
26148 SmallVector<SDValue, 4> CatOps;
26149 if (StoredVal.hasOneUse() &&
26150 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26151 return splitVectorStore(St, DAG);
26152 return SDValue();
26153 }
26154
26155 if (StoreVT.is32BitVector())
26156 return SDValue();
26157
26158 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26159 assert(StoreVT.is64BitVector() && "Unexpected VT");
26160 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26161            TargetLowering::TypeWidenVector &&
26162        "Unexpected type action!");
26163
26164 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26165 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26166 DAG.getUNDEF(StoreVT));
26167
26168 if (Subtarget.hasSSE2()) {
26169 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26170 // and store it.
26171 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26172 MVT CastVT = MVT::getVectorVT(StVT, 2);
26173 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26174 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26175 DAG.getIntPtrConstant(0, dl));
26176
26177 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26178 St->getPointerInfo(), St->getOriginalAlign(),
26179 St->getMemOperand()->getFlags());
26180 }
26181 assert(Subtarget.hasSSE1() && "Expected SSE");
26182 SDVTList Tys = DAG.getVTList(MVT::Other);
26183 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26184 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26185 St->getMemOperand());
26186}
26187
26188// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26189// may emit an illegal shuffle but the expansion is still better than scalar
26190// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26191 // we'll emit a shuffle and an arithmetic shift.
26192// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26193// TODO: It is possible to support ZExt by zeroing the undef values during
26194// the shuffle phase or after the shuffle.
26195static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26196 SelectionDAG &DAG) {
26197 MVT RegVT = Op.getSimpleValueType();
26198 assert(RegVT.isVector() && "We only custom lower vector loads.");
26199 assert(RegVT.isInteger() &&
26200        "We only custom lower integer vector loads.");
26201
26202 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26203 SDLoc dl(Ld);
26204
26205 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26206 if (RegVT.getVectorElementType() == MVT::i1) {
26207 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26208 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26209 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26210        "Expected AVX512F without AVX512DQI");
26211
26212 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26213 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26214 Ld->getMemOperand()->getFlags());
26215
26216 // Replace chain users with the new chain.
26217 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26218
26219 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26220 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26221 DAG.getBitcast(MVT::v16i1, Val),
26222 DAG.getIntPtrConstant(0, dl));
26223 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26224 }
26225
26226 return SDValue();
26227}
26228
26229/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26230/// each of which has no other use apart from the AND / OR.
26231static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26232 Opc = Op.getOpcode();
26233 if (Opc != ISD::OR && Opc != ISD::AND)
26234 return false;
26235 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26236 Op.getOperand(0).hasOneUse() &&
26237 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26238 Op.getOperand(1).hasOneUse());
26239}
26240
26241SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26242 SDValue Chain = Op.getOperand(0);
26243 SDValue Cond = Op.getOperand(1);
26244 SDValue Dest = Op.getOperand(2);
26245 SDLoc dl(Op);
26246
26247 // Bail out when we don't have native compare instructions.
26248 if (Cond.getOpcode() == ISD::SETCC &&
26249 Cond.getOperand(0).getValueType() != MVT::f128 &&
26250 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26251 SDValue LHS = Cond.getOperand(0);
26252 SDValue RHS = Cond.getOperand(1);
26253 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26254
26255 // Special case for
26256 // setcc([su]{add,sub,mul}o == 0)
26257 // setcc([su]{add,sub,mul}o != 1)
26258 if (ISD::isOverflowIntrOpRes(LHS) &&
26259 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26260 (isNullConstant(RHS) || isOneConstant(RHS))) {
26261 SDValue Value, Overflow;
26262 X86::CondCode X86Cond;
26263 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26264
26265 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26266 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
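// i.e. the condition is inverted exactly when the setcc tests for the
// no-overflow case ('o == 0' or 'o != 1').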
26267
26268 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26269 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26270 Overflow);
26271 }
26272
26273 if (LHS.getSimpleValueType().isInteger()) {
26274 SDValue CCVal;
26275 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26276 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26277 EFLAGS);
26278 }
26279
26280 if (CC == ISD::SETOEQ) {
26281 // For FCMP_OEQ, we can emit
26282 // two branches instead of an explicit AND instruction with a
26283 // separate test. However, we only do this if this block doesn't
26284 // have a fall-through edge, because this requires an explicit
26285 // jmp when the condition is false.
26286 if (Op.getNode()->hasOneUse()) {
26287 SDNode *User = *Op.getNode()->use_begin();
26288 // Look for an unconditional branch following this conditional branch.
26289 // We need this because we need to reverse the successors in order
26290 // to implement FCMP_OEQ.
26291 if (User->getOpcode() == ISD::BR) {
26292 SDValue FalseBB = User->getOperand(1);
26293 SDNode *NewBR =
26294 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26295 assert(NewBR == User);
26296 (void)NewBR;
26297 Dest = FalseBB;
26298
26299 SDValue Cmp =
26300 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26301 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26302 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26303 CCVal, Cmp);
26304 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26305 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26306 Cmp);
26307 }
26308 }
26309 } else if (CC == ISD::SETUNE) {
26310 // For FCMP_UNE, we can emit
26311 // two branches instead of an explicit OR instruction with a
26312 // separate test.
26313 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26314 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26315 Chain =
26316 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26317 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26318 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26319 Cmp);
26320 } else {
26321 X86::CondCode X86Cond =
26322 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26323 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26324 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26325 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26326 Cmp);
26327 }
26328 }
26329
26330 if (ISD::isOverflowIntrOpRes(Cond)) {
26331 SDValue Value, Overflow;
26332 X86::CondCode X86Cond;
26333 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26334
26335 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26336 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26337 Overflow);
26338 }
26339
26340 // Look past the truncate if the high bits are known zero.
26341 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26342 Cond = Cond.getOperand(0);
26343
26344 EVT CondVT = Cond.getValueType();
26345
26346 // Add an AND with 1 if we don't already have one.
26347 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26348 Cond =
26349 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26350
26351 SDValue LHS = Cond;
26352 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26353
26354 SDValue CCVal;
26355 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26356 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26357 EFLAGS);
26358}
26359
26360// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26361// Calls to _alloca are needed to probe the stack when allocating more than 4k
26362// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26363// that the guard pages used by the OS virtual memory manager are allocated in
26364 // the correct sequence.
26365SDValue
26366X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26367 SelectionDAG &DAG) const {
26368 MachineFunction &MF = DAG.getMachineFunction();
26369 bool SplitStack = MF.shouldSplitStack();
26370 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26371 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26372 SplitStack || EmitStackProbeCall;
26373 SDLoc dl(Op);
26374
26375 // Get the inputs.
26376 SDNode *Node = Op.getNode();
26377 SDValue Chain = Op.getOperand(0);
26378 SDValue Size = Op.getOperand(1);
26379 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26380 EVT VT = Node->getValueType(0);
26381
26382 // Chain the dynamic stack allocation so that it doesn't modify the stack
26383 // pointer when other instructions are using the stack.
26384 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26385
26386 bool Is64Bit = Subtarget.is64Bit();
26387 MVT SPTy = getPointerTy(DAG.getDataLayout());
26388
26389 SDValue Result;
26390 if (!Lower) {
26391 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26392 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26393 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26394, __extension__
__PRETTY_FUNCTION__))
26394 " not tell us which reg is the stack pointer!")(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26394, __extension__
__PRETTY_FUNCTION__))
;
26395
26396 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26397 const Align StackAlign = TFI.getStackAlign();
26398 if (hasInlineStackProbe(MF)) {
26399 MachineRegisterInfo &MRI = MF.getRegInfo();
26400
26401 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26402 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26403 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26404 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
26405 DAG.getRegister(Vreg, SPTy));
26406 } else {
26407 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
26408 Chain = SP.getValue(1);
26409 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
26410 }
26411 if (Alignment && *Alignment > StackAlign)
26412 Result =
26413 DAG.getNode(ISD::AND, dl, VT, Result,
26414 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
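// E.g. an alignment of 32 applies the mask ~31ULL, rounding the new stack
// pointer down to a 32-byte boundary.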
26415 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
26416 } else if (SplitStack) {
26417 MachineRegisterInfo &MRI = MF.getRegInfo();
26418
26419 if (Is64Bit) {
26420 // The 64 bit implementation of segmented stacks needs to clobber both r10
26421 // and r11. This makes it impossible to use it along with nested parameters.
26422 const Function &F = MF.getFunction();
26423 for (const auto &A : F.args()) {
26424 if (A.hasNestAttr())
26425 report_fatal_error("Cannot use segmented stacks with functions that "
26426 "have nested arguments.");
26427 }
26428 }
26429
26430 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26431 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26432 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26433 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
26434 DAG.getRegister(Vreg, SPTy));
26435 } else {
26436 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
26437 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
26438 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
26439
26440 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26441 Register SPReg = RegInfo->getStackRegister();
26442 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
26443 Chain = SP.getValue(1);
26444
26445 if (Alignment) {
26446 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
26447 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
26448 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
26449 }
26450
26451 Result = SP;
26452 }
26453
26454 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
26455
26456 SDValue Ops[2] = {Result, Chain};
26457 return DAG.getMergeValues(Ops, dl);
26458}
26459
26460SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
26461 MachineFunction &MF = DAG.getMachineFunction();
26462 auto PtrVT = getPointerTy(MF.getDataLayout());
26463 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26464
26465 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
26466 SDLoc DL(Op);
26467
26468 if (!Subtarget.is64Bit() ||
26469 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
26470 // vastart just stores the address of the VarArgsFrameIndex slot into the
26471 // memory location argument.
26472 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
26473 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
26474 MachinePointerInfo(SV));
26475 }
26476
26477 // __va_list_tag:
26478 // gp_offset (0 - 6 * 8)
26479 // fp_offset (48 - 48 + 8 * 16)
26480 // overflow_arg_area (points to parameters passed in memory).
26481 // reg_save_area
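// Spelled out, the layout materialized below corresponds to:
//   struct __va_list_tag {
//     unsigned gp_offset;       // offset 0
//     unsigned fp_offset;       // offset 4
//     void *overflow_arg_area;  // offset 8
//     void *reg_save_area;      // offset 16 (LP64) or 12 (x32)
//   };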
26482 SmallVector<SDValue, 8> MemOps;
26483 SDValue FIN = Op.getOperand(1);
26484 // Store gp_offset
26485 SDValue Store = DAG.getStore(
26486 Op.getOperand(0), DL,
26487 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
26488 MachinePointerInfo(SV));
26489 MemOps.push_back(Store);
26490
26491 // Store fp_offset
26492 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
26493 Store = DAG.getStore(
26494 Op.getOperand(0), DL,
26495 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
26496 MachinePointerInfo(SV, 4));
26497 MemOps.push_back(Store);
26498
26499 // Store ptr to overflow_arg_area
26500 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
26501 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
26502 Store =
26503 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
26504 MemOps.push_back(Store);
26505
26506 // Store ptr to reg_save_area.
26507 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
26508 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
26509 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
26510 Store = DAG.getStore(
26511 Op.getOperand(0), DL, RSFIN, FIN,
26512 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
26513 MemOps.push_back(Store);
26514 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
26515}
26516
26517SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
26518 assert(Subtarget.is64Bit() &&
26519        "LowerVAARG only handles 64-bit va_arg!");
26520 assert(Op.getNumOperands() == 4);
26521
26522 MachineFunction &MF = DAG.getMachineFunction();
26523 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
26524 // The Win64 ABI uses char* instead of a structure.
26525 return DAG.expandVAArg(Op.getNode());
26526
26527 SDValue Chain = Op.getOperand(0);
26528 SDValue SrcPtr = Op.getOperand(1);
26529 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
26530 unsigned Align = Op.getConstantOperandVal(3);
26531 SDLoc dl(Op);
26532
26533 EVT ArgVT = Op.getNode()->getValueType(0);
26534 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
26535 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
26536 uint8_t ArgMode;
26537
26538 // Decide which area this value should be read from.
26539 // TODO: Implement the AMD64 ABI in its entirety. This simple
26540 // selection mechanism works only for the basic types.
26541 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26542 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26543 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26544 } else {
26545 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26546        "Unhandled argument type in LowerVAARG");
26547 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26548 }
26549
26550 if (ArgMode == 2) {
26551 // Make sure using fp_offset makes sense.
26552 assert(!Subtarget.useSoftFloat() &&
26553        !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26554        Subtarget.hasSSE1());
26555 }
26556
26557 // Insert a VAARG node into the DAG.
26558 // VAARG returns two values: the variable argument address and the chain.
26559 SDValue InstOps[] = {Chain, SrcPtr,
26560 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26561 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26562 DAG.getTargetConstant(Align, dl, MVT::i32)};
26563 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
26564 SDValue VAARG = DAG.getMemIntrinsicNode(
26565 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26566 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26567 /*Alignment=*/std::nullopt,
26568 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26569 Chain = VAARG.getValue(1);
26570
26571 // Load the next argument and return it
26572 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26573}
26574
26575static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26576 SelectionDAG &DAG) {
26577 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26578 // where a va_list is still an i8*.
26579 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")(static_cast <bool> (Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"This code only handles 64-bit va_copy!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26579, __extension__
__PRETTY_FUNCTION__))
;
26580 if (Subtarget.isCallingConvWin64(
26581 DAG.getMachineFunction().getFunction().getCallingConv()))
26582 // Probably a Win64 va_copy.
26583 return DAG.expandVACopy(Op.getNode());
26584
26585 SDValue Chain = Op.getOperand(0);
26586 SDValue DstPtr = Op.getOperand(1);
26587 SDValue SrcPtr = Op.getOperand(2);
26588 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26589 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26590 SDLoc DL(Op);
26591
26592 return DAG.getMemcpy(
26593 Chain, DL, DstPtr, SrcPtr,
26594 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26595 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26596 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
26597}
26598
26599// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26600static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26601 switch (Opc) {
26602 case ISD::SHL:
26603 case X86ISD::VSHL:
26604 case X86ISD::VSHLI:
26605 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26606 case ISD::SRL:
26607 case X86ISD::VSRL:
26608 case X86ISD::VSRLI:
26609 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26610 case ISD::SRA:
26611 case X86ISD::VSRA:
26612 case X86ISD::VSRAI:
26613 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26614 }
26615 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26615)
;
26616}
26617
26618/// Handle vector element shifts where the shift amount is a constant.
26619/// Takes immediate version of shift as input.
26620static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26621 SDValue SrcOp, uint64_t ShiftAmt,
26622 SelectionDAG &DAG) {
26623 MVT ElementType = VT.getVectorElementType();
26624
26625 // Bitcast the source vector to the output type, this is mainly necessary for
26626 // vXi8/vXi64 shifts.
26627 if (VT != SrcOp.getSimpleValueType())
26628 SrcOp = DAG.getBitcast(VT, SrcOp);
26629
26630 // Fold this packed shift into its first operand if ShiftAmt is 0.
26631 if (ShiftAmt == 0)
26632 return SrcOp;
26633
26634 // Check for ShiftAmt >= element width
26635 if (ShiftAmt >= ElementType.getSizeInBits()) {
26636 if (Opc == X86ISD::VSRAI)
26637 ShiftAmt = ElementType.getSizeInBits() - 1;
26638 else
26639 return DAG.getConstant(0, dl, VT);
26640 }
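// E.g. shifting v4i32 elements by 32 or more yields zero for logical shifts,
// while an arithmetic shift is clamped to 31 so each lane keeps its sign fill.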
26641
26642 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26643        && "Unknown target vector shift-by-constant node");
26644
26645 // Fold this packed vector shift into a build vector if SrcOp is a
26646 // vector of Constants or UNDEFs.
26647 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26648 unsigned ShiftOpc;
26649 switch (Opc) {
26650 default: llvm_unreachable("Unknown opcode!");
26651 case X86ISD::VSHLI:
26652 ShiftOpc = ISD::SHL;
26653 break;
26654 case X86ISD::VSRLI:
26655 ShiftOpc = ISD::SRL;
26656 break;
26657 case X86ISD::VSRAI:
26658 ShiftOpc = ISD::SRA;
26659 break;
26660 }
26661
26662 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26663 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26664 return C;
26665 }
26666
26667 return DAG.getNode(Opc, dl, VT, SrcOp,
26668 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26669}
26670
26671/// Handle vector element shifts by a splat shift amount
26672static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26673 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26674 const X86Subtarget &Subtarget,
26675 SelectionDAG &DAG) {
26676 MVT AmtVT = ShAmt.getSimpleValueType();
26677 assert(AmtVT.isVector() && "Vector shift type mismatch");
26678 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26679        "Illegal vector splat index");
26680
26681 // Move the splat element to the bottom element.
26682 if (ShAmtIdx != 0) {
26683 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26684 Mask[0] = ShAmtIdx;
26685 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26686 }
26687
26688 // Peek through any zext node if we can get back to a 128-bit source.
26689 if (AmtVT.getScalarSizeInBits() == 64 &&
26690 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26691 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26692 ShAmt.getOperand(0).getValueType().isSimple() &&
26693 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26694 ShAmt = ShAmt.getOperand(0);
26695 AmtVT = ShAmt.getSimpleValueType();
26696 }
26697
26698 // See if we can mask off the upper elements using the existing source node.
26699 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26700 // do this for vXi64 types.
26701 bool IsMasked = false;
26702 if (AmtVT.getScalarSizeInBits() < 64) {
26703 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26704 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26705 // If the shift amount has come from a scalar, then zero-extend the scalar
26706 // before moving to the vector.
26707 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26708 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26709 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26710 AmtVT = MVT::v4i32;
26711 IsMasked = true;
26712 } else if (ShAmt.getOpcode() == ISD::AND) {
26713 // See if the shift amount is already masked (e.g. for rotation modulo),
26714 // then we can zero-extend it by setting all the other mask elements to
26715 // zero.
26716 SmallVector<SDValue> MaskElts(
26717 AmtVT.getVectorNumElements(),
26718 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26719 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26720 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26721 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26722 {ShAmt.getOperand(1), Mask}))) {
26723 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26724 IsMasked = true;
26725 }
26726 }
26727 }
26728
26729 // Extract if the shift amount vector is larger than 128-bits.
26730 if (AmtVT.getSizeInBits() > 128) {
26731 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26732 AmtVT = ShAmt.getSimpleValueType();
26733 }
26734
26735 // Zero-extend bottom element to v2i64 vector type, either by extension or
26736 // shuffle masking.
26737 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26738 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26739 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26740 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26741 } else if (Subtarget.hasSSE41()) {
26742 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26743 MVT::v2i64, ShAmt);
26744 } else {
26745 SDValue ByteShift = DAG.getTargetConstant(
26746 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26747 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26748 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26749 ByteShift);
26750 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26751 ByteShift);
26752 }
26753 }
26754
26755 // Change opcode to non-immediate version.
26756 Opc = getTargetVShiftUniformOpcode(Opc, true);
26757
26758 // The return type has to be a 128-bit type with the same element
26759 // type as the input type.
26760 MVT EltVT = VT.getVectorElementType();
26761 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26762
26763 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26764 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26765}
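The pre-SSE4.1 fallback above zero-extends the bottom shift-amount element with a pair of byte shifts: VSHLDQ by (128 - EltBits)/8 bytes followed by VSRLDQ by the same count clears every byte above the low element. A minimal standalone sketch of that byte-count arithmetic (the helper name is illustrative, not an LLVM API):

#include <cassert>

// Illustrative helper: byte count that VSHLDQ/VSRLDQ must use so that only the
// low element of a 128-bit shift-amount vector survives the round trip.
static unsigned zeroExtendByteShift(unsigned EltBits) {
  assert(EltBits < 128 && 128 % EltBits == 0 && "expected a 128-bit vector element");
  return (128 - EltBits) / 8;
}

int main() {
  assert(zeroExtendByteShift(32) == 12); // v4i32 amount: keep the low i32
  assert(zeroExtendByteShift(16) == 14); // v8i16 amount: keep the low i16
  return 0;
}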
26766
26767/// Return Mask with the necessary casting or extending
26768/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26769static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26770 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26771 const SDLoc &dl) {
26772
26773 if (isAllOnesConstant(Mask))
26774 return DAG.getConstant(1, dl, MaskVT);
26775 if (X86::isZeroNode(Mask))
26776 return DAG.getConstant(0, dl, MaskVT);
26777
26778 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26779
26780 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26781 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26782 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26783 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26784 SDValue Lo, Hi;
26785 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
26786 DAG.getConstant(0, dl, MVT::i32));
26787 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
26788 DAG.getConstant(1, dl, MVT::i32));
26789
26790 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26791 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26792
26793 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26794 } else {
26795 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26796 Mask.getSimpleValueType().getSizeInBits());
26797 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
26798 // are extracted by EXTRACT_SUBVECTOR.
26799 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26800 DAG.getBitcast(BitcastVT, Mask),
26801 DAG.getIntPtrConstant(0, dl));
26802 }
26803}
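Semantically, getMaskNode treats the scalar mask as one bit per lane: bit i of the i8/i16/i32/i64 mask drives lane i, and narrowing to a smaller MaskVT just keeps the low bits, which is what the bitcast-to-vXi1 plus EXTRACT_SUBVECTOR-at-0 above expresses. A small runnable model of that semantics (plain C++, not LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative model: expand the low NumElts bits of a scalar mask into a
// per-lane boolean vector, mirroring bitcast-to-vXi1 + EXTRACT_SUBVECTOR at 0.
static std::vector<bool> expandMask(uint64_t ScalarMask, unsigned NumElts) {
  std::vector<bool> Lanes(NumElts);
  for (unsigned I = 0; I != NumElts; ++I)
    Lanes[I] = (ScalarMask >> I) & 1;
  return Lanes;
}

int main() {
  // An i8 mask of 0x05 used with a v4i1 MaskVT enables lanes 0 and 2 only.
  std::vector<bool> M = expandMask(0x05, 4);
  assert(M[0] && !M[1] && M[2] && !M[3]);
  return 0;
}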
26804
26805/// Return (and \p Op, \p Mask) for compare instructions or
26806/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26807/// necessary casting or extending for \p Mask when lowering masking intrinsics
26808static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26809 SDValue PreservedSrc,
26810 const X86Subtarget &Subtarget,
26811 SelectionDAG &DAG) {
26812 MVT VT = Op.getSimpleValueType();
26813 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26814 unsigned OpcodeSelect = ISD::VSELECT;
26815 SDLoc dl(Op);
26816
26817 if (isAllOnesConstant(Mask))
26818 return Op;
26819
26820 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26821
26822 if (PreservedSrc.isUndef())
26823 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26824 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26825}
26826
26827/// Creates an SDNode for a predicated scalar operation.
26828/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26829 /// The mask comes in as MVT::i8 and is transformed
26830 /// to MVT::v1i1 while lowering masking intrinsics.
26831/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26832/// "X86select" instead of "vselect". We just can't create the "vselect" node
26833/// for a scalar instruction.
26834static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26835 SDValue PreservedSrc,
26836 const X86Subtarget &Subtarget,
26837 SelectionDAG &DAG) {
26838
26839 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
26840 if (MaskConst->getZExtValue() & 0x1)
26841 return Op;
26842
26843 MVT VT = Op.getSimpleValueType();
26844 SDLoc dl(Op);
26845
26846 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26847 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26848 DAG.getBitcast(MVT::v8i1, Mask),
26849 DAG.getIntPtrConstant(0, dl));
26850 if (Op.getOpcode() == X86ISD::FSETCCM ||
26851 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26852 Op.getOpcode() == X86ISD::VFPCLASSS)
26853 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26854
26855 if (PreservedSrc.isUndef())
26856 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26857 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26858}
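For the FSETCCM/VFPCLASSS opcodes the single mask bit is simply ANDed into the compare result; for everything else bit 0 of the mask selects between the freshly computed scalar and the preserved source (zero if the pass-through was undef). A tiny runnable model of that selection rule (plain C++, not LLVM API):

#include <cassert>
#include <cstdint>

// Illustrative model of the scalar-masking selection: bit 0 of the i8 mask
// keeps either the freshly computed lane or the preserved (pass-through) lane.
static double selectScalar(uint8_t Mask, double Op, double PreservedSrc) {
  return (Mask & 1) ? Op : PreservedSrc;
}

int main() {
  assert(selectScalar(0x01, 2.0, 7.0) == 2.0); // bit 0 set: take the new result
  assert(selectScalar(0xFE, 2.0, 7.0) == 7.0); // bit 0 clear: keep pass-through
  return 0;
}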
26859
26860static int getSEHRegistrationNodeSize(const Function *Fn) {
26861 if (!Fn->hasPersonalityFn())
26862 report_fatal_error(
26863 "querying registration node size for function without personality");
26864 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26865 // WinEHStatePass for the full struct definition.
26866 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26867 case EHPersonality::MSVC_X86SEH: return 24;
26868 case EHPersonality::MSVC_CXX: return 16;
26869 default: break;
26870 }
26871 report_fatal_error(
26872 "can only recover FP for 32-bit MSVC EH personality functions");
26873}
26874
26875/// When the MSVC runtime transfers control to us, either to an outlined
26876/// function or when returning to a parent frame after catching an exception, we
26877/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26878/// Here's the math:
26879/// RegNodeBase = EntryEBP - RegNodeSize
26880/// ParentFP = RegNodeBase - ParentFrameOffset
26881/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26882/// subtracting the offset (negative on x86) takes us back to the parent FP.
26883static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26884 SDValue EntryEBP) {
26885 MachineFunction &MF = DAG.getMachineFunction();
26886 SDLoc dl;
26887
26888 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26889 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26890
26891 // It's possible that the parent function no longer has a personality function
26892 // if the exceptional code was optimized away, in which case we just return
26893 // the incoming EBP.
26894 if (!Fn->hasPersonalityFn())
26895 return EntryEBP;
26896
26897 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26898 // registration, or the .set_setframe offset.
26899 MCSymbol *OffsetSym =
26900 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
26901 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26902 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26903 SDValue ParentFrameOffset =
26904 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26905
26906 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26907 // prologue to RBP in the parent function.
26908 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26909 if (Subtarget.is64Bit())
26910 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26911
26912 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26913 // RegNodeBase = EntryEBP - RegNodeSize
26914 // ParentFP = RegNodeBase - ParentFrameOffset
26915 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26916 DAG.getConstant(RegNodeSize, dl, PtrVT));
26917 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26918}
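On 32-bit x86 the recovery above is two subtractions. A worked example with made-up numbers (the EntryEBP and ParentFrameOffset values are hypothetical; the 24 is the MSVC_X86SEH registration node size returned by getSEHRegistrationNodeSize):

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical values for illustration only.
  uint32_t EntryEBP = 0x0018FF70;    // EBP handed to the outlined funclet
  int32_t ParentFrameOffset = -64;   // negative on x86, read via LOCAL_RECOVER
  uint32_t RegNodeSize = 24;         // MSVC_X86SEH registration node size

  uint32_t RegNodeBase = EntryEBP - RegNodeSize;        // 0x0018FF58
  uint32_t ParentFP = RegNodeBase - ParentFrameOffset;  // 0x0018FF98
  assert(RegNodeBase == 0x0018FF58 && ParentFP == 0x0018FF98);
  return 0;
}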
26919
26920SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26921 SelectionDAG &DAG) const {
26922 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26923 auto isRoundModeCurDirection = [](SDValue Rnd) {
26924 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26925 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26926
26927 return false;
26928 };
26929 auto isRoundModeSAE = [](SDValue Rnd) {
26930 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26931 unsigned RC = C->getZExtValue();
26932 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26933 // Clear the NO_EXC bit and check remaining bits.
26934 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26935 // As a convenience we allow no other bits or explicitly
26936 // current direction.
26937 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26938 }
26939 }
26940
26941 return false;
26942 };
26943 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26944 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26945 RC = C->getZExtValue();
26946 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26947 // Clear the NO_EXC bit and check remaining bits.
26948 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26949 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26950 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26951 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26952 RC == X86::STATIC_ROUNDING::TO_ZERO;
26953 }
26954 }
26955
26956 return false;
26957 };
26958
26959 SDLoc dl(Op);
26960 unsigned IntNo = Op.getConstantOperandVal(0);
26961 MVT VT = Op.getSimpleValueType();
26962 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26963
26964 // Propagate flags from original node to transformed node(s).
26965 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26966
26967 if (IntrData) {
26968 switch(IntrData->Type) {
26969 case INTR_TYPE_1OP: {
26970 // We specify 2 possible opcodes for intrinsics with rounding modes.
26971 // First, we check if the intrinsic may have non-default rounding mode,
26972 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26973 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26974 if (IntrWithRoundingModeOpcode != 0) {
26975 SDValue Rnd = Op.getOperand(2);
26976 unsigned RC = 0;
26977 if (isRoundModeSAEToX(Rnd, RC))
26978 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26979 Op.getOperand(1),
26980 DAG.getTargetConstant(RC, dl, MVT::i32));
26981 if (!isRoundModeCurDirection(Rnd))
26982 return SDValue();
26983 }
26984 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26985 Op.getOperand(1));
26986 }
26987 case INTR_TYPE_1OP_SAE: {
26988 SDValue Sae = Op.getOperand(2);
26989
26990 unsigned Opc;
26991 if (isRoundModeCurDirection(Sae))
26992 Opc = IntrData->Opc0;
26993 else if (isRoundModeSAE(Sae))
26994 Opc = IntrData->Opc1;
26995 else
26996 return SDValue();
26997
26998 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26999 }
27000 case INTR_TYPE_2OP: {
27001 SDValue Src2 = Op.getOperand(2);
27002
27003 // We specify 2 possible opcodes for intrinsics with rounding modes.
27004 // First, we check if the intrinsic may have non-default rounding mode,
27005 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27006 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27007 if (IntrWithRoundingModeOpcode != 0) {
27008 SDValue Rnd = Op.getOperand(3);
27009 unsigned RC = 0;
27010 if (isRoundModeSAEToX(Rnd, RC))
27011 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27012 Op.getOperand(1), Src2,
27013 DAG.getTargetConstant(RC, dl, MVT::i32));
27014 if (!isRoundModeCurDirection(Rnd))
27015 return SDValue();
27016 }
27017
27018 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27019 Op.getOperand(1), Src2);
27020 }
27021 case INTR_TYPE_2OP_SAE: {
27022 SDValue Sae = Op.getOperand(3);
27023
27024 unsigned Opc;
27025 if (isRoundModeCurDirection(Sae))
27026 Opc = IntrData->Opc0;
27027 else if (isRoundModeSAE(Sae))
27028 Opc = IntrData->Opc1;
27029 else
27030 return SDValue();
27031
27032 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27033 Op.getOperand(2));
27034 }
27035 case INTR_TYPE_3OP:
27036 case INTR_TYPE_3OP_IMM8: {
27037 SDValue Src1 = Op.getOperand(1);
27038 SDValue Src2 = Op.getOperand(2);
27039 SDValue Src3 = Op.getOperand(3);
27040
27041 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27042 Src3.getValueType() != MVT::i8) {
27043 Src3 = DAG.getTargetConstant(
27044 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27045 }
27046
27047 // We specify 2 possible opcodes for intrinsics with rounding modes.
27048 // First, we check if the intrinsic may have non-default rounding mode,
27049 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27050 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27051 if (IntrWithRoundingModeOpcode != 0) {
27052 SDValue Rnd = Op.getOperand(4);
27053 unsigned RC = 0;
27054 if (isRoundModeSAEToX(Rnd, RC))
27055 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27056 Src1, Src2, Src3,
27057 DAG.getTargetConstant(RC, dl, MVT::i32));
27058 if (!isRoundModeCurDirection(Rnd))
27059 return SDValue();
27060 }
27061
27062 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27063 {Src1, Src2, Src3});
27064 }
27065 case INTR_TYPE_4OP_IMM8: {
27066 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27067 SDValue Src4 = Op.getOperand(4);
27068 if (Src4.getValueType() != MVT::i8) {
27069 Src4 = DAG.getTargetConstant(
27070 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27071 }
27072
27073 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27074 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27075 Src4);
27076 }
27077 case INTR_TYPE_1OP_MASK: {
27078 SDValue Src = Op.getOperand(1);
27079 SDValue PassThru = Op.getOperand(2);
27080 SDValue Mask = Op.getOperand(3);
27081 // We add rounding mode to the Node when
27082 // - RC Opcode is specified and
27083 // - RC is not "current direction".
27084 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27085 if (IntrWithRoundingModeOpcode != 0) {
27086 SDValue Rnd = Op.getOperand(4);
27087 unsigned RC = 0;
27088 if (isRoundModeSAEToX(Rnd, RC))
27089 return getVectorMaskingNode(
27090 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27091 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27092 Mask, PassThru, Subtarget, DAG);
27093 if (!isRoundModeCurDirection(Rnd))
27094 return SDValue();
27095 }
27096 return getVectorMaskingNode(
27097 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27098 Subtarget, DAG);
27099 }
27100 case INTR_TYPE_1OP_MASK_SAE: {
27101 SDValue Src = Op.getOperand(1);
27102 SDValue PassThru = Op.getOperand(2);
27103 SDValue Mask = Op.getOperand(3);
27104 SDValue Rnd = Op.getOperand(4);
27105
27106 unsigned Opc;
27107 if (isRoundModeCurDirection(Rnd))
27108 Opc = IntrData->Opc0;
27109 else if (isRoundModeSAE(Rnd))
27110 Opc = IntrData->Opc1;
27111 else
27112 return SDValue();
27113
27114 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27115 Subtarget, DAG);
27116 }
27117 case INTR_TYPE_SCALAR_MASK: {
27118 SDValue Src1 = Op.getOperand(1);
27119 SDValue Src2 = Op.getOperand(2);
27120 SDValue passThru = Op.getOperand(3);
27121 SDValue Mask = Op.getOperand(4);
27122 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27123 // There are 2 kinds of intrinsics in this group:
27124 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
27125 // (2) With rounding mode and sae - 7 operands.
27126 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27127 if (Op.getNumOperands() == (5U + HasRounding)) {
27128 if (HasRounding) {
27129 SDValue Rnd = Op.getOperand(5);
27130 unsigned RC = 0;
27131 if (isRoundModeSAEToX(Rnd, RC))
27132 return getScalarMaskingNode(
27133 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27134 DAG.getTargetConstant(RC, dl, MVT::i32)),
27135 Mask, passThru, Subtarget, DAG);
27136 if (!isRoundModeCurDirection(Rnd))
27137 return SDValue();
27138 }
27139 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27140 Src2),
27141 Mask, passThru, Subtarget, DAG);
27142 }
27143
27144 assert(Op.getNumOperands() == (6U + HasRounding) &&
27145 "Unexpected intrinsic form");
27146 SDValue RoundingMode = Op.getOperand(5);
27147 unsigned Opc = IntrData->Opc0;
27148 if (HasRounding) {
27149 SDValue Sae = Op.getOperand(6);
27150 if (isRoundModeSAE(Sae))
27151 Opc = IntrWithRoundingModeOpcode;
27152 else if (!isRoundModeCurDirection(Sae))
27153 return SDValue();
27154 }
27155 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27156 Src2, RoundingMode),
27157 Mask, passThru, Subtarget, DAG);
27158 }
27159 case INTR_TYPE_SCALAR_MASK_RND: {
27160 SDValue Src1 = Op.getOperand(1);
27161 SDValue Src2 = Op.getOperand(2);
27162 SDValue passThru = Op.getOperand(3);
27163 SDValue Mask = Op.getOperand(4);
27164 SDValue Rnd = Op.getOperand(5);
27165
27166 SDValue NewOp;
27167 unsigned RC = 0;
27168 if (isRoundModeCurDirection(Rnd))
27169 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27170 else if (isRoundModeSAEToX(Rnd, RC))
27171 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27172 DAG.getTargetConstant(RC, dl, MVT::i32));
27173 else
27174 return SDValue();
27175
27176 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27177 }
27178 case INTR_TYPE_SCALAR_MASK_SAE: {
27179 SDValue Src1 = Op.getOperand(1);
27180 SDValue Src2 = Op.getOperand(2);
27181 SDValue passThru = Op.getOperand(3);
27182 SDValue Mask = Op.getOperand(4);
27183 SDValue Sae = Op.getOperand(5);
27184 unsigned Opc;
27185 if (isRoundModeCurDirection(Sae))
27186 Opc = IntrData->Opc0;
27187 else if (isRoundModeSAE(Sae))
27188 Opc = IntrData->Opc1;
27189 else
27190 return SDValue();
27191
27192 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27193 Mask, passThru, Subtarget, DAG);
27194 }
27195 case INTR_TYPE_2OP_MASK: {
27196 SDValue Src1 = Op.getOperand(1);
27197 SDValue Src2 = Op.getOperand(2);
27198 SDValue PassThru = Op.getOperand(3);
27199 SDValue Mask = Op.getOperand(4);
27200 SDValue NewOp;
27201 if (IntrData->Opc1 != 0) {
27202 SDValue Rnd = Op.getOperand(5);
27203 unsigned RC = 0;
27204 if (isRoundModeSAEToX(Rnd, RC))
27205 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27206 DAG.getTargetConstant(RC, dl, MVT::i32));
27207 else if (!isRoundModeCurDirection(Rnd))
27208 return SDValue();
27209 }
27210 if (!NewOp)
27211 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27212 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27213 }
27214 case INTR_TYPE_2OP_MASK_SAE: {
27215 SDValue Src1 = Op.getOperand(1);
27216 SDValue Src2 = Op.getOperand(2);
27217 SDValue PassThru = Op.getOperand(3);
27218 SDValue Mask = Op.getOperand(4);
27219
27220 unsigned Opc = IntrData->Opc0;
27221 if (IntrData->Opc1 != 0) {
27222 SDValue Sae = Op.getOperand(5);
27223 if (isRoundModeSAE(Sae))
27224 Opc = IntrData->Opc1;
27225 else if (!isRoundModeCurDirection(Sae))
27226 return SDValue();
27227 }
27228
27229 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27230 Mask, PassThru, Subtarget, DAG);
27231 }
27232 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27233 SDValue Src1 = Op.getOperand(1);
27234 SDValue Src2 = Op.getOperand(2);
27235 SDValue Src3 = Op.getOperand(3);
27236 SDValue PassThru = Op.getOperand(4);
27237 SDValue Mask = Op.getOperand(5);
27238 SDValue Sae = Op.getOperand(6);
27239 unsigned Opc;
27240 if (isRoundModeCurDirection(Sae))
27241 Opc = IntrData->Opc0;
27242 else if (isRoundModeSAE(Sae))
27243 Opc = IntrData->Opc1;
27244 else
27245 return SDValue();
27246
27247 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27248 Mask, PassThru, Subtarget, DAG);
27249 }
27250 case INTR_TYPE_3OP_MASK_SAE: {
27251 SDValue Src1 = Op.getOperand(1);
27252 SDValue Src2 = Op.getOperand(2);
27253 SDValue Src3 = Op.getOperand(3);
27254 SDValue PassThru = Op.getOperand(4);
27255 SDValue Mask = Op.getOperand(5);
27256
27257 unsigned Opc = IntrData->Opc0;
27258 if (IntrData->Opc1 != 0) {
27259 SDValue Sae = Op.getOperand(6);
27260 if (isRoundModeSAE(Sae))
27261 Opc = IntrData->Opc1;
27262 else if (!isRoundModeCurDirection(Sae))
27263 return SDValue();
27264 }
27265 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27266 Mask, PassThru, Subtarget, DAG);
27267 }
27268 case BLENDV: {
27269 SDValue Src1 = Op.getOperand(1);
27270 SDValue Src2 = Op.getOperand(2);
27271 SDValue Src3 = Op.getOperand(3);
27272
27273 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27274 Src3 = DAG.getBitcast(MaskVT, Src3);
27275
27276 // Reverse the operands to match VSELECT order.
27277 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27278 }
27279 case VPERM_2OP : {
27280 SDValue Src1 = Op.getOperand(1);
27281 SDValue Src2 = Op.getOperand(2);
27282
27283 // Swap Src1 and Src2 in the node creation
27284 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
27285 }
27286 case CFMA_OP_MASKZ:
27287 case CFMA_OP_MASK: {
27288 SDValue Src1 = Op.getOperand(1);
27289 SDValue Src2 = Op.getOperand(2);
27290 SDValue Src3 = Op.getOperand(3);
27291 SDValue Mask = Op.getOperand(4);
27292 MVT VT = Op.getSimpleValueType();
27293
27294 SDValue PassThru = Src3;
27295 if (IntrData->Type == CFMA_OP_MASKZ)
27296 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27297
27298 // We add rounding mode to the Node when
27299 // - RC Opcode is specified and
27300 // - RC is not "current direction".
27301 SDValue NewOp;
27302 if (IntrData->Opc1 != 0) {
27303 SDValue Rnd = Op.getOperand(5);
27304 unsigned RC = 0;
27305 if (isRoundModeSAEToX(Rnd, RC))
27306 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27307 DAG.getTargetConstant(RC, dl, MVT::i32));
27308 else if (!isRoundModeCurDirection(Rnd))
27309 return SDValue();
27310 }
27311 if (!NewOp)
27312 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27313 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27314 }
27315 case IFMA_OP:
27316 // NOTE: We need to swizzle the operands to pass the multiply operands
27317 // first.
27318 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27319 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27320 case FPCLASSS: {
27321 SDValue Src1 = Op.getOperand(1);
27322 SDValue Imm = Op.getOperand(2);
27323 SDValue Mask = Op.getOperand(3);
27324 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27325 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27326 Subtarget, DAG);
27327 // Need to fill with zeros to ensure the bitcast will produce zeroes
27328 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27329 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27330 DAG.getConstant(0, dl, MVT::v8i1),
27331 FPclassMask, DAG.getIntPtrConstant(0, dl));
27332 return DAG.getBitcast(MVT::i8, Ins);
27333 }
27334
27335 case CMP_MASK_CC: {
27336 MVT MaskVT = Op.getSimpleValueType();
27337 SDValue CC = Op.getOperand(3);
27338 SDValue Mask = Op.getOperand(4);
27339 // We specify 2 possible opcodes for intrinsics with rounding modes.
27340 // First, we check if the intrinsic may have non-default rounding mode,
27341 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27342 if (IntrData->Opc1 != 0) {
27343 SDValue Sae = Op.getOperand(5);
27344 if (isRoundModeSAE(Sae))
27345 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27346 Op.getOperand(2), CC, Mask, Sae);
27347 if (!isRoundModeCurDirection(Sae))
27348 return SDValue();
27349 }
27350 // Default rounding mode.
27351 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27352 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27353 }
27354 case CMP_MASK_SCALAR_CC: {
27355 SDValue Src1 = Op.getOperand(1);
27356 SDValue Src2 = Op.getOperand(2);
27357 SDValue CC = Op.getOperand(3);
27358 SDValue Mask = Op.getOperand(4);
27359
27360 SDValue Cmp;
27361 if (IntrData->Opc1 != 0) {
27362 SDValue Sae = Op.getOperand(5);
27363 if (isRoundModeSAE(Sae))
27364 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27365 else if (!isRoundModeCurDirection(Sae))
27366 return SDValue();
27367 }
27368 // Default rounding mode.
27369 if (!Cmp.getNode())
27370 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27371
27372 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27373 Subtarget, DAG);
27374 // Need to fill with zeros to ensure the bitcast will produce zeroes
27375 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27376 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27377 DAG.getConstant(0, dl, MVT::v8i1),
27378 CmpMask, DAG.getIntPtrConstant(0, dl));
27379 return DAG.getBitcast(MVT::i8, Ins);
27380 }
27381 case COMI: { // Comparison intrinsics
27382 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27383 SDValue LHS = Op.getOperand(1);
27384 SDValue RHS = Op.getOperand(2);
27385 // Some conditions require the operands to be swapped.
27386 if (CC == ISD::SETLT || CC == ISD::SETLE)
27387 std::swap(LHS, RHS);
27388
27389 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27390 SDValue SetCC;
27391 switch (CC) {
27392 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
27393 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27394 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27395 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27396 break;
27397 }
27398 case ISD::SETNE: { // (ZF = 1 or PF = 1)
27399 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27400 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27401 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
27402 break;
27403 }
27404 case ISD::SETGT: // (CF = 0 and ZF = 0)
27405 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
27406 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
27407 break;
27408 }
27409 case ISD::SETGE: // CF = 0
27410 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
27411 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
27412 break;
27413 default:
27414 llvm_unreachable("Unexpected illegal condition!");
27415 }
27416 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27417 }
27418 case COMI_RM: { // Comparison intrinsics with Sae
27419 SDValue LHS = Op.getOperand(1);
27420 SDValue RHS = Op.getOperand(2);
27421 unsigned CondVal = Op.getConstantOperandVal(3);
27422 SDValue Sae = Op.getOperand(4);
27423
27424 SDValue FCmp;
27425 if (isRoundModeCurDirection(Sae))
27426 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
27427 DAG.getTargetConstant(CondVal, dl, MVT::i8));
27428 else if (isRoundModeSAE(Sae))
27429 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
27430 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
27431 else
27432 return SDValue();
27433 // Need to fill with zeros to ensure the bitcast will produce zeroes
27434 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27435 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
27436 DAG.getConstant(0, dl, MVT::v16i1),
27437 FCmp, DAG.getIntPtrConstant(0, dl));
27438 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
27439 DAG.getBitcast(MVT::i16, Ins));
27440 }
27441 case VSHIFT: {
27442 SDValue SrcOp = Op.getOperand(1);
27443 SDValue ShAmt = Op.getOperand(2);
27444 assert(ShAmt.getValueType() == MVT::i32 &&
27445 "Unexpected VSHIFT amount type");
27446
27447 // Catch shift-by-constant.
27448 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
27449 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
27450 Op.getSimpleValueType(), SrcOp,
27451 CShAmt->getZExtValue(), DAG);
27452
27453 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27454 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
27455 SrcOp, ShAmt, 0, Subtarget, DAG);
27456 }
27457 case COMPRESS_EXPAND_IN_REG: {
27458 SDValue Mask = Op.getOperand(3);
27459 SDValue DataToCompress = Op.getOperand(1);
27460 SDValue PassThru = Op.getOperand(2);
27461 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
27462 return Op.getOperand(1);
27463
27464 // Avoid false dependency.
27465 if (PassThru.isUndef())
27466 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27467
27468 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
27469 Mask);
27470 }
27471 case FIXUPIMM:
27472 case FIXUPIMM_MASKZ: {
27473 SDValue Src1 = Op.getOperand(1);
27474 SDValue Src2 = Op.getOperand(2);
27475 SDValue Src3 = Op.getOperand(3);
27476 SDValue Imm = Op.getOperand(4);
27477 SDValue Mask = Op.getOperand(5);
27478 SDValue Passthru = (IntrData->Type == FIXUPIMM)
27479 ? Src1
27480 : getZeroVector(VT, Subtarget, DAG, dl);
27481
27482 unsigned Opc = IntrData->Opc0;
27483 if (IntrData->Opc1 != 0) {
27484 SDValue Sae = Op.getOperand(6);
27485 if (isRoundModeSAE(Sae))
27486 Opc = IntrData->Opc1;
27487 else if (!isRoundModeCurDirection(Sae))
27488 return SDValue();
27489 }
27490
27491 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
27492
27493 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
27494 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
27495
27496 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
27497 }
27498 case ROUNDP: {
27499 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
27500 // Clear the upper bits of the rounding immediate so that the legacy
27501 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27502 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
27503 SDValue RoundingMode =
27504 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
27505 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27506 Op.getOperand(1), RoundingMode);
27507 }
27508 case ROUNDS: {
27509 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
27510 // Clear the upper bits of the rounding immediate so that the legacy
27511 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27512 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
27513 SDValue RoundingMode =
27514 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
27515 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27516 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27517 }
27518 case BEXTRI: {
27519 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27520
27521 uint64_t Imm = Op.getConstantOperandVal(2);
27522 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27523 Op.getValueType());
27524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27525 Op.getOperand(1), Control);
27526 }
27527 // ADC/ADCX/SBB
27528 case ADX: {
27529 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27530 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27531
27532 SDValue Res;
27533 // If the carry in is zero, then we should just use ADD/SUB instead of
27534 // ADC/SBB.
27535 if (isNullConstant(Op.getOperand(1))) {
27536 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27537 Op.getOperand(3));
27538 } else {
27539 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27540 DAG.getConstant(-1, dl, MVT::i8));
27541 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27542 Op.getOperand(3), GenCF.getValue(1));
27543 }
27544 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27545 SDValue Results[] = { SetCC, Res };
27546 return DAG.getMergeValues(Results, dl);
27547 }
27548 case CVTPD2PS_MASK:
27549 case CVTPD2DQ_MASK:
27550 case CVTQQ2PS_MASK:
27551 case TRUNCATE_TO_REG: {
27552 SDValue Src = Op.getOperand(1);
27553 SDValue PassThru = Op.getOperand(2);
27554 SDValue Mask = Op.getOperand(3);
27555
27556 if (isAllOnesConstant(Mask))
27557 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27558
27559 MVT SrcVT = Src.getSimpleValueType();
27560 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27561 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27562 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27563 {Src, PassThru, Mask});
27564 }
27565 case CVTPS2PH_MASK: {
27566 SDValue Src = Op.getOperand(1);
27567 SDValue Rnd = Op.getOperand(2);
27568 SDValue PassThru = Op.getOperand(3);
27569 SDValue Mask = Op.getOperand(4);
27570
27571 unsigned RC = 0;
27572 unsigned Opc = IntrData->Opc0;
27573 bool SAE = Src.getValueType().is512BitVector() &&
27574 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27575 if (SAE) {
27576 Opc = X86ISD::CVTPS2PH_SAE;
27577 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27578 }
27579
27580 if (isAllOnesConstant(Mask))
27581 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27582
27583 if (SAE)
27584 Opc = X86ISD::MCVTPS2PH_SAE;
27585 else
27586 Opc = IntrData->Opc1;
27587 MVT SrcVT = Src.getSimpleValueType();
27588 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27589 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27590 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27591 }
27592 case CVTNEPS2BF16_MASK: {
27593 SDValue Src = Op.getOperand(1);
27594 SDValue PassThru = Op.getOperand(2);
27595 SDValue Mask = Op.getOperand(3);
27596
27597 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27598 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27599
27600 // Break false dependency.
27601 if (PassThru.isUndef())
27602 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27603
27604 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27605 Mask);
27606 }
27607 default:
27608 break;
27609 }
27610 }
27611
27612 switch (IntNo) {
27613 default: return SDValue(); // Don't custom lower most intrinsics.
27614
27615 // ptest and testp intrinsics. The intrinsics these come from are designed to
27616 // return an integer value, not just an instruction, so lower it to the ptest
27617 // or testp pattern and a setcc for the result.
27618 case Intrinsic::x86_avx512_ktestc_b:
27619 case Intrinsic::x86_avx512_ktestc_w:
27620 case Intrinsic::x86_avx512_ktestc_d:
27621 case Intrinsic::x86_avx512_ktestc_q:
27622 case Intrinsic::x86_avx512_ktestz_b:
27623 case Intrinsic::x86_avx512_ktestz_w:
27624 case Intrinsic::x86_avx512_ktestz_d:
27625 case Intrinsic::x86_avx512_ktestz_q:
27626 case Intrinsic::x86_sse41_ptestz:
27627 case Intrinsic::x86_sse41_ptestc:
27628 case Intrinsic::x86_sse41_ptestnzc:
27629 case Intrinsic::x86_avx_ptestz_256:
27630 case Intrinsic::x86_avx_ptestc_256:
27631 case Intrinsic::x86_avx_ptestnzc_256:
27632 case Intrinsic::x86_avx_vtestz_ps:
27633 case Intrinsic::x86_avx_vtestc_ps:
27634 case Intrinsic::x86_avx_vtestnzc_ps:
27635 case Intrinsic::x86_avx_vtestz_pd:
27636 case Intrinsic::x86_avx_vtestc_pd:
27637 case Intrinsic::x86_avx_vtestnzc_pd:
27638 case Intrinsic::x86_avx_vtestz_ps_256:
27639 case Intrinsic::x86_avx_vtestc_ps_256:
27640 case Intrinsic::x86_avx_vtestnzc_ps_256:
27641 case Intrinsic::x86_avx_vtestz_pd_256:
27642 case Intrinsic::x86_avx_vtestc_pd_256:
27643 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27644 unsigned TestOpc = X86ISD::PTEST;
27645 X86::CondCode X86CC;
27646 switch (IntNo) {
27647 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27648 case Intrinsic::x86_avx512_ktestc_b:
27649 case Intrinsic::x86_avx512_ktestc_w:
27650 case Intrinsic::x86_avx512_ktestc_d:
27651 case Intrinsic::x86_avx512_ktestc_q:
27652 // CF = 1
27653 TestOpc = X86ISD::KTEST;
27654 X86CC = X86::COND_B;
27655 break;
27656 case Intrinsic::x86_avx512_ktestz_b:
27657 case Intrinsic::x86_avx512_ktestz_w:
27658 case Intrinsic::x86_avx512_ktestz_d:
27659 case Intrinsic::x86_avx512_ktestz_q:
27660 TestOpc = X86ISD::KTEST;
27661 X86CC = X86::COND_E;
27662 break;
27663 case Intrinsic::x86_avx_vtestz_ps:
27664 case Intrinsic::x86_avx_vtestz_pd:
27665 case Intrinsic::x86_avx_vtestz_ps_256:
27666 case Intrinsic::x86_avx_vtestz_pd_256:
27667 TestOpc = X86ISD::TESTP;
27668 [[fallthrough]];
27669 case Intrinsic::x86_sse41_ptestz:
27670 case Intrinsic::x86_avx_ptestz_256:
27671 // ZF = 1
27672 X86CC = X86::COND_E;
27673 break;
27674 case Intrinsic::x86_avx_vtestc_ps:
27675 case Intrinsic::x86_avx_vtestc_pd:
27676 case Intrinsic::x86_avx_vtestc_ps_256:
27677 case Intrinsic::x86_avx_vtestc_pd_256:
27678 TestOpc = X86ISD::TESTP;
27679 [[fallthrough]];
27680 case Intrinsic::x86_sse41_ptestc:
27681 case Intrinsic::x86_avx_ptestc_256:
27682 // CF = 1
27683 X86CC = X86::COND_B;
27684 break;
27685 case Intrinsic::x86_avx_vtestnzc_ps:
27686 case Intrinsic::x86_avx_vtestnzc_pd:
27687 case Intrinsic::x86_avx_vtestnzc_ps_256:
27688 case Intrinsic::x86_avx_vtestnzc_pd_256:
27689 TestOpc = X86ISD::TESTP;
27690 [[fallthrough]];
27691 case Intrinsic::x86_sse41_ptestnzc:
27692 case Intrinsic::x86_avx_ptestnzc_256:
27693 // ZF and CF = 0
27694 X86CC = X86::COND_A;
27695 break;
27696 }
27697
27698 SDValue LHS = Op.getOperand(1);
27699 SDValue RHS = Op.getOperand(2);
27700 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27701 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27702 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27703 }
27704
27705 case Intrinsic::x86_sse42_pcmpistria128:
27706 case Intrinsic::x86_sse42_pcmpestria128:
27707 case Intrinsic::x86_sse42_pcmpistric128:
27708 case Intrinsic::x86_sse42_pcmpestric128:
27709 case Intrinsic::x86_sse42_pcmpistrio128:
27710 case Intrinsic::x86_sse42_pcmpestrio128:
27711 case Intrinsic::x86_sse42_pcmpistris128:
27712 case Intrinsic::x86_sse42_pcmpestris128:
27713 case Intrinsic::x86_sse42_pcmpistriz128:
27714 case Intrinsic::x86_sse42_pcmpestriz128: {
27715 unsigned Opcode;
27716 X86::CondCode X86CC;
27717 switch (IntNo) {
27718 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27719 case Intrinsic::x86_sse42_pcmpistria128:
27720 Opcode = X86ISD::PCMPISTR;
27721 X86CC = X86::COND_A;
27722 break;
27723 case Intrinsic::x86_sse42_pcmpestria128:
27724 Opcode = X86ISD::PCMPESTR;
27725 X86CC = X86::COND_A;
27726 break;
27727 case Intrinsic::x86_sse42_pcmpistric128:
27728 Opcode = X86ISD::PCMPISTR;
27729 X86CC = X86::COND_B;
27730 break;
27731 case Intrinsic::x86_sse42_pcmpestric128:
27732 Opcode = X86ISD::PCMPESTR;
27733 X86CC = X86::COND_B;
27734 break;
27735 case Intrinsic::x86_sse42_pcmpistrio128:
27736 Opcode = X86ISD::PCMPISTR;
27737 X86CC = X86::COND_O;
27738 break;
27739 case Intrinsic::x86_sse42_pcmpestrio128:
27740 Opcode = X86ISD::PCMPESTR;
27741 X86CC = X86::COND_O;
27742 break;
27743 case Intrinsic::x86_sse42_pcmpistris128:
27744 Opcode = X86ISD::PCMPISTR;
27745 X86CC = X86::COND_S;
27746 break;
27747 case Intrinsic::x86_sse42_pcmpestris128:
27748 Opcode = X86ISD::PCMPESTR;
27749 X86CC = X86::COND_S;
27750 break;
27751 case Intrinsic::x86_sse42_pcmpistriz128:
27752 Opcode = X86ISD::PCMPISTR;
27753 X86CC = X86::COND_E;
27754 break;
27755 case Intrinsic::x86_sse42_pcmpestriz128:
27756 Opcode = X86ISD::PCMPESTR;
27757 X86CC = X86::COND_E;
27758 break;
27759 }
27760 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27761 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27762 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27763 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27764 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27765 }
27766
27767 case Intrinsic::x86_sse42_pcmpistri128:
27768 case Intrinsic::x86_sse42_pcmpestri128: {
27769 unsigned Opcode;
27770 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27771 Opcode = X86ISD::PCMPISTR;
27772 else
27773 Opcode = X86ISD::PCMPESTR;
27774
27775 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27776 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27777 return DAG.getNode(Opcode, dl, VTs, NewOps);
27778 }
27779
27780 case Intrinsic::x86_sse42_pcmpistrm128:
27781 case Intrinsic::x86_sse42_pcmpestrm128: {
27782 unsigned Opcode;
27783 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27784 Opcode = X86ISD::PCMPISTR;
27785 else
27786 Opcode = X86ISD::PCMPESTR;
27787
27788 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27789 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27790 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27791 }
27792
27793 case Intrinsic::eh_sjlj_lsda: {
27794 MachineFunction &MF = DAG.getMachineFunction();
27795 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27796 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27797 auto &Context = MF.getMMI().getContext();
27798 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27799 Twine(MF.getFunctionNumber()));
27800 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
27801 DAG.getMCSymbol(S, PtrVT));
27802 }
27803
27804 case Intrinsic::x86_seh_lsda: {
27805 // Compute the symbol for the LSDA. We know it'll get emitted later.
27806 MachineFunction &MF = DAG.getMachineFunction();
27807 SDValue Op1 = Op.getOperand(1);
27808 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27809 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
27810 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27811
27812 // Generate a simple absolute symbol reference. This intrinsic is only
27813 // supported on 32-bit Windows, which isn't PIC.
27814 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27815 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27816 }
27817
27818 case Intrinsic::eh_recoverfp: {
27819 SDValue FnOp = Op.getOperand(1);
27820 SDValue IncomingFPOp = Op.getOperand(2);
27821 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27822 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27823 if (!Fn)
27824 report_fatal_error(
27825 "llvm.eh.recoverfp must take a function as the first argument");
27826 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27827 }
27828
27829 case Intrinsic::localaddress: {
27830 // Returns one of the stack, base, or frame pointer registers, depending on
27831 // which is used to reference local variables.
27832 MachineFunction &MF = DAG.getMachineFunction();
27833 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27834 unsigned Reg;
27835 if (RegInfo->hasBasePointer(MF))
27836 Reg = RegInfo->getBaseRegister();
27837 else { // Handles the SP or FP case.
27838 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27839 if (CantUseFP)
27840 Reg = RegInfo->getPtrSizedStackRegister(MF);
27841 else
27842 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27843 }
27844 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27845 }
27846 case Intrinsic::x86_avx512_vp2intersect_q_512:
27847 case Intrinsic::x86_avx512_vp2intersect_q_256:
27848 case Intrinsic::x86_avx512_vp2intersect_q_128:
27849 case Intrinsic::x86_avx512_vp2intersect_d_512:
27850 case Intrinsic::x86_avx512_vp2intersect_d_256:
27851 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27852 MVT MaskVT = Op.getSimpleValueType();
27853
27854 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27855 SDLoc DL(Op);
27856
27857 SDValue Operation =
27858 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27859 Op->getOperand(1), Op->getOperand(2));
27860
27861 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
27862 MaskVT, Operation);
27863 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
27864 MaskVT, Operation);
27865 return DAG.getMergeValues({Result0, Result1}, DL);
27866 }
27867 case Intrinsic::x86_mmx_pslli_w:
27868 case Intrinsic::x86_mmx_pslli_d:
27869 case Intrinsic::x86_mmx_pslli_q:
27870 case Intrinsic::x86_mmx_psrli_w:
27871 case Intrinsic::x86_mmx_psrli_d:
27872 case Intrinsic::x86_mmx_psrli_q:
27873 case Intrinsic::x86_mmx_psrai_w:
27874 case Intrinsic::x86_mmx_psrai_d: {
27875 SDLoc DL(Op);
27876 SDValue ShAmt = Op.getOperand(2);
27877 // If the argument is a constant, convert it to a target constant.
27878 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27879 // Clamp out-of-bounds shift amounts, since otherwise they are masked
27880 // to 8 bits, which may bring them back in bounds.
27881 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27882 if (ShiftAmount == 0)
27883 return Op.getOperand(1);
27884
27885 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27886 Op.getOperand(0), Op.getOperand(1),
27887 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27888 }
27889
27890 unsigned NewIntrinsic;
27891 switch (IntNo) {
27892 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27893 case Intrinsic::x86_mmx_pslli_w:
27894 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27895 break;
27896 case Intrinsic::x86_mmx_pslli_d:
27897 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27898 break;
27899 case Intrinsic::x86_mmx_pslli_q:
27900 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27901 break;
27902 case Intrinsic::x86_mmx_psrli_w:
27903 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27904 break;
27905 case Intrinsic::x86_mmx_psrli_d:
27906 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27907 break;
27908 case Intrinsic::x86_mmx_psrli_q:
27909 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27910 break;
27911 case Intrinsic::x86_mmx_psrai_w:
27912 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27913 break;
27914 case Intrinsic::x86_mmx_psrai_d:
27915 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27916 break;
27917 }
27918
27919 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
27920 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27921 // MMX register.
27922 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27923 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27924 DAG.getTargetConstant(NewIntrinsic, DL,
27925 getPointerTy(DAG.getDataLayout())),
27926 Op.getOperand(1), ShAmt);
27927 }
27928 case Intrinsic::thread_pointer: {
27929 if (Subtarget.isTargetELF()) {
27930 SDLoc dl(Op);
27931 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27932 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27933 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
27934 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27935 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27936 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27937 }
27938 report_fatal_error(
27939 "Target OS doesn't support __builtin_thread_pointer() yet.");
27940 }
27941 }
27942}
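The rounding-mode helpers defined at the top of LowerINTRINSIC_WO_CHAIN all reason about the AVX-512 embedded rounding immediate: NO_EXC marks suppress-all-exceptions, and the remaining low bits pick one of the four explicit rounding modes or the current direction. A standalone sketch of the isRoundModeSAEToX check, with constants mirroring X86::STATIC_ROUNDING (the numeric values here are assumed for illustration; the authoritative definitions live in X86BaseInfo.h):

#include <cassert>

// Illustrative constants mirroring X86::STATIC_ROUNDING.
enum RoundingControl : unsigned {
  TO_NEAREST_INT = 0, TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3,
  CUR_DIRECTION = 4, NO_EXC = 8
};

// Same shape as the isRoundModeSAEToX lambda: accept only "suppress all
// exceptions" combined with one of the four explicit rounding modes.
static bool isSAEWithExplicitRounding(unsigned Imm, unsigned &RC) {
  if (!(Imm & NO_EXC))
    return false;
  RC = Imm ^ NO_EXC; // clear the NO_EXC bit, keep the rounding bits
  return RC == TO_NEAREST_INT || RC == TO_NEG_INF ||
         RC == TO_POS_INF || RC == TO_ZERO;
}

int main() {
  unsigned RC = 0;
  assert(isSAEWithExplicitRounding(NO_EXC | TO_ZERO, RC) && RC == TO_ZERO);
  assert(!isSAEWithExplicitRounding(CUR_DIRECTION, RC)); // NO_EXC bit not set
  return 0;
}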
27943
27944static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27945 SDValue Src, SDValue Mask, SDValue Base,
27946 SDValue Index, SDValue ScaleOp, SDValue Chain,
27947 const X86Subtarget &Subtarget) {
27948 SDLoc dl(Op);
27949 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27950 // Scale must be constant.
27951 if (!C)
27952 return SDValue();
27953 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27954 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27955 TLI.getPointerTy(DAG.getDataLayout()));
27956 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27957 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27958 // If source is undef or we know it won't be used, use a zero vector
27959 // to break register dependency.
27960 // TODO: use undef instead and let BreakFalseDeps deal with it?
27961 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27962 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27963
27964 // Cast mask to an integer type.
27965 Mask = DAG.getBitcast(MaskVT, Mask);
27966
27967 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27968
27969 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27970 SDValue Res =
27971 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27972 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27973 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27974}
27975
27976static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27977 SDValue Src, SDValue Mask, SDValue Base,
27978 SDValue Index, SDValue ScaleOp, SDValue Chain,
27979 const X86Subtarget &Subtarget) {
27980 MVT VT = Op.getSimpleValueType();
27981 SDLoc dl(Op);
27982 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27983 // Scale must be constant.
27984 if (!C)
27985 return SDValue();
27986 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27987 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27988 TLI.getPointerTy(DAG.getDataLayout()));
27989 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27990 VT.getVectorNumElements());
27991 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27992
27993 // We support two versions of the gather intrinsics. One with scalar mask and
27994 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27995 if (Mask.getValueType() != MaskVT)
27996 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27997
27998 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27999 // If source is undef or we know it won't be used, use a zero vector
28000 // to break register dependency.
28001 // TODO: use undef instead and let BreakFalseDeps deal with it?
28002 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28003 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28004
28005 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28006
28007 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28008 SDValue Res =
28009 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28010 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28011 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28012}
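The vXi1 mask width chosen above is the number of lanes actually gathered, i.e. the smaller of the index and result element counts. A sketch of that arithmetic only:

#include <algorithm>
#include <cassert>

int main() {
  // Gathering 32-bit elements into a v4i32 result using a v2i64 index vector:
  // only min(2, 4) = 2 lanes are gathered, so the mask type becomes v2i1.
  unsigned IndexElts = 2, ResultElts = 4;
  assert(std::min(IndexElts, ResultElts) == 2u);
  return 0;
}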
28013
28014static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28015 SDValue Src, SDValue Mask, SDValue Base,
28016 SDValue Index, SDValue ScaleOp, SDValue Chain,
28017 const X86Subtarget &Subtarget) {
28018 SDLoc dl(Op);
28019 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28020 // Scale must be constant.
28021 if (!C)
28022 return SDValue();
28023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28024 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28025 TLI.getPointerTy(DAG.getDataLayout()));
28026 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28027 Src.getSimpleValueType().getVectorNumElements());
28028 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28029
28030 // We support two versions of the scatter intrinsics. One with scalar mask and
28031 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28032 if (Mask.getValueType() != MaskVT)
28033 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28034
28035 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28036
28037 SDVTList VTs = DAG.getVTList(MVT::Other);
28038 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28039 SDValue Res =
28040 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28041 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28042 return Res;
28043}
28044
28045static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28046 SDValue Mask, SDValue Base, SDValue Index,
28047 SDValue ScaleOp, SDValue Chain,
28048 const X86Subtarget &Subtarget) {
28049 SDLoc dl(Op);
28050 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28051 // Scale must be constant.
28052 if (!C)
28053 return SDValue();
28054 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28055 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28056 TLI.getPointerTy(DAG.getDataLayout()));
28057 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28058 SDValue Segment = DAG.getRegister(0, MVT::i32);
28059 MVT MaskVT =
28060 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28061 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28062 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28063 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28064 return SDValue(Res, 0);
28065}
28066
28067/// Handles the lowering of builtin intrinsics with chain that return their
28068/// value into registers EDX:EAX.
28069 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28070/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28071/// TargetOpcode.
28072/// Returns a Glue value which can be used to add extra copy-from-reg if the
28073/// expanded intrinsics implicitly defines extra registers (i.e. not just
28074/// EDX:EAX).
28075static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28076 SelectionDAG &DAG,
28077 unsigned TargetOpcode,
28078 unsigned SrcReg,
28079 const X86Subtarget &Subtarget,
28080 SmallVectorImpl<SDValue> &Results) {
28081 SDValue Chain = N->getOperand(0);
28082 SDValue Glue;
28083
28084 if (SrcReg) {
28085 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28086 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28087 Glue = Chain.getValue(1);
28088 }
28089
28090 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28091 SDValue N1Ops[] = {Chain, Glue};
28092 SDNode *N1 = DAG.getMachineNode(
28093 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28094 Chain = SDValue(N1, 0);
28095
28096  // Read the result back out of registers EDX:EAX (RAX/RDX on 64-bit targets).
28097 SDValue LO, HI;
28098 if (Subtarget.is64Bit()) {
28099 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28100 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28101 LO.getValue(2));
28102 } else {
28103 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28104 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28105 LO.getValue(2));
28106 }
28107 Chain = HI.getValue(1);
28108 Glue = HI.getValue(2);
28109
28110 if (Subtarget.is64Bit()) {
28111 // Merge the two 32-bit values into a 64-bit one.
28112 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28113 DAG.getConstant(32, DL, MVT::i8));
28114 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28115 Results.push_back(Chain);
28116 return Glue;
28117 }
28118
28119 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28120 SDValue Ops[] = { LO, HI };
28121 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28122 Results.push_back(Pair);
28123 Results.push_back(Chain);
28124 return Glue;
28125}
28126
28127/// Handles the lowering of builtin intrinsics that read the time stamp counter
28128/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28129/// READCYCLECOUNTER nodes.
28130static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28131 SelectionDAG &DAG,
28132 const X86Subtarget &Subtarget,
28133 SmallVectorImpl<SDValue> &Results) {
28134 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28135 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28136 // and the EAX register is loaded with the low-order 32 bits.
28137 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28138 /* NoRegister */0, Subtarget,
28139 Results);
28140 if (Opcode != X86::RDTSCP)
28141 return;
28142
28143 SDValue Chain = Results[1];
28144   // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
28145 // the ECX register. Add 'ecx' explicitly to the chain.
28146 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28147 Results[1] = ecx;
28148 Results.push_back(ecx.getValue(1));
28149}
28150
28151static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28152 SelectionDAG &DAG) {
28153 SmallVector<SDValue, 3> Results;
28154 SDLoc DL(Op);
28155 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28156 Results);
28157 return DAG.getMergeValues(Results, DL);
28158}
28159
28160static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28161 MachineFunction &MF = DAG.getMachineFunction();
28162 SDValue Chain = Op.getOperand(0);
28163 SDValue RegNode = Op.getOperand(2);
28164 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28165 if (!EHInfo)
28166 report_fatal_error("EH registrations only live in functions using WinEH");
28167
28168 // Cast the operand to an alloca, and remember the frame index.
28169 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28170 if (!FINode)
28171 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28172 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28173
28174 // Return the chain operand without making any DAG nodes.
28175 return Chain;
28176}
28177
28178static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28179 MachineFunction &MF = DAG.getMachineFunction();
28180 SDValue Chain = Op.getOperand(0);
28181 SDValue EHGuard = Op.getOperand(2);
28182 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28183 if (!EHInfo)
28184 report_fatal_error("EHGuard only live in functions using WinEH");
28185
28186 // Cast the operand to an alloca, and remember the frame index.
28187 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28188 if (!FINode)
28189 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28190 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28191
28192 // Return the chain operand without making any DAG nodes.
28193 return Chain;
28194}
28195
28196/// Emit Truncating Store with signed or unsigned saturation.
28197static SDValue
28198EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28199 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28200 SelectionDAG &DAG) {
28201 SDVTList VTs = DAG.getVTList(MVT::Other);
28202 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28203 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28204 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28205 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28206}
28207
28208/// Emit Masked Truncating Store with signed or unsigned saturation.
28209static SDValue
28210EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28211 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28212 MachineMemOperand *MMO, SelectionDAG &DAG) {
28213 SDVTList VTs = DAG.getVTList(MVT::Other);
28214 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28215 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28216 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28217}
28218
28219static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28220 SelectionDAG &DAG) {
28221 unsigned IntNo = Op.getConstantOperandVal(1);
28222 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28223 if (!IntrData) {
28224 switch (IntNo) {
28225
28226 case Intrinsic::swift_async_context_addr: {
28227 SDLoc dl(Op);
28228 auto &MF = DAG.getMachineFunction();
28229 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28230 if (Subtarget.is64Bit()) {
28231 MF.getFrameInfo().setFrameAddressIsTaken(true);
28232 X86FI->setHasSwiftAsyncContext(true);
28233 SDValue Chain = Op->getOperand(0);
28234 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28235 SDValue Result =
28236 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28237 DAG.getTargetConstant(8, dl, MVT::i32)),
28238 0);
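            // The returned address is %rbp - 8, the slot where the Swift
            // extended frame is expected to hold the async context pointer.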
28239 // Return { result, chain }.
28240 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28241 CopyRBP.getValue(1));
28242 } else {
28243         // 32-bit, so there is no special extended frame; create or reuse an
28244         // existing stack slot.
28245 if (!X86FI->getSwiftAsyncContextFrameIdx())
28246 X86FI->setSwiftAsyncContextFrameIdx(
28247 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28248 SDValue Result =
28249 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28250 // Return { result, chain }.
28251 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28252 Op->getOperand(0));
28253 }
28254 }
28255
28256 case llvm::Intrinsic::x86_seh_ehregnode:
28257 return MarkEHRegistrationNode(Op, DAG);
28258 case llvm::Intrinsic::x86_seh_ehguard:
28259 return MarkEHGuard(Op, DAG);
28260 case llvm::Intrinsic::x86_rdpkru: {
28261 SDLoc dl(Op);
28262 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28263 // Create a RDPKRU node and pass 0 to the ECX parameter.
28264 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28265 DAG.getConstant(0, dl, MVT::i32));
28266 }
28267 case llvm::Intrinsic::x86_wrpkru: {
28268 SDLoc dl(Op);
28269 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28270 // to the EDX and ECX parameters.
28271 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28272 Op.getOperand(0), Op.getOperand(2),
28273 DAG.getConstant(0, dl, MVT::i32),
28274 DAG.getConstant(0, dl, MVT::i32));
28275 }
28276 case llvm::Intrinsic::asan_check_memaccess: {
28277 // Mark this as adjustsStack because it will be lowered to a call.
28278 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28279 // Don't do anything here, we will expand these intrinsics out later.
28280 return Op;
28281 }
28282 case llvm::Intrinsic::x86_flags_read_u32:
28283 case llvm::Intrinsic::x86_flags_read_u64:
28284 case llvm::Intrinsic::x86_flags_write_u32:
28285 case llvm::Intrinsic::x86_flags_write_u64: {
28286 // We need a frame pointer because this will get lowered to a PUSH/POP
28287 // sequence.
28288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28289 MFI.setHasCopyImplyingStackAdjustment(true);
28290 // Don't do anything here, we will expand these intrinsics out later
28291 // during FinalizeISel in EmitInstrWithCustomInserter.
28292 return Op;
28293 }
28294 case Intrinsic::x86_lwpins32:
28295 case Intrinsic::x86_lwpins64:
28296 case Intrinsic::x86_umwait:
28297 case Intrinsic::x86_tpause: {
28298 SDLoc dl(Op);
28299 SDValue Chain = Op->getOperand(0);
28300 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28301 unsigned Opcode;
28302
28303 switch (IntNo) {
28304       default: llvm_unreachable("Impossible intrinsic");
28305 case Intrinsic::x86_umwait:
28306 Opcode = X86ISD::UMWAIT;
28307 break;
28308 case Intrinsic::x86_tpause:
28309 Opcode = X86ISD::TPAUSE;
28310 break;
28311 case Intrinsic::x86_lwpins32:
28312 case Intrinsic::x86_lwpins64:
28313 Opcode = X86ISD::LWPINS;
28314 break;
28315 }
28316
28317 SDValue Operation =
28318 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28319 Op->getOperand(3), Op->getOperand(4));
28320 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28321 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28322 Operation.getValue(1));
28323 }
28324 case Intrinsic::x86_enqcmd:
28325 case Intrinsic::x86_enqcmds: {
28326 SDLoc dl(Op);
28327 SDValue Chain = Op.getOperand(0);
28328 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28329 unsigned Opcode;
28330 switch (IntNo) {
28331       default: llvm_unreachable("Impossible intrinsic!");
28332 case Intrinsic::x86_enqcmd:
28333 Opcode = X86ISD::ENQCMD;
28334 break;
28335 case Intrinsic::x86_enqcmds:
28336 Opcode = X86ISD::ENQCMDS;
28337 break;
28338 }
28339 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28340 Op.getOperand(3));
28341 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28342 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28343 Operation.getValue(1));
28344 }
28345 case Intrinsic::x86_aesenc128kl:
28346 case Intrinsic::x86_aesdec128kl:
28347 case Intrinsic::x86_aesenc256kl:
28348 case Intrinsic::x86_aesdec256kl: {
28349 SDLoc DL(Op);
28350 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28351 SDValue Chain = Op.getOperand(0);
28352 unsigned Opcode;
28353
28354 switch (IntNo) {
28355       default: llvm_unreachable("Impossible intrinsic");
28356 case Intrinsic::x86_aesenc128kl:
28357 Opcode = X86ISD::AESENC128KL;
28358 break;
28359 case Intrinsic::x86_aesdec128kl:
28360 Opcode = X86ISD::AESDEC128KL;
28361 break;
28362 case Intrinsic::x86_aesenc256kl:
28363 Opcode = X86ISD::AESENC256KL;
28364 break;
28365 case Intrinsic::x86_aesdec256kl:
28366 Opcode = X86ISD::AESDEC256KL;
28367 break;
28368 }
28369
28370 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28371 MachineMemOperand *MMO = MemIntr->getMemOperand();
28372 EVT MemVT = MemIntr->getMemoryVT();
28373 SDValue Operation = DAG.getMemIntrinsicNode(
28374 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28375 MMO);
28376 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28377
28378 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28379 {ZF, Operation.getValue(0), Operation.getValue(2)});
28380 }
28381 case Intrinsic::x86_aesencwide128kl:
28382 case Intrinsic::x86_aesdecwide128kl:
28383 case Intrinsic::x86_aesencwide256kl:
28384 case Intrinsic::x86_aesdecwide256kl: {
28385 SDLoc DL(Op);
28386 SDVTList VTs = DAG.getVTList(
28387 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28388 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28389 SDValue Chain = Op.getOperand(0);
28390 unsigned Opcode;
28391
28392 switch (IntNo) {
28393       default: llvm_unreachable("Impossible intrinsic");
28394 case Intrinsic::x86_aesencwide128kl:
28395 Opcode = X86ISD::AESENCWIDE128KL;
28396 break;
28397 case Intrinsic::x86_aesdecwide128kl:
28398 Opcode = X86ISD::AESDECWIDE128KL;
28399 break;
28400 case Intrinsic::x86_aesencwide256kl:
28401 Opcode = X86ISD::AESENCWIDE256KL;
28402 break;
28403 case Intrinsic::x86_aesdecwide256kl:
28404 Opcode = X86ISD::AESDECWIDE256KL;
28405 break;
28406 }
28407
28408 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28409 MachineMemOperand *MMO = MemIntr->getMemOperand();
28410 EVT MemVT = MemIntr->getMemoryVT();
28411 SDValue Operation = DAG.getMemIntrinsicNode(
28412 Opcode, DL, VTs,
28413 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
28414 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
28415 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
28416 MemVT, MMO);
28417 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
28418
28419 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28420 {ZF, Operation.getValue(1), Operation.getValue(2),
28421 Operation.getValue(3), Operation.getValue(4),
28422 Operation.getValue(5), Operation.getValue(6),
28423 Operation.getValue(7), Operation.getValue(8),
28424 Operation.getValue(9)});
28425 }
28426 case Intrinsic::x86_testui: {
28427 SDLoc dl(Op);
28428 SDValue Chain = Op.getOperand(0);
28429 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28430 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
28431 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28432 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28433 Operation.getValue(1));
28434 }
28435 case Intrinsic::x86_atomic_bts_rm:
28436 case Intrinsic::x86_atomic_btc_rm:
28437 case Intrinsic::x86_atomic_btr_rm: {
28438 SDLoc DL(Op);
28439 MVT VT = Op.getSimpleValueType();
28440 SDValue Chain = Op.getOperand(0);
28441 SDValue Op1 = Op.getOperand(2);
28442 SDValue Op2 = Op.getOperand(3);
28443 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28444 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28445 : X86ISD::LBTR_RM;
28446 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28447 SDValue Res =
28448 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28449 {Chain, Op1, Op2}, VT, MMO);
28450 Chain = Res.getValue(1);
28451 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28452 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28453 }
28454 case Intrinsic::x86_atomic_bts:
28455 case Intrinsic::x86_atomic_btc:
28456 case Intrinsic::x86_atomic_btr: {
28457 SDLoc DL(Op);
28458 MVT VT = Op.getSimpleValueType();
28459 SDValue Chain = Op.getOperand(0);
28460 SDValue Op1 = Op.getOperand(2);
28461 SDValue Op2 = Op.getOperand(3);
28462 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28463 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28464 : X86ISD::LBTR;
28465 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28466 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28467 SDValue Res =
28468 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28469 {Chain, Op1, Op2, Size}, VT, MMO);
28470 Chain = Res.getValue(1);
28471 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28472 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
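          // The 0/1 carry result below is shifted back to bit position Imm, so
          // the value returned to the caller carries the original bit in place
          // (e.g. a set bit 5 comes back as 0x20 rather than 1).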
28473 if (Imm)
28474 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28475 DAG.getShiftAmountConstant(Imm, VT, DL));
28476 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28477 }
28478 case Intrinsic::x86_cmpccxadd32:
28479 case Intrinsic::x86_cmpccxadd64: {
28480 SDLoc DL(Op);
28481 SDValue Chain = Op.getOperand(0);
28482 SDValue Addr = Op.getOperand(2);
28483 SDValue Src1 = Op.getOperand(3);
28484 SDValue Src2 = Op.getOperand(4);
28485 SDValue CC = Op.getOperand(5);
28486 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28487 SDValue Operation = DAG.getMemIntrinsicNode(
28488 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28489 MVT::i32, MMO);
28490 return Operation;
28491 }
28492 case Intrinsic::x86_aadd32:
28493 case Intrinsic::x86_aadd64:
28494 case Intrinsic::x86_aand32:
28495 case Intrinsic::x86_aand64:
28496 case Intrinsic::x86_aor32:
28497 case Intrinsic::x86_aor64:
28498 case Intrinsic::x86_axor32:
28499 case Intrinsic::x86_axor64: {
28500 SDLoc DL(Op);
28501 SDValue Chain = Op.getOperand(0);
28502 SDValue Op1 = Op.getOperand(2);
28503 SDValue Op2 = Op.getOperand(3);
28504 MVT VT = Op2.getSimpleValueType();
28505 unsigned Opc = 0;
28506 switch (IntNo) {
28507 default:
28508         llvm_unreachable("Unknown Intrinsic");
28509 case Intrinsic::x86_aadd32:
28510 case Intrinsic::x86_aadd64:
28511 Opc = X86ISD::AADD;
28512 break;
28513 case Intrinsic::x86_aand32:
28514 case Intrinsic::x86_aand64:
28515 Opc = X86ISD::AAND;
28516 break;
28517 case Intrinsic::x86_aor32:
28518 case Intrinsic::x86_aor64:
28519 Opc = X86ISD::AOR;
28520 break;
28521 case Intrinsic::x86_axor32:
28522 case Intrinsic::x86_axor64:
28523 Opc = X86ISD::AXOR;
28524 break;
28525 }
28526 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28527 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28528 {Chain, Op1, Op2}, VT, MMO);
28529 }
28530 case Intrinsic::x86_atomic_add_cc:
28531 case Intrinsic::x86_atomic_sub_cc:
28532 case Intrinsic::x86_atomic_or_cc:
28533 case Intrinsic::x86_atomic_and_cc:
28534 case Intrinsic::x86_atomic_xor_cc: {
28535 SDLoc DL(Op);
28536 SDValue Chain = Op.getOperand(0);
28537 SDValue Op1 = Op.getOperand(2);
28538 SDValue Op2 = Op.getOperand(3);
28539 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28540 MVT VT = Op2.getSimpleValueType();
28541 unsigned Opc = 0;
28542 switch (IntNo) {
28543 default:
28544         llvm_unreachable("Unknown Intrinsic");
28545 case Intrinsic::x86_atomic_add_cc:
28546 Opc = X86ISD::LADD;
28547 break;
28548 case Intrinsic::x86_atomic_sub_cc:
28549 Opc = X86ISD::LSUB;
28550 break;
28551 case Intrinsic::x86_atomic_or_cc:
28552 Opc = X86ISD::LOR;
28553 break;
28554 case Intrinsic::x86_atomic_and_cc:
28555 Opc = X86ISD::LAND;
28556 break;
28557 case Intrinsic::x86_atomic_xor_cc:
28558 Opc = X86ISD::LXOR;
28559 break;
28560 }
28561 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28562 SDValue LockArith =
28563 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28564 {Chain, Op1, Op2}, VT, MMO);
28565 Chain = LockArith.getValue(1);
28566 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28567 }
28568 }
28569 return SDValue();
28570 }
28571
28572 SDLoc dl(Op);
28573 switch(IntrData->Type) {
28574   default: llvm_unreachable("Unknown Intrinsic Type");
28575 case RDSEED:
28576 case RDRAND: {
28577 // Emit the node with the right value type.
28578 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28579 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28580
28581 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28582     // Otherwise return the random value (always 0 in that case), cast to i32.
28583 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28584 DAG.getConstant(1, dl, Op->getValueType(1)),
28585 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28586 SDValue(Result.getNode(), 1)};
28587 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28588
28589 // Return { result, isValid, chain }.
28590 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28591 SDValue(Result.getNode(), 2));
28592 }
28593 case GATHER_AVX2: {
28594 SDValue Chain = Op.getOperand(0);
28595 SDValue Src = Op.getOperand(2);
28596 SDValue Base = Op.getOperand(3);
28597 SDValue Index = Op.getOperand(4);
28598 SDValue Mask = Op.getOperand(5);
28599 SDValue Scale = Op.getOperand(6);
28600 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28601 Scale, Chain, Subtarget);
28602 }
28603 case GATHER: {
28604     // gather(v1, mask, index, base, scale);
28605 SDValue Chain = Op.getOperand(0);
28606 SDValue Src = Op.getOperand(2);
28607 SDValue Base = Op.getOperand(3);
28608 SDValue Index = Op.getOperand(4);
28609 SDValue Mask = Op.getOperand(5);
28610 SDValue Scale = Op.getOperand(6);
28611 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28612 Chain, Subtarget);
28613 }
28614 case SCATTER: {
28615     // scatter(base, mask, index, v1, scale);
28616 SDValue Chain = Op.getOperand(0);
28617 SDValue Base = Op.getOperand(2);
28618 SDValue Mask = Op.getOperand(3);
28619 SDValue Index = Op.getOperand(4);
28620 SDValue Src = Op.getOperand(5);
28621 SDValue Scale = Op.getOperand(6);
28622 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28623 Scale, Chain, Subtarget);
28624 }
28625 case PREFETCH: {
28626 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28627     assert((HintVal == 2 || HintVal == 3) &&
28628            "Wrong prefetch hint in intrinsic: should be 2 or 3");
28629 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
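          // Following the _MM_HINT_* numbering, a hint of 3 (_MM_HINT_T0)
          // selects Opc0 and a hint of 2 (_MM_HINT_T1) selects Opc1.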
28630 SDValue Chain = Op.getOperand(0);
28631 SDValue Mask = Op.getOperand(2);
28632 SDValue Index = Op.getOperand(3);
28633 SDValue Base = Op.getOperand(4);
28634 SDValue Scale = Op.getOperand(5);
28635 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28636 Subtarget);
28637 }
28638 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28639 case RDTSC: {
28640 SmallVector<SDValue, 2> Results;
28641 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28642 Results);
28643 return DAG.getMergeValues(Results, dl);
28644 }
28645 // Read Performance Monitoring Counters.
28646 case RDPMC:
28647 // Read Processor Register.
28648 case RDPRU:
28649   // Get Extended Control Register.
28650 case XGETBV: {
28651 SmallVector<SDValue, 2> Results;
28652
28653 // RDPMC uses ECX to select the index of the performance counter to read.
28654 // RDPRU uses ECX to select the processor register to read.
28655 // XGETBV uses ECX to select the index of the XCR register to return.
28656 // The result is stored into registers EDX:EAX.
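        // For example, XGETBV with ECX == 0 reads XCR0; the helper below then
        // merges the EDX:EAX halves into the single i64 result.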
28657 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28658 Subtarget, Results);
28659 return DAG.getMergeValues(Results, dl);
28660 }
28661 // XTEST intrinsics.
28662 case XTEST: {
28663 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28664 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28665
28666 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28667 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28668 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28669 Ret, SDValue(InTrans.getNode(), 1));
28670 }
28671 case TRUNCATE_TO_MEM_VI8:
28672 case TRUNCATE_TO_MEM_VI16:
28673 case TRUNCATE_TO_MEM_VI32: {
28674 SDValue Mask = Op.getOperand(4);
28675 SDValue DataToTruncate = Op.getOperand(3);
28676 SDValue Addr = Op.getOperand(2);
28677 SDValue Chain = Op.getOperand(0);
28678
28679 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28680     assert(MemIntr && "Expected MemIntrinsicSDNode!");
28681
28682 EVT MemVT = MemIntr->getMemoryVT();
28683
28684 uint16_t TruncationOp = IntrData->Opc0;
28685 switch (TruncationOp) {
28686 case X86ISD::VTRUNC: {
28687 if (isAllOnesConstant(Mask)) // return just a truncate store
28688 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28689 MemIntr->getMemOperand());
28690
28691 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28692 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28693 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28694
28695 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28696 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28697 true /* truncating */);
28698 }
28699 case X86ISD::VTRUNCUS:
28700 case X86ISD::VTRUNCS: {
28701 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28702 if (isAllOnesConstant(Mask))
28703 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28704 MemIntr->getMemOperand(), DAG);
28705
28706 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28707 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28708
28709 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28710 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28711 }
28712 default:
28713       llvm_unreachable("Unsupported truncstore intrinsic");
28714 }
28715 }
28716 }
28717}
28718
28719SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28720 SelectionDAG &DAG) const {
28721 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28722 MFI.setReturnAddressIsTaken(true);
28723
28724 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
28725 return SDValue();
28726
28727 unsigned Depth = Op.getConstantOperandVal(0);
28728 SDLoc dl(Op);
28729 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28730
28731 if (Depth > 0) {
28732 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28733 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28734 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
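        // With a frame pointer, the caller's return address sits one pointer
        // slot above the saved frame pointer, hence the load from
        // FrameAddr + Offset.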
28735 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28736 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28737 MachinePointerInfo());
28738 }
28739
28740 // Just load the return address.
28741 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28742 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28743 MachinePointerInfo());
28744}
28745
28746SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28747 SelectionDAG &DAG) const {
28748 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28749 return getReturnAddressFrameIndex(DAG);
28750}
28751
28752SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28753 MachineFunction &MF = DAG.getMachineFunction();
28754 MachineFrameInfo &MFI = MF.getFrameInfo();
28755 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28756 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28757 EVT VT = Op.getValueType();
28758
28759 MFI.setFrameAddressIsTaken(true);
28760
28761 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28762 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28763 // is not possible to crawl up the stack without looking at the unwind codes
28764 // simultaneously.
28765 int FrameAddrIndex = FuncInfo->getFAIndex();
28766 if (!FrameAddrIndex) {
28767 // Set up a frame object for the return address.
28768 unsigned SlotSize = RegInfo->getSlotSize();
28769 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28770 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28771 FuncInfo->setFAIndex(FrameAddrIndex);
28772 }
28773 return DAG.getFrameIndex(FrameAddrIndex, VT);
28774 }
28775
28776 unsigned FrameReg =
28777 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28778 SDLoc dl(Op); // FIXME probably not meaningful
28779 unsigned Depth = Op.getConstantOperandVal(0);
28780   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28781           (FrameReg == X86::EBP && VT == MVT::i32)) &&
28782          "Invalid Frame Register!");
28783 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28784 while (Depth--)
28785 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28786 MachinePointerInfo());
28787 return FrameAddr;
28788}
28789
28790// FIXME? Maybe this could be a TableGen attribute on some registers and
28791// this table could be generated automatically from RegInfo.
28792Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28793 const MachineFunction &MF) const {
28794 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28795
28796 Register Reg = StringSwitch<unsigned>(RegName)
28797 .Case("esp", X86::ESP)
28798 .Case("rsp", X86::RSP)
28799 .Case("ebp", X86::EBP)
28800 .Case("rbp", X86::RBP)
28801 .Default(0);
28802
28803 if (Reg == X86::EBP || Reg == X86::RBP) {
28804 if (!TFI.hasFP(MF))
28805 report_fatal_error("register " + StringRef(RegName) +
28806 " is allocatable: function has no frame pointer");
28807#ifndef NDEBUG
28808 else {
28809 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28810 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28811       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28812              "Invalid Frame Register!");
28813 }
28814#endif
28815 }
28816
28817 if (Reg)
28818 return Reg;
28819
28820 report_fatal_error("Invalid register name global variable");
28821}
28822
28823SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28824 SelectionDAG &DAG) const {
28825 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28826 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28827}
28828
28829Register X86TargetLowering::getExceptionPointerRegister(
28830 const Constant *PersonalityFn) const {
28831 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28832 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28833
28834 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28835}
28836
28837Register X86TargetLowering::getExceptionSelectorRegister(
28838 const Constant *PersonalityFn) const {
28839 // Funclet personalities don't use selectors (the runtime does the selection).
28840 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28841 return X86::NoRegister;
28842 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28843}
28844
28845bool X86TargetLowering::needsFixedCatchObjects() const {
28846 return Subtarget.isTargetWin64();
28847}
28848
28849SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28850 SDValue Chain = Op.getOperand(0);
28851 SDValue Offset = Op.getOperand(1);
28852 SDValue Handler = Op.getOperand(2);
28853 SDLoc dl (Op);
28854
28855 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28856 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28857 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28858   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28859           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28860          "Invalid Frame Register!");
28861 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28862 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28863
28864 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28865 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28866 dl));
28867 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28868 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28869 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28870
28871 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28872 DAG.getRegister(StoreAddrReg, PtrVT));
28873}
28874
28875SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28876 SelectionDAG &DAG) const {
28877 SDLoc DL(Op);
28878 // If the subtarget is not 64bit, we may need the global base reg
28879 // after isel expand pseudo, i.e., after CGBR pass ran.
28880 // Therefore, ask for the GlobalBaseReg now, so that the pass
28881 // inserts the code for us in case we need it.
28882 // Otherwise, we will end up in a situation where we will
28883 // reference a virtual register that is not defined!
28884 if (!Subtarget.is64Bit()) {
28885 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28886 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28887 }
28888 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28889 DAG.getVTList(MVT::i32, MVT::Other),
28890 Op.getOperand(0), Op.getOperand(1));
28891}
28892
28893SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28894 SelectionDAG &DAG) const {
28895 SDLoc DL(Op);
28896 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28897 Op.getOperand(0), Op.getOperand(1));
28898}
28899
28900SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28901 SelectionDAG &DAG) const {
28902 SDLoc DL(Op);
28903 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28904 Op.getOperand(0));
28905}
28906
28907static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28908 return Op.getOperand(0);
28909}
28910
28911SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28912 SelectionDAG &DAG) const {
28913 SDValue Root = Op.getOperand(0);
28914 SDValue Trmp = Op.getOperand(1); // trampoline
28915 SDValue FPtr = Op.getOperand(2); // nested function
28916 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28917 SDLoc dl (Op);
28918
28919 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28920 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28921
28922 if (Subtarget.is64Bit()) {
28923 SDValue OutChains[6];
28924
28925 // Large code-model.
28926 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28927 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28928
28929 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28930 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28931
28932 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
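        // Roughly, the stores below assemble this 23-byte stub in the
        // trampoline buffer:
        //   49 BB <FPtr:8>    movabsq $FPtr, %r11
        //   49 BA <Nest:8>    movabsq $Nest, %r10
        //   49 FF E3          jmpq    *%r11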
28933
28934 // Load the pointer to the nested function into R11.
28935 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28936 SDValue Addr = Trmp;
28937 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28938 Addr, MachinePointerInfo(TrmpAddr));
28939
28940 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28941 DAG.getConstant(2, dl, MVT::i64));
28942 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28943 MachinePointerInfo(TrmpAddr, 2), Align(2));
28944
28945 // Load the 'nest' parameter value into R10.
28946 // R10 is specified in X86CallingConv.td
28947 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28948 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28949 DAG.getConstant(10, dl, MVT::i64));
28950 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28951 Addr, MachinePointerInfo(TrmpAddr, 10));
28952
28953 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28954 DAG.getConstant(12, dl, MVT::i64));
28955 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28956 MachinePointerInfo(TrmpAddr, 12), Align(2));
28957
28958 // Jump to the nested function.
28959 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28960 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28961 DAG.getConstant(20, dl, MVT::i64));
28962 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28963 Addr, MachinePointerInfo(TrmpAddr, 20));
28964
28965 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28966 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28967 DAG.getConstant(22, dl, MVT::i64));
28968 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28969 Addr, MachinePointerInfo(TrmpAddr, 22));
28970
28971 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28972 } else {
28973 const Function *Func =
28974 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28975 CallingConv::ID CC = Func->getCallingConv();
28976 unsigned NestReg;
28977
28978 switch (CC) {
28979 default:
28980       llvm_unreachable("Unsupported calling convention");
28981 case CallingConv::C:
28982 case CallingConv::X86_StdCall: {
28983 // Pass 'nest' parameter in ECX.
28984 // Must be kept in sync with X86CallingConv.td
28985 NestReg = X86::ECX;
28986
28987 // Check that ECX wasn't needed by an 'inreg' parameter.
28988 FunctionType *FTy = Func->getFunctionType();
28989 const AttributeList &Attrs = Func->getAttributes();
28990
28991 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28992 unsigned InRegCount = 0;
28993 unsigned Idx = 0;
28994
28995 for (FunctionType::param_iterator I = FTy->param_begin(),
28996 E = FTy->param_end(); I != E; ++I, ++Idx)
28997 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28998 const DataLayout &DL = DAG.getDataLayout();
28999 // FIXME: should only count parameters that are lowered to integers.
29000 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29001 }
29002
29003 if (InRegCount > 2) {
29004 report_fatal_error("Nest register in use - reduce number of inreg"
29005 " parameters!");
29006 }
29007 }
29008 break;
29009 }
29010 case CallingConv::X86_FastCall:
29011 case CallingConv::X86_ThisCall:
29012 case CallingConv::Fast:
29013 case CallingConv::Tail:
29014 case CallingConv::SwiftTail:
29015 // Pass 'nest' parameter in EAX.
29016 // Must be kept in sync with X86CallingConv.td
29017 NestReg = X86::EAX;
29018 break;
29019 }
29020
29021 SDValue OutChains[4];
29022 SDValue Addr, Disp;
29023
29024 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29025 DAG.getConstant(10, dl, MVT::i32));
29026 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29027
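        // Roughly, the stores below assemble this 10-byte stub:
        //   B8+r <Nest:4>     movl $Nest, %ecx (or %eax)
        //   E9   <Disp:4>     jmp  FPtr        ; Disp is relative to Trmp + 10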
29028 // This is storing the opcode for MOV32ri.
29029 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29030 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29031 OutChains[0] =
29032 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29033 Trmp, MachinePointerInfo(TrmpAddr));
29034
29035 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29036 DAG.getConstant(1, dl, MVT::i32));
29037 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29038 MachinePointerInfo(TrmpAddr, 1), Align(1));
29039
29040 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29041 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29042 DAG.getConstant(5, dl, MVT::i32));
29043 OutChains[2] =
29044 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29045 MachinePointerInfo(TrmpAddr, 5), Align(1));
29046
29047 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29048 DAG.getConstant(6, dl, MVT::i32));
29049 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29050 MachinePointerInfo(TrmpAddr, 6), Align(1));
29051
29052 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29053 }
29054}
29055
29056SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29057 SelectionDAG &DAG) const {
29058 /*
29059     The rounding mode is in bits 11:10 of the FP Control Word (FPCW), and has the following
29060 settings:
29061 00 Round to nearest
29062 01 Round to -inf
29063 10 Round to +inf
29064 11 Round to 0
29065
29066 GET_ROUNDING, on the other hand, expects the following:
29067 -1 Undefined
29068 0 Round to 0
29069 1 Round to nearest
29070 2 Round to +inf
29071 3 Round to -inf
29072
29073     To perform the conversion, we use a packed lookup table of the four 2-bit
29074     values that we can index by FPCW[11:10]:
29075     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
29076
29077     (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
29078 */
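  // For example, FPCW[11:10] == 01 (round toward -inf) gives
  // (FPCW & 0xc00) >> 9 == 2, and (0x2d >> 2) & 3 == 3, which is the
  // GET_ROUNDING encoding for "round to -inf".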
29079
29080 MachineFunction &MF = DAG.getMachineFunction();
29081 MVT VT = Op.getSimpleValueType();
29082 SDLoc DL(Op);
29083
29084 // Save FP Control Word to stack slot
29085 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29086 SDValue StackSlot =
29087 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29088
29089 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29090
29091 SDValue Chain = Op.getOperand(0);
29092 SDValue Ops[] = {Chain, StackSlot};
29093 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29094 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29095 Align(2), MachineMemOperand::MOStore);
29096
29097 // Load FP Control Word from stack slot
29098 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29099 Chain = CWD.getValue(1);
29100
29101 // Mask and turn the control bits into a shift for the lookup table.
29102 SDValue Shift =
29103 DAG.getNode(ISD::SRL, DL, MVT::i16,
29104 DAG.getNode(ISD::AND, DL, MVT::i16,
29105 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29106 DAG.getConstant(9, DL, MVT::i8));
29107 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29108
29109 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29110 SDValue RetVal =
29111 DAG.getNode(ISD::AND, DL, MVT::i32,
29112 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29113 DAG.getConstant(3, DL, MVT::i32));
29114
29115 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29116
29117 return DAG.getMergeValues({RetVal, Chain}, DL);
29118}
29119
29120SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29121 SelectionDAG &DAG) const {
29122 MachineFunction &MF = DAG.getMachineFunction();
29123 SDLoc DL(Op);
29124 SDValue Chain = Op.getNode()->getOperand(0);
29125
29126 // FP control word may be set only from data in memory. So we need to allocate
29127 // stack space to save/load FP control word.
29128 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29129 SDValue StackSlot =
29130 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29131 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29132 MachineMemOperand *MMO =
29133 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29134
29135 // Store FP control word into memory.
29136 SDValue Ops[] = {Chain, StackSlot};
29137 Chain = DAG.getMemIntrinsicNode(
29138 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29139
29140 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29141 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29142 Chain = CWD.getValue(1);
29143 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29144 DAG.getConstant(0xf3ff, DL, MVT::i16));
29145
29146 // Calculate new rounding mode.
29147 SDValue NewRM = Op.getNode()->getOperand(1);
29148 SDValue RMBits;
29149 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29150 uint64_t RM = CVal->getZExtValue();
29151 int FieldVal;
29152 switch (static_cast<RoundingMode>(RM)) {
29153 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29154 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29155 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29156 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29157 default:
29158       llvm_unreachable("rounding mode is not supported by X86 hardware");
29159 }
29160 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29161 } else {
29162 // Need to convert argument into bits of control word:
29163 // 0 Round to 0 -> 11
29164 // 1 Round to nearest -> 00
29165 // 2 Round to +inf -> 10
29166 // 3 Round to -inf -> 01
29167 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
29168 // To make the conversion, put all these values into a value 0xc9 and shift
29169 // it left depending on the rounding mode:
29170 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29171 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29172 // ...
29173 // (0xc9 << (2 * NewRM + 4)) & 0xc00
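        //
        // For example, NewRM == 2 (round to +inf) gives a shift of 8, and
        // (0xc9 << 8) & 0xc00 == 0x800, i.e. RC bits 11:10 == 0b10 (upward).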
29174 SDValue ShiftValue =
29175 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29176 DAG.getNode(ISD::ADD, DL, MVT::i32,
29177 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29178 DAG.getConstant(1, DL, MVT::i8)),
29179 DAG.getConstant(4, DL, MVT::i32)));
29180 SDValue Shifted =
29181 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29182 ShiftValue);
29183 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29184 DAG.getConstant(0xc00, DL, MVT::i16));
29185 }
29186
29187 // Update rounding mode bits and store the new FP Control Word into stack.
29188 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29189 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29190
29191 // Load FP control word from the slot.
29192 SDValue OpsLD[] = {Chain, StackSlot};
29193 MachineMemOperand *MMOL =
29194 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29195 Chain = DAG.getMemIntrinsicNode(
29196 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29197
29198 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29199 // same way but in bits 14:13.
29200 if (Subtarget.hasSSE1()) {
29201 // Store MXCSR into memory.
29202 Chain = DAG.getNode(
29203 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29204 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29205 StackSlot);
29206
29207 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29208 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29209 Chain = CWD.getValue(1);
29210 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29211 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29212
29213 // Shift X87 RM bits from 11:10 to 14:13.
29214 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29215 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29216 DAG.getConstant(3, DL, MVT::i8));
29217
29218 // Update rounding mode bits and store the new FP Control Word into stack.
29219 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29220 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29221
29222 // Load MXCSR from the slot.
29223 Chain = DAG.getNode(
29224 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29225 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29226 StackSlot);
29227 }
29228
29229 return Chain;
29230}
29231
29232/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
29233//
29234// i8/i16 vector implemented using dword LZCNT vector instruction
29235// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29236// split the vector, perform operation on it's Lo a Hi part and
29237// concatenate the results.
29238static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29239 const X86Subtarget &Subtarget) {
29240   assert(Op.getOpcode() == ISD::CTLZ);
29241 SDLoc dl(Op);
29242 MVT VT = Op.getSimpleValueType();
29243 MVT EltVT = VT.getVectorElementType();
29244 unsigned NumElems = VT.getVectorNumElements();
29245
29246   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29247          "Unsupported element type");
29248
29249   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29250 if (NumElems > 16 ||
29251 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29252 return splitVectorIntUnary(Op, DAG);
29253
29254 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29255   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29256          "Unsupported value type for operation");
29257
29258   // Use the natively supported vector instruction vplzcntd.
29259 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29260 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29261 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29262 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
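  // For example, for a vXi8 element of 0x10: ctlz of the zext'd i32 is 27 and
  // 27 - (32 - 8) == 3 == ctlz8(0x10).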
29263
29264 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29265}
29266
29267// Lower CTLZ using a PSHUFB lookup table implementation.
29268static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29269 const X86Subtarget &Subtarget,
29270 SelectionDAG &DAG) {
29271 MVT VT = Op.getSimpleValueType();
29272 int NumElts = VT.getVectorNumElements();
29273 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29274 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29275
29276 // Per-nibble leading zero PSHUFB lookup table.
29277 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29278 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29279 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29280 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
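  // For example, the byte 0x1A has hi nibble 0x1 (LUT -> 3) and lo nibble 0xA
  // (LUT -> 0); the hi nibble is non-zero, so only the hi result is kept and
  // ctlz8(0x1A) == 3.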
29281
29282 SmallVector<SDValue, 64> LUTVec;
29283 for (int i = 0; i < NumBytes; ++i)
29284 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29285 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29286
29287   // Begin by bitcasting the input to a byte vector, then split those bytes
29288   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29289 // If the hi input nibble is zero then we add both results together, otherwise
29290 // we just take the hi result (by masking the lo result to zero before the
29291 // add).
29292 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29293 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29294
29295 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29296 SDValue Lo = Op0;
29297 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29298 SDValue HiZ;
29299 if (CurrVT.is512BitVector()) {
29300 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29301 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29302 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29303 } else {
29304 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29305 }
29306
29307 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29308 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29309 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29310 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29311
29312 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29313 // of the current vector width in the same way we did for the nibbles.
29314 // If the upper half of the input element is zero then add the halves'
29315 // leading zero counts together, otherwise just use the upper half's.
29316 // Double the width of the result until we are at target width.
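  // For example, when widening v16i8 counts to v8i16, an i16 element 0x001A
  // has per-byte counts of 8 (high byte 0x00) and 3 (low byte 0x1A); the high
  // byte is zero, so 8 + 3 == 11 == ctlz16(0x001A) is produced.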
29317 while (CurrVT != VT) {
29318 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29319 int CurrNumElts = CurrVT.getVectorNumElements();
29320 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29321 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29322 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29323
29324 // Check if the upper half of the input element is zero.
29325 if (CurrVT.is512BitVector()) {
29326 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29327 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29328 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29329 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29330 } else {
29331 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29332 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29333 }
29334 HiZ = DAG.getBitcast(NextVT, HiZ);
29335
29336 // Move the upper/lower halves to the lower bits as we'll be extending to
29337 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29338 // together.
29339 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29340 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29341 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29342 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29343 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29344 CurrVT = NextVT;
29345 }
29346
29347 return Res;
29348}
29349
29350static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29351 const X86Subtarget &Subtarget,
29352 SelectionDAG &DAG) {
29353 MVT VT = Op.getSimpleValueType();
29354
29355 if (Subtarget.hasCDI() &&
29356 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29357 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29358 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29359
29360 // Decompose 256-bit ops into smaller 128-bit ops.
29361 if (VT.is256BitVector() && !Subtarget.hasInt256())
29362 return splitVectorIntUnary(Op, DAG);
29363
29364 // Decompose 512-bit ops into smaller 256-bit ops.
29365 if (VT.is512BitVector() && !Subtarget.hasBWI())
29366 return splitVectorIntUnary(Op, DAG);
29367
29368 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29369 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29370}
29371
29372static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29373 SelectionDAG &DAG) {
29374 MVT VT = Op.getSimpleValueType();
29375 MVT OpVT = VT;
29376 unsigned NumBits = VT.getSizeInBits();
29377 SDLoc dl(Op);
29378 unsigned Opc = Op.getOpcode();
29379
29380 if (VT.isVector())
29381 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29382
29383 Op = Op.getOperand(0);
29384 if (VT == MVT::i8) {
29385 // Zero extend to i32 since there is no i8 bsr.
29386 OpVT = MVT::i32;
29387 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29388 }
29389
29390 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29391 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29392 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29393
29394 if (Opc == ISD::CTLZ) {
29395 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29396 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29397 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29398 Op.getValue(1)};
29399 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29400 }
29401
29402 // Finally xor with NumBits-1.
29403 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29404 DAG.getConstant(NumBits - 1, dl, OpVT));
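// BSR returns the index of the highest set bit, so for a non-zero i32 input
// ctlz(x) == 31 - bsr(x) == bsr(x) ^ 31 (e.g. x = 0x00010000: bsr = 16,
// 16 ^ 31 = 15). The CMOV above substitutes 2*NumBits-1 for the zero-input
// case so that the final xor yields NumBits (e.g. 63 ^ 31 = 32 for i32).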
29405
29406 if (VT == MVT::i8)
29407 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29408 return Op;
29409}
29410
29411static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29412 SelectionDAG &DAG) {
29413 MVT VT = Op.getSimpleValueType();
29414 unsigned NumBits = VT.getScalarSizeInBits();
29415 SDValue N0 = Op.getOperand(0);
29416 SDLoc dl(Op);
29417
29418 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29419 "Only scalar CTTZ requires custom lowering");
29420
29421 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29422 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29423 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
29424
29425 // If src is known never zero we can skip the CMOV.
29426 if (DAG.isKnownNeverZero(N0))
29427 return Op;
29428
29429 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29430 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29431 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29432 Op.getValue(1)};
29433 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29434}
29435
29436static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29437 const X86Subtarget &Subtarget) {
29438 MVT VT = Op.getSimpleValueType();
29439 if (VT == MVT::i16 || VT == MVT::i32)
29440 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
29441
29442 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29443 return splitVectorIntBinary(Op, DAG);
29444
29445 assert(Op.getSimpleValueType().is256BitVector() &&
29446 Op.getSimpleValueType().isInteger() &&
29447 "Only handle AVX 256-bit vector integer operation");
29448 return splitVectorIntBinary(Op, DAG);
29449}
29450
29451static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29452 const X86Subtarget &Subtarget) {
29453 MVT VT = Op.getSimpleValueType();
29454 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29455 unsigned Opcode = Op.getOpcode();
29456 SDLoc DL(Op);
29457
29458 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29459 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29460 assert(Op.getSimpleValueType().isInteger() &&
29461 "Only handle AVX vector integer operation");
29462 return splitVectorIntBinary(Op, DAG);
29463 }
29464
29465 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29467 EVT SetCCResultType =
29468 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29469
29470 unsigned BitWidth = VT.getScalarSizeInBits();
29471 if (Opcode == ISD::USUBSAT) {
29472 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29473 // Handle a special-case with a bit-hack instead of cmp+select:
29474 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29475 // If the target can use VPTERNLOG, DAGToDAG will match this as
29476 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29477 // "broadcast" constant load.
29478 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29479 if (C && C->getAPIntValue().isSignMask()) {
29480 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29481 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29482 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29483 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29484 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29485 }
29486 }
29487 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29488 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29489 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29490 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29491 // TODO: Move this to DAGCombiner?
29492 if (SetCCResultType == VT &&
29493 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29494 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29495 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29496 }
29497 }
29498
29499 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29500 (!VT.isVector() || VT == MVT::v2i64)) {
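// Compute the result together with the overflow flag, then pick the
// saturation constant from the sign of the wrapped value: signed overflow
// flips the sign, so a negative wrapped sum means the true result exceeded
// SatMax (e.g. i8: 100 + 100 wraps to -56, which selects 127).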
29501 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29502 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29503 SDValue Zero = DAG.getConstant(0, DL, VT);
29504 SDValue Result =
29505 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29506 DAG.getVTList(VT, SetCCResultType), X, Y);
29507 SDValue SumDiff = Result.getValue(0);
29508 SDValue Overflow = Result.getValue(1);
29509 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29510 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29511 SDValue SumNeg =
29512 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29513 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29514 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29515 }
29516
29517 // Use default expansion.
29518 return SDValue();
29519}
29520
29521static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29522 SelectionDAG &DAG) {
29523 MVT VT = Op.getSimpleValueType();
29524 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29525 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29526 // 8-bit integer abs to NEG and CMOV.
29527 SDLoc DL(Op);
29528 SDValue N0 = Op.getOperand(0);
29529 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29530 DAG.getConstant(0, DL, VT), N0);
29531 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29532 SDValue(Neg.getNode(), 1)};
29533 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29534 }
29535
29536 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29537 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29538 SDLoc DL(Op);
29539 SDValue Src = Op.getOperand(0);
29540 SDValue Sub =
29541 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
29542 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
29543 }
29544
29545 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29546 assert(VT.isInteger() &&
29547 "Only handle AVX 256-bit vector integer operation");
29548 return splitVectorIntUnary(Op, DAG);
29549 }
29550
29551 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29552 return splitVectorIntUnary(Op, DAG);
29553
29554 // Default to expand.
29555 return SDValue();
29556}
29557
29558static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29559 SelectionDAG &DAG) {
29560 MVT VT = Op.getSimpleValueType();
29561
29562 // For AVX1 cases, split to use legal ops.
29563 if (VT.is256BitVector() && !Subtarget.hasInt256())
29564 return splitVectorIntBinary(Op, DAG);
29565
29566 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29567 return splitVectorIntBinary(Op, DAG);
29568
29569 // Default to expand.
29570 return SDValue();
29571}
29572
29573static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29574 SelectionDAG &DAG) {
29575 MVT VT = Op.getSimpleValueType();
29576
29577 // For AVX1 cases, split to use legal ops.
29578 if (VT.is256BitVector() && !Subtarget.hasInt256())
29579 return splitVectorIntBinary(Op, DAG);
29580
29581 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29582 return splitVectorIntBinary(Op, DAG);
29583
29584 // Default to expand.
29585 return SDValue();
29586}
29587
29588static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29589 SelectionDAG &DAG) {
29590 SDLoc dl(Op);
29591 MVT VT = Op.getSimpleValueType();
29592
29593 // Decompose 256-bit ops into 128-bit ops.
29594 if (VT.is256BitVector() && !Subtarget.hasInt256())
29595 return splitVectorIntBinary(Op, DAG);
29596
29597 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29598 return splitVectorIntBinary(Op, DAG);
29599
29600 SDValue A = Op.getOperand(0);
29601 SDValue B = Op.getOperand(1);
29602
29603 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29604 // vector pairs, multiply and truncate.
29605 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29606 unsigned NumElts = VT.getVectorNumElements();
29607
29608 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29609 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29610 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29611 return DAG.getNode(
29612 ISD::TRUNCATE, dl, VT,
29613 DAG.getNode(ISD::MUL, dl, ExVT,
29614 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29615 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29616 }
29617
29618 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29619
29620 // Extract the lo/hi parts to any extend to i16.
29621 // We're going to mask off the low byte of each result element of the
29622 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29623 // element.
29624 SDValue Undef = DAG.getUNDEF(VT);
29625 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29626 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29627
29628 SDValue BLo, BHi;
29629 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29630 // If the RHS is a constant, manually unpackl/unpackh.
29631 SmallVector<SDValue, 16> LoOps, HiOps;
29632 for (unsigned i = 0; i != NumElts; i += 16) {
29633 for (unsigned j = 0; j != 8; ++j) {
29634 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29635 MVT::i16));
29636 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29637 MVT::i16));
29638 }
29639 }
29640
29641 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29642 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29643 } else {
29644 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29645 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29646 }
29647
29648 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29649 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29650 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29651 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29652 }
29653
29654 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29655 if (VT == MVT::v4i32) {
29656 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29657 "Should not custom lower when pmulld is available!");
29658
29659 // Extract the odd parts.
29660 static const int UnpackMask[] = { 1, -1, 3, -1 };
29661 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29662 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29663
29664 // Multiply the even parts.
29665 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29666 DAG.getBitcast(MVT::v2i64, A),
29667 DAG.getBitcast(MVT::v2i64, B));
29668 // Now multiply odd parts.
29669 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29670 DAG.getBitcast(MVT::v2i64, Aodds),
29671 DAG.getBitcast(MVT::v2i64, Bodds));
29672
29673 Evens = DAG.getBitcast(VT, Evens);
29674 Odds = DAG.getBitcast(VT, Odds);
29675
29676 // Merge the two vectors back together with a shuffle. This expands into 2
29677 // shuffles.
29678 static const int ShufMask[] = { 0, 4, 2, 6 };
29679 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29680 }
29681
29682 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29683 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29684 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29685
29686 // Ahi = psrlqi(a, 32);
29687 // Bhi = psrlqi(b, 32);
29688 //
29689 // AloBlo = pmuludq(a, b);
29690 // AloBhi = pmuludq(a, Bhi);
29691 // AhiBlo = pmuludq(Ahi, b);
29692 //
29693 // Hi = psllqi(AloBhi + AhiBlo, 32);
29694 // return AloBlo + Hi;
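// This follows from a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo:
// a*b = 2^64*Ahi*Bhi + 2^32*(Alo*Bhi + Ahi*Blo) + Alo*Blo, and the first
// term vanishes modulo 2^64, so three 32x32->64 pmuludq products suffice.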
29695 KnownBits AKnown = DAG.computeKnownBits(A);
29696 KnownBits BKnown = DAG.computeKnownBits(B);
29697
29698 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29699 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29700 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29701
29702 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29703 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29704 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29705
29706 SDValue Zero = DAG.getConstant(0, dl, VT);
29707
29708 // Only multiply lo/hi halves that aren't known to be zero.
29709 SDValue AloBlo = Zero;
29710 if (!ALoIsZero && !BLoIsZero)
29711 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29712
29713 SDValue AloBhi = Zero;
29714 if (!ALoIsZero && !BHiIsZero) {
29715 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29716 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29717 }
29718
29719 SDValue AhiBlo = Zero;
29720 if (!AHiIsZero && !BLoIsZero) {
29721 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29722 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29723 }
29724
29725 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29726 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29727
29728 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29729}
29730
29731static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29732 MVT VT, bool IsSigned,
29733 const X86Subtarget &Subtarget,
29734 SelectionDAG &DAG,
29735 SDValue *Low = nullptr) {
29736 unsigned NumElts = VT.getVectorNumElements();
29737
29738 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29739 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29740 // lane results back together.
29741
29742 // We'll take different approaches for signed and unsigned.
29743 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29744 // and use pmullw to calculate the full 16-bit product.
29745 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29746 // shift them left into the upper byte of each word. This allows us to use
29747 // pmulhw to calculate the full 16-bit product. This trick means we don't
29748 // need to sign extend the bytes to use pmullw.
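// The signed trick works because (a << 8) * (b << 8) == (a * b) << 16, so the
// high 16 bits returned by pmulhw are exactly the signed product a*b
// (e.g. a = -3, b = 5: 0xFD00 * 0x0500 = 0xFFF10000, high half 0xFFF1 == -15).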
29749
29750 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29751 SDValue Zero = DAG.getConstant(0, dl, VT);
29752
29753 SDValue ALo, AHi;
29754 if (IsSigned) {
29755 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29756 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29757 } else {
29758 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29759 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29760 }
29761
29762 SDValue BLo, BHi;
29763 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29764 // If the RHS is a constant, manually unpackl/unpackh and extend.
29765 SmallVector<SDValue, 16> LoOps, HiOps;
29766 for (unsigned i = 0; i != NumElts; i += 16) {
29767 for (unsigned j = 0; j != 8; ++j) {
29768 SDValue LoOp = B.getOperand(i + j);
29769 SDValue HiOp = B.getOperand(i + j + 8);
29770
29771 if (IsSigned) {
29772 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29773 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29774 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29775 DAG.getConstant(8, dl, MVT::i16));
29776 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29777 DAG.getConstant(8, dl, MVT::i16));
29778 } else {
29779 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29780 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29781 }
29782
29783 LoOps.push_back(LoOp);
29784 HiOps.push_back(HiOp);
29785 }
29786 }
29787
29788 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29789 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29790 } else if (IsSigned) {
29791 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29792 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29793 } else {
29794 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29795 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29796 }
29797
29798 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29799 // pack back to vXi8.
29800 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29801 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29802 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29803
29804 if (Low)
29805 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29806
29807 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29808}
29809
29810static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29811 SelectionDAG &DAG) {
29812 SDLoc dl(Op);
29813 MVT VT = Op.getSimpleValueType();
29814 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29815 unsigned NumElts = VT.getVectorNumElements();
29816 SDValue A = Op.getOperand(0);
29817 SDValue B = Op.getOperand(1);
29818
29819 // Decompose 256-bit ops into 128-bit ops.
29820 if (VT.is256BitVector() && !Subtarget.hasInt256())
29821 return splitVectorIntBinary(Op, DAG);
29822
29823 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29824 return splitVectorIntBinary(Op, DAG);
29825
29826 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29827 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29828 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29829 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29830
29831 // PMULxD operations multiply each even value (starting at 0) of LHS with
29832 // the related value of RHS and produce a widened result.
29833 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29834 // => <2 x i64> <ae|cg>
29835 //
29836 // In other words, to have all the results, we need to perform two PMULxD:
29837 // 1. one with the even values.
29838 // 2. one with the odd values.
29839 // To achieve #2, we need to place the odd values at an even position.
29840 //
29841 // Place the odd value at an even position (basically, shift all values 1
29842 // step to the left):
29843 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29844 9, -1, 11, -1, 13, -1, 15, -1};
29845 // <a|b|c|d> => <b|undef|d|undef>
29846 SDValue Odd0 =
29847 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29848 // <e|f|g|h> => <f|undef|h|undef>
29849 SDValue Odd1 =
29850 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29851
29852 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29853 // ints.
29854 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29855 unsigned Opcode =
29856 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29857 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29858 // => <2 x i64> <ae|cg>
29859 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29860 DAG.getBitcast(MulVT, A),
29861 DAG.getBitcast(MulVT, B)));
29862 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29863 // => <2 x i64> <bf|dh>
29864 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29865 DAG.getBitcast(MulVT, Odd0),
29866 DAG.getBitcast(MulVT, Odd1)));
29867
29868 // Shuffle it back into the right order.
29869 SmallVector<int, 16> ShufMask(NumElts);
29870 for (int i = 0; i != (int)NumElts; ++i)
29871 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
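// E.g. for v4i32 (NumElts == 4) this builds the mask {1, 5, 3, 7}, selecting
// the high 32-bit halves of <ae|cg> and <bf|dh> to give <ae_hi|bf_hi|cg_hi|dh_hi>.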
29872
29873 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29874
29875 // If we have a signed multiply but no PMULDQ fix up the result of an
29876 // unsigned multiply.
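// Using the identity mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
// subtract B where A is negative and A where B is negative.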
29877 if (IsSigned && !Subtarget.hasSSE41()) {
29878 SDValue Zero = DAG.getConstant(0, dl, VT);
29879 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29880 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29881 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29882 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29883
29884 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29885 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29886 }
29887
29888 return Res;
29889 }
29890
29891 // Only i8 vectors should need custom lowering after this.
29892 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29893 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29894 "Unsupported vector type");
29895
29896 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29897 // logical shift down the upper half and pack back to i8.
29898
29899 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29900 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29901
29902 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29903 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29904 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29905 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29906 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29907 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29908 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29909 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29910 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29911 }
29912
29913 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29914}
29915
29916// Custom lowering for SMULO/UMULO.
29917static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29918 SelectionDAG &DAG) {
29919 MVT VT = Op.getSimpleValueType();
29920
29921 // Scalars defer to LowerXALUO.
29922 if (!VT.isVector())
29923 return LowerXALUO(Op, DAG);
29924
29925 SDLoc dl(Op);
29926 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29927 SDValue A = Op.getOperand(0);
29928 SDValue B = Op.getOperand(1);
29929 EVT OvfVT = Op->getValueType(1);
29930
29931 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29932 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29933 // Extract the LHS Lo/Hi vectors
29934 SDValue LHSLo, LHSHi;
29935 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29936
29937 // Extract the RHS Lo/Hi vectors
29938 SDValue RHSLo, RHSHi;
29939 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29940
29941 EVT LoOvfVT, HiOvfVT;
29942 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29943 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29944 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29945
29946 // Issue the split operations.
29947 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29948 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29949
29950 // Join the separate data results and the overflow results.
29951 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29952 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29953 Hi.getValue(1));
29954
29955 return DAG.getMergeValues({Res, Ovf}, dl);
29956 }
29957
29958 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29959 EVT SetccVT =
29960 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29961
29962 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29963 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29964 unsigned NumElts = VT.getVectorNumElements();
29965 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29966 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29967 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29968 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29969 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29970
29971 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29972
29973 SDValue Ovf;
29974 if (IsSigned) {
29975 SDValue High, LowSign;
29976 if (OvfVT.getVectorElementType() == MVT::i1 &&
29977 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29978 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29979 // Shift the high down filling with sign bits.
29980 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29981 // Fill all 16 bits with the sign bit from the low.
29982 LowSign =
29983 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29984 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29985 15, DAG);
29986 SetccVT = OvfVT;
29987 if (!Subtarget.hasBWI()) {
29988 // We can't do a vXi16 compare so sign extend to v16i32.
29989 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29990 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29991 }
29992 } else {
29993 // Otherwise do the compare at vXi8.
29994 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29995 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29996 LowSign =
29997 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29998 }
29999
30000 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30001 } else {
30002 SDValue High =
30003 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30004 if (OvfVT.getVectorElementType() == MVT::i1 &&
30005 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30006 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30007 SetccVT = OvfVT;
30008 if (!Subtarget.hasBWI()) {
30009 // We can't do a vXi16 compare so sign extend to v16i32.
30010 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30011 }
30012 } else {
30013 // Otherwise do the compare at vXi8.
30014 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30015 }
30016
30017 Ovf =
30018 DAG.getSetCC(dl, SetccVT, High,
30019 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30020 }
30021
30022 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30023
30024 return DAG.getMergeValues({Low, Ovf}, dl);
30025 }
30026
30027 SDValue Low;
30028 SDValue High =
30029 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30030
30031 SDValue Ovf;
30032 if (IsSigned) {
30033 // SMULO overflows if the high bits don't match the sign of the low.
30034 SDValue LowSign =
30035 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30036 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30037 } else {
30038 // UMULO overflows if the high bits are non-zero.
30039 Ovf =
30040 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30041 }
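// E.g. for i8, 16 * 16 = 256: the low byte is 0x00 and the high byte is 0x01,
// so the signed check sees 0x00 != 0x01 and the unsigned check sees a non-zero
// high byte; both correctly report overflow. For 5 * -3 = 0xFFF1 the high byte
// 0xFF matches the sign-splat of the low byte 0xF1, so no signed overflow.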
30042
30043 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30044
30045 return DAG.getMergeValues({Low, Ovf}, dl);
30046}
30047
30048SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30049 assert(Subtarget.isTargetWin64() && "Unexpected target");
30050 EVT VT = Op.getValueType();
30051 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30052 "Unexpected return type for lowering");
30053
30054 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30055 SmallVector<SDValue> Result;
30056 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30057 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30058 }
30059
30060 RTLIB::Libcall LC;
30061 bool isSigned;
30062 switch (Op->getOpcode()) {
30063 default: llvm_unreachable("Unexpected request for libcall!");
30064 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30065 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30066 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30067 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30068 }
30069
30070 SDLoc dl(Op);
30071 SDValue InChain = DAG.getEntryNode();
30072
30073 TargetLowering::ArgListTy Args;
30074 TargetLowering::ArgListEntry Entry;
30075 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30076 EVT ArgVT = Op->getOperand(i).getValueType();
30077 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30078 "Unexpected argument type for lowering");
30079 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30080 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30081 MachinePointerInfo MPI =
30082 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30083 Entry.Node = StackPtr;
30084 InChain =
30085 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30086 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30087 Entry.Ty = PointerType::get(ArgTy,0);
30088 Entry.IsSExt = false;
30089 Entry.IsZExt = false;
30090 Args.push_back(Entry);
30091 }
30092
30093 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30094 getPointerTy(DAG.getDataLayout()));
30095
30096 TargetLowering::CallLoweringInfo CLI(DAG);
30097 CLI.setDebugLoc(dl)
30098 .setChain(InChain)
30099 .setLibCallee(
30100 getLibcallCallingConv(LC),
30101 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30102 std::move(Args))
30103 .setInRegister()
30104 .setSExtResult(isSigned)
30105 .setZExtResult(!isSigned);
30106
30107 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30108 return DAG.getBitcast(VT, CallInfo.first);
30109}
30110
30111SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30112 SelectionDAG &DAG,
30113 SDValue &Chain) const {
30114 assert(Subtarget.isTargetWin64() && "Unexpected target");
30115 EVT VT = Op.getValueType();
30116 bool IsStrict = Op->isStrictFPOpcode();
30117
30118 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30119 EVT ArgVT = Arg.getValueType();
30120
30121 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30122 "Unexpected return type for lowering");
30123
30124 RTLIB::Libcall LC;
30125 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30126 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30127 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30128 else
30129 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30130 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30131
30132 SDLoc dl(Op);
30133 MakeLibCallOptions CallOptions;
30134 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30135
30136 SDValue Result;
30137 // Expect the i128 result to be returned as a v2i64 in xmm0; cast back to the
30138 // expected VT (i128).
30139 std::tie(Result, Chain) =
30140 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30141 Result = DAG.getBitcast(VT, Result);
30142 return Result;
30143}
30144
30145SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30146 SelectionDAG &DAG) const {
30147 assert(Subtarget.isTargetWin64() && "Unexpected target");
30148 EVT VT = Op.getValueType();
30149 bool IsStrict = Op->isStrictFPOpcode();
30150
30151 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30152 EVT ArgVT = Arg.getValueType();
30153
30154 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30155 "Unexpected argument type for lowering");
30156
30157 RTLIB::Libcall LC;
30158 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30159 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30160 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30161 else
30162 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30163 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30164
30165 SDLoc dl(Op);
30166 MakeLibCallOptions CallOptions;
30167 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30168
30169 // Pass the i128 argument as an indirect argument on the stack.
30170 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30171 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30172 MachinePointerInfo MPI =
30173 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30174 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30175
30176 SDValue Result;
30177 std::tie(Result, Chain) =
30178 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30179 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30180}
30181
30182// Return true if the required (according to Opcode) shift-imm form is natively
30183// supported by the Subtarget
30184static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30185 unsigned Opcode) {
30186 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30187 return false;
30188
30189 if (VT.getScalarSizeInBits() < 16)
30190 return false;
30191
30192 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30193 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30194 return true;
30195
30196 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30197 (VT.is256BitVector() && Subtarget.hasInt256());
30198
30199 bool AShift = LShift && (Subtarget.hasAVX512() ||
30200 (VT != MVT::v2i64 && VT != MVT::v4i64));
30201 return (Opcode == ISD::SRA) ? AShift : LShift;
30202}
30203
30204// The shift amount is a variable, but it is the same for all vector lanes.
30205// These instructions are defined together with shift-immediate.
30206static
30207bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30208 unsigned Opcode) {
30209 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30210}
30211
30212// Return true if the required (according to Opcode) variable-shift form is
30213// natively supported by the Subtarget
30214static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30215 unsigned Opcode) {
30216 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30217 return false;
30218
30219 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30220 return false;
30221
30222 // vXi16 supported only on AVX-512, BWI
30223 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30224 return false;
30225
30226 if (Subtarget.hasAVX512() &&
30227 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30228 return true;
30229
30230 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30231 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30232 return (Opcode == ISD::SRA) ? AShift : LShift;
30233}
30234
30235static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30236 const X86Subtarget &Subtarget) {
30237 MVT VT = Op.getSimpleValueType();
30238 SDLoc dl(Op);
30239 SDValue R = Op.getOperand(0);
30240 SDValue Amt = Op.getOperand(1);
30241 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30242
30243 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30244 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30245 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30246 SDValue Ex = DAG.getBitcast(ExVT, R);
30247
30248 // ashr(R, 63) === cmp_slt(R, 0)
30249 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30250 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30251 "Unsupported PCMPGT op");
30252 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30253 }
30254
30255 if (ShiftAmt >= 32) {
30256 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30257 SDValue Upper =
30258 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30259 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30260 ShiftAmt - 32, DAG);
30261 if (VT == MVT::v2i64)
30262 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30263 if (VT == MVT::v4i64)
30264 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30265 {9, 1, 11, 3, 13, 5, 15, 7});
30266 } else {
30267 // SRA upper i32, SRL whole i64 and select lower i32.
30268 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30269 ShiftAmt, DAG);
30270 SDValue Lower =
30271 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30272 Lower = DAG.getBitcast(ExVT, Lower);
30273 if (VT == MVT::v2i64)
30274 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30275 if (VT == MVT::v4i64)
30276 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30277 {8, 1, 10, 3, 12, 5, 14, 7});
30278 }
30279 return DAG.getBitcast(VT, Ex);
30280 };
30281
30282 // Optimize shl/srl/sra with constant shift amount.
30283 APInt APIntShiftAmt;
30284 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30285 return SDValue();
30286
30287 // If the shift amount is out of range, return undef.
30288 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30289 return DAG.getUNDEF(VT);
30290
30291 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30292
30293 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30294 // Hardware support for vector shifts is sparse, which makes us scalarize the
30295 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30296 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30297 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30298 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30299 // must be 0). (add undef, undef) however can be any value. To make this
30300 // safe, we must freeze R to ensure that register allocation uses the same
30301 // register for an undefined value. This ensures that the result will
30302 // still be even and preserves the original semantics.
30303 R = DAG.getFreeze(R);
30304 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30305 }
30306
30307 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30308 }
30309
30310 // i64 SRA needs to be performed as partial shifts.
30311 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30312 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30313 Op.getOpcode() == ISD::SRA)
30314 return ArithmeticShiftRight64(ShiftAmt);
30315
30316 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30317 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30318 unsigned NumElts = VT.getVectorNumElements();
30319 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30320
30321 // Simple i8 add case
30322 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30323 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30324 // must be 0). (add undef, undef) however can be any value. To make this
30325 // safe, we must freeze R to ensure that register allocation uses the same
30326 // register for an undefined value. This ensures that the result will
30327 // still be even and preserves the original semantics.
30328 R = DAG.getFreeze(R);
30329 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30330 }
30331
30332 // ashr(R, 7) === cmp_slt(R, 0)
30333 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30334 SDValue Zeros = DAG.getConstant(0, dl, VT);
30335 if (VT.is512BitVector()) {
30336 assert(VT == MVT::v64i8 && "Unexpected element type!");
30337 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30338 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30339 }
30340 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30341 }
30342
30343 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30344 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30345 return SDValue();
30346
30347 if (Op.getOpcode() == ISD::SHL) {
30348 // Make a large shift.
30349 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30350 ShiftAmt, DAG);
30351 SHL = DAG.getBitcast(VT, SHL);
30352 // Zero out the rightmost bits.
30353 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30354 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30355 }
30356 if (Op.getOpcode() == ISD::SRL) {
30357 // Make a large shift.
30358 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30359 ShiftAmt, DAG);
30360 SRL = DAG.getBitcast(VT, SRL);
30361 // Zero out the leftmost bits.
30362 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30363 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30364 }
30365 if (Op.getOpcode() == ISD::SRA) {
30366 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
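// With Mask == 0x80 >> Amt (the shifted-down sign bit), xor-then-sub
// sign-extends the logical-shift result: e.g. R = 0x80, Amt = 1 gives
// lshr = 0x40, 0x40 ^ 0x40 = 0, 0 - 0x40 = 0xC0 == ashr(0x80, 1).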
30367 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30368
30369 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30370 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30371 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30372 return Res;
30373 }
30374 llvm_unreachable("Unknown shift opcode.");
30375 }
30376
30377 return SDValue();
30378}
30379
30380static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30381 const X86Subtarget &Subtarget) {
30382 MVT VT = Op.getSimpleValueType();
30383 SDLoc dl(Op);
30384 SDValue R = Op.getOperand(0);
30385 SDValue Amt = Op.getOperand(1);
30386 unsigned Opcode = Op.getOpcode();
30387 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30388
30389 int BaseShAmtIdx = -1;
30390 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30391 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30392 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30393 Subtarget, DAG);
30394
30395 // vXi8 shifts - shift as v8i16 + mask result.
30396 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30397 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30398 VT == MVT::v64i8) &&
30399 !Subtarget.hasXOP()) {
30400 unsigned NumElts = VT.getVectorNumElements();
30401 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30402 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30403 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30404 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30405
30406 // Create the mask using vXi16 shifts. For shift-rights we need to move
30407 // the upper byte down before splatting the vXi8 mask.
30408 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
30409 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30410 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30411 if (Opcode != ISD::SHL)
30412 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30413 8, DAG);
30414 BitMask = DAG.getBitcast(VT, BitMask);
30415 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30416 SmallVector<int, 64>(NumElts, 0));
30417
30418 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30419 DAG.getBitcast(ExtVT, R), BaseShAmt,
30420 BaseShAmtIdx, Subtarget, DAG);
30421 Res = DAG.getBitcast(VT, Res);
30422 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30423
30424 if (Opcode == ISD::SRA) {
30425 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30426 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30427 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30428 SignMask =
30429 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30430 BaseShAmtIdx, Subtarget, DAG);
30431 SignMask = DAG.getBitcast(VT, SignMask);
30432 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30433 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30434 }
30435 return Res;
30436 }
30437 }
30438 }
30439
30440 return SDValue();
30441}
30442
30443// Convert a shift/rotate left amount to a multiplication scale factor.
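// (Shifting left by N is the same as multiplying by (1 << N); e.g. a per-lane
// amount of 3 becomes a scale of 8 because x << 3 == x * 8. The scale vector
// built below lets a single vector multiply stand in for the per-lane shifts.)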
30444static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30445 const X86Subtarget &Subtarget,
30446 SelectionDAG &DAG) {
30447 MVT VT = Amt.getSimpleValueType();
30448 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30449 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30450 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30451 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30452 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30453 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30454 return SDValue();
30455
30456 MVT SVT = VT.getVectorElementType();
30457 unsigned SVTBits = SVT.getSizeInBits();
30458 unsigned NumElems = VT.getVectorNumElements();
30459
30460 APInt UndefElts;
30461 SmallVector<APInt> EltBits;
30462 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30463 APInt One(SVTBits, 1);
30464 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30465 for (unsigned I = 0; I != NumElems; ++I) {
30466 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30467 continue;
30468 uint64_t ShAmt = EltBits[I].getZExtValue();
30469 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30470 }
30471 return DAG.getBuildVector(VT, dl, Elts);
30472 }
30473
30474 // If the target doesn't support variable shifts, use either FP conversion
30475 // or integer multiplication to avoid shifting each element individually.
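  // The v4i32 path below builds (1 << Amt) as a float: adding (Amt << 23) to
  // the bit pattern of 1.0f (0x3f800000) bumps the IEEE-754 exponent by Amt,
  // giving 2^Amt, which FP_TO_SINT converts back to an integer.
  // e.g. Amt = 3: (3 << 23) + 0x3f800000 = 0x41000000 = 8.0f -> 8 == 1 << 3.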
30476 if (VT == MVT::v4i32) {
30477 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30479 DAG.getConstant(0x3f800000U, dl, VT));
30480 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30481 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30482 }
30483
30484 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30485 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30486 SDValue Z = DAG.getConstant(0, dl, VT);
30487 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30488 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30489 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30490 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30491 if (Subtarget.hasSSE41())
30492 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30493 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30494 }
30495
30496 return SDValue();
30497}
30498
30499static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30500 SelectionDAG &DAG) {
30501 MVT VT = Op.getSimpleValueType();
30502 SDLoc dl(Op);
30503 SDValue R = Op.getOperand(0);
30504 SDValue Amt = Op.getOperand(1);
30505 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30506 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30507
30508 unsigned Opc = Op.getOpcode();
30509 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30510 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30511
30512 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30513 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30514
30515 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30516 return V;
30517
30518 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30519 return V;
30520
30521 if (supportedVectorVarShift(VT, Subtarget, Opc))
30522 return Op;
30523
30524 // i64 vector arithmetic shift can be emulated with the transform:
30525 // M = lshr(SIGN_MASK, Amt)
30526 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30527 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30528 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30529 Opc == ISD::SRA) {
30530 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30531 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30532 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30533 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30534 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30535 return R;
30536 }
30537
30538 // XOP has 128-bit variable logical/arithmetic shifts.
30539 // +ve/-ve Amt = shift left/right.
30540 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30541 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30542 if (Opc == ISD::SRL || Opc == ISD::SRA) {
30543 SDValue Zero = DAG.getConstant(0, dl, VT);
30544 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
30545 }
30546 if (Opc == ISD::SHL || Opc == ISD::SRL)
30547 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30548 if (Opc == ISD::SRA)
30549 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30550 }
30551
30552 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30553 // shifts per-lane and then shuffle the partial results back together.
30554 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30555 // Splat the shift amounts so the scalar shifts above will catch it.
30556 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30557 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30558 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30559 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30560 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30561 }
30562
30563 // If possible, lower this shift as a sequence of two shifts by
30564 // constant plus a BLENDing shuffle instead of scalarizing it.
30565 // Example:
30566 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30567 //
30568 // Could be rewritten as:
30569 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30570 //
30571 // The advantage is that the two shifts from the example would be
30572 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30573 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30574 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30575 SDValue Amt1, Amt2;
30576 unsigned NumElts = VT.getVectorNumElements();
30577 SmallVector<int, 8> ShuffleMask;
30578 for (unsigned i = 0; i != NumElts; ++i) {
30579 SDValue A = Amt->getOperand(i);
30580 if (A.isUndef()) {
30581 ShuffleMask.push_back(SM_SentinelUndef);
30582 continue;
30583 }
30584 if (!Amt1 || Amt1 == A) {
30585 ShuffleMask.push_back(i);
30586 Amt1 = A;
30587 continue;
30588 }
30589 if (!Amt2 || Amt2 == A) {
30590 ShuffleMask.push_back(i + NumElts);
30591 Amt2 = A;
30592 continue;
30593 }
30594 break;
30595 }
30596
30597 // Only perform this blend if we can perform it without loading a mask.
30598 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
30599 (VT != MVT::v16i16 ||
30600 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30601 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30602 canWidenShuffleElements(ShuffleMask))) {
30603 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
30604 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
30605 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
30606 Cst2->getAPIntValue().ult(EltSizeInBits)) {
30607 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
30608 Cst1->getZExtValue(), DAG);
30609 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
30610 Cst2->getZExtValue(), DAG);
30611 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30612 }
30613 }
30614 }
30615
30616 // If possible, lower this packed shift into a vector multiply instead of
30617 // expanding it into a sequence of scalar shifts.
30618 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30619 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30620 Subtarget.canExtendTo512BW())))
30621 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30622 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30623
30624 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30625 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
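  // i.e. for 0 < Amt < 16: lshr(x, Amt) == mulhu(x, 1 << (16 - Amt)).
  // e.g. x = 0xABCD, Amt = 4: 0xABCD * 0x1000 = 0x0ABCD000, whose high 16 bits
  // are 0x0ABC == 0xABCD >> 4. The Amt == 0 lanes are patched with the select
  // below, since a scale of 1 << 16 doesn't fit in an i16 element.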
30626 if (Opc == ISD::SRL && ConstantAmt &&
30627 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30628 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30629 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30630 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30631 SDValue Zero = DAG.getConstant(0, dl, VT);
30632 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30633 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30634 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30635 }
30636 }
30637
30638 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30639 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30640 // TODO: Special case handling for shift by 0/1, really we can afford either
30641 // of these cases in pre-SSE41/XOP/AVX512 but not both.
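  // (Amounts of 0 and 1 need the selects below: a scale of 1 << 16 doesn't fit
  // in an i16 lane, and a scale of 1 << 15 is treated as a negative multiplier
  // by MULHS, so those lanes fall back to R itself and to a plain VSRAI by 1.)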
30642 if (Opc == ISD::SRA && ConstantAmt &&
30643 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30644 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30645 !Subtarget.hasAVX512()) ||
30646 DAG.isKnownNeverZero(Amt))) {
30647 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30648 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30649 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30650 SDValue Amt0 =
30651 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30652 SDValue Amt1 =
30653 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30654 SDValue Sra1 =
30655 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30656 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30657 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30658 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30659 }
30660 }
30661
30662 // v4i32 Non Uniform Shifts.
30663 // If the shift amount is constant we can shift each lane using the SSE2
30664 // immediate shifts, else we need to zero-extend each lane to the lower i64
30665 // and shift using the SSE2 variable shifts.
30666 // The separate results can then be blended together.
30667 if (VT == MVT::v4i32) {
30668 SDValue Amt0, Amt1, Amt2, Amt3;
30669 if (ConstantAmt) {
30670 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30671 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30672 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30673 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30674 } else {
30675 // The SSE2 shifts use the lower i64 as the same shift amount for
30676 // all lanes and the upper i64 is ignored. On AVX we're better off
30677 // just zero-extending, but for SSE just duplicating the top 16-bits is
30678 // cheaper and has the same effect for out of range values.
30679 if (Subtarget.hasAVX()) {
30680 SDValue Z = DAG.getConstant(0, dl, VT);
30681 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30682 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30683 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30684 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30685 } else {
30686 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30687 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30688 {4, 5, 6, 7, -1, -1, -1, -1});
30689 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30690 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30691 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30692 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30693 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30694 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30695 }
30696 }
30697
30698 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30699 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30700 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30701 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30702 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30703
30704 // Merge the shifted lane results optimally with/without PBLENDW.
30705 // TODO - ideally shuffle combining would handle this.
30706 if (Subtarget.hasSSE41()) {
30707 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30708 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30709 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30710 }
30711 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30712 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30713 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30714 }
30715
30716 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30717 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30718 // make the existing SSE solution better.
30719 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30720 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30721 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30722 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30723 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30724 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30725     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30726            "Unexpected vector type");
30727 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30728 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
30729 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30730 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30731 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30732 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30733 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30734 }
30735
30736 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30737 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
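  // Per byte this computes (ext(x) * (1 << (8 - Amt))) >> 8, which equals
  // x >> Amt (logically or arithmetically, depending on zero- vs
  // sign-extension). e.g. srl: x = 0xF0, Amt = 4: 0xF0 * 0x10 = 0x0F00,
  // and 0x0F00 >> 8 == 0x0F == 0xF0 >> 4.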
30738 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30739 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30740 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30741 !Subtarget.hasXOP()) {
30742 int NumElts = VT.getVectorNumElements();
30743 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30744
30745 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30746 // isn't legal).
30747 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30748 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30749 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30750 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30751     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30752            "Constant build vector expected");
30753
30754 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30755 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
30756 : DAG.getZExtOrTrunc(R, dl, ExVT);
30757 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30758 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30759 return DAG.getZExtOrTrunc(R, dl, VT);
30760 }
30761
30762 SmallVector<SDValue, 16> LoAmt, HiAmt;
30763 for (int i = 0; i != NumElts; i += 16) {
30764 for (int j = 0; j != 8; ++j) {
30765 LoAmt.push_back(Amt.getOperand(i + j));
30766 HiAmt.push_back(Amt.getOperand(i + j + 8));
30767 }
30768 }
30769
30770 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30771 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30772 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30773
30774 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30775 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30776 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30777 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30778 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30779 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30780 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30781 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30782 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30783 }
30784
30785 if (VT == MVT::v16i8 ||
30786 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30787 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30788 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
30789
30790 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30791 if (VT.is512BitVector()) {
30792 // On AVX512BW targets we make use of the fact that VSELECT lowers
30793 // to a masked blend which selects bytes based just on the sign bit
30794 // extracted to a mask.
30795 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
30796 V0 = DAG.getBitcast(VT, V0);
30797 V1 = DAG.getBitcast(VT, V1);
30798 Sel = DAG.getBitcast(VT, Sel);
30799 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30800 ISD::SETGT);
30801 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30802 } else if (Subtarget.hasSSE41()) {
30803 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30804 // on the sign bit.
30805 V0 = DAG.getBitcast(VT, V0);
30806 V1 = DAG.getBitcast(VT, V1);
30807 Sel = DAG.getBitcast(VT, Sel);
30808 return DAG.getBitcast(SelVT,
30809 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30810 }
30811 // On pre-SSE41 targets we test for the sign bit by comparing to
30812 // zero - a negative value will set all bits of the lanes to true
30813 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30814 SDValue Z = DAG.getConstant(0, dl, SelVT);
30815 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
30816 return DAG.getSelect(dl, SelVT, C, V0, V1);
30817 };
30818
30819 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30820 // We can safely do this using i16 shifts as we're only interested in
30821 // the 3 lower bits of each byte.
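    // The blend ladder below applies the shift one amount bit at a time:
    // after a << 5, bit 2 of the amount sits in each byte's sign bit and
    // selects the shift-by-4 step; each a += a exposes the next lower bit for
    // the shift-by-2 and shift-by-1 steps. e.g. Amt = 5 (0b101) takes the
    // by-4 and by-1 steps and skips the by-2 step.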
30822 Amt = DAG.getBitcast(ExtVT, Amt);
30823 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
30824 Amt = DAG.getBitcast(VT, Amt);
30825
30826 if (Opc == ISD::SHL || Opc == ISD::SRL) {
30827 // r = VSELECT(r, shift(r, 4), a);
30828 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
30829 R = SignBitSelect(VT, Amt, M, R);
30830
30831 // a += a
30832 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30833
30834 // r = VSELECT(r, shift(r, 2), a);
30835 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
30836 R = SignBitSelect(VT, Amt, M, R);
30837
30838 // a += a
30839 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30840
30841 // return VSELECT(r, shift(r, 1), a);
30842 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
30843 R = SignBitSelect(VT, Amt, M, R);
30844 return R;
30845 }
30846
30847 if (Opc == ISD::SRA) {
30848 // For SRA we need to unpack each byte to the higher byte of a i16 vector
30849 // so we can correctly sign extend. We don't care what happens to the
30850 // lower byte.
30851 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30852 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30853 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
30854 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
30855 ALo = DAG.getBitcast(ExtVT, ALo);
30856 AHi = DAG.getBitcast(ExtVT, AHi);
30857 RLo = DAG.getBitcast(ExtVT, RLo);
30858 RHi = DAG.getBitcast(ExtVT, RHi);
30859
30860 // r = VSELECT(r, shift(r, 4), a);
30861 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
30862 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
30863 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30864 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30865
30866 // a += a
30867 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30868 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30869
30870 // r = VSELECT(r, shift(r, 2), a);
30871 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
30872 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
30873 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30874 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30875
30876 // a += a
30877 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30878 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30879
30880 // r = VSELECT(r, shift(r, 1), a);
30881 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
30882 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
30883 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30884 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30885
30886 // Logical shift the result back to the lower byte, leaving a zero upper
30887 // byte meaning that we can safely pack with PACKUSWB.
30888 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
30889 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
30890 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
30891 }
30892 }
30893
30894 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
30895 MVT ExtVT = MVT::v8i32;
30896 SDValue Z = DAG.getConstant(0, dl, VT);
30897 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
30898 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
30899 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
30900 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
30901 ALo = DAG.getBitcast(ExtVT, ALo);
30902 AHi = DAG.getBitcast(ExtVT, AHi);
30903 RLo = DAG.getBitcast(ExtVT, RLo);
30904 RHi = DAG.getBitcast(ExtVT, RHi);
30905 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
30906 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
30907 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
30908 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
30909 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30910 }
30911
30912 if (VT == MVT::v8i16) {
30913 // If we have a constant shift amount, the non-SSE41 path is best as
30914     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
30915 bool UseSSE41 = Subtarget.hasSSE41() &&
30916 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30917
30918 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
30919 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
30920 // the sign bit.
30921 if (UseSSE41) {
30922 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
30923 V0 = DAG.getBitcast(ExtVT, V0);
30924 V1 = DAG.getBitcast(ExtVT, V1);
30925 Sel = DAG.getBitcast(ExtVT, Sel);
30926 return DAG.getBitcast(
30927 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
30928 }
30929 // On pre-SSE41 targets we splat the sign bit - a negative value will
30930 // set all bits of the lanes to true and VSELECT uses that in
30931 // its OR(AND(V0,C),AND(V1,~C)) lowering.
30932 SDValue C =
30933 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
30934 return DAG.getSelect(dl, VT, C, V0, V1);
30935 };
30936
30937 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
30938 if (UseSSE41) {
30939 // On SSE41 targets we need to replicate the shift mask in both
30940 // bytes for PBLENDVB.
30941 Amt = DAG.getNode(
30942 ISD::OR, dl, VT,
30943 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
30944 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
30945 } else {
30946 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
30947 }
30948
30949 // r = VSELECT(r, shift(r, 8), a);
30950 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
30951 R = SignBitSelect(Amt, M, R);
30952
30953 // a += a
30954 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30955
30956 // r = VSELECT(r, shift(r, 4), a);
30957 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
30958 R = SignBitSelect(Amt, M, R);
30959
30960 // a += a
30961 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30962
30963 // r = VSELECT(r, shift(r, 2), a);
30964 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
30965 R = SignBitSelect(Amt, M, R);
30966
30967 // a += a
30968 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30969
30970 // return VSELECT(r, shift(r, 1), a);
30971 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
30972 R = SignBitSelect(Amt, M, R);
30973 return R;
30974 }
30975
30976 // Decompose 256-bit shifts into 128-bit shifts.
30977 if (VT.is256BitVector())
30978 return splitVectorIntBinary(Op, DAG);
30979
30980 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30981 return splitVectorIntBinary(Op, DAG);
30982
30983 return SDValue();
30984}
30985
30986static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
30987 SelectionDAG &DAG) {
30988 MVT VT = Op.getSimpleValueType();
30989   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
30990          "Unexpected funnel shift opcode!");
30991
30992 SDLoc DL(Op);
30993 SDValue Op0 = Op.getOperand(0);
30994 SDValue Op1 = Op.getOperand(1);
30995 SDValue Amt = Op.getOperand(2);
30996 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30997 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30998
30999 if (VT.isVector()) {
31000 APInt APIntShiftAmt;
31001 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31002
31003 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31004 if (IsFSHR)
31005 std::swap(Op0, Op1);
31006
31007 if (IsCstSplat) {
31008 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31009 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31010 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31011 {Op0, Op1, Imm}, DAG, Subtarget);
31012 }
31013 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31014 {Op0, Op1, Amt}, DAG, Subtarget);
31015 }
31016     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31017             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31018             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31019            "Unexpected funnel shift type!");
31020
31021     // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31022     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
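    // i.e. each lane pair is concatenated as (x:y) at double width, shifted by
    // the masked amount, and the relevant half is kept. e.g. for i8 lanes with
    // x = 0xAB, y = 0xCD, z = 4: (0xABCD << 4) >> 8 = 0xABC, whose low byte
    // 0xBC == fshl(0xAB, 0xCD, 4).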
31023 if (IsCstSplat)
31024 return SDValue();
31025
31026 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31027 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31028 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31029
31030 // Constant vXi16 funnel shifts can be efficiently handled by default.
31031 if (IsCst && EltSizeInBits == 16)
31032 return SDValue();
31033
31034 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31035 unsigned NumElts = VT.getVectorNumElements();
31036 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31037 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31038
31039 // Split 256-bit integers on XOP/pre-AVX2 targets.
31040 // Split 512-bit integers on non 512-bit BWI targets.
31041 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31042 !Subtarget.hasAVX2())) ||
31043 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31044 EltSizeInBits < 32)) {
31045 // Pre-mask the amount modulo using the wider vector.
31046 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31047 return splitVectorOp(Op, DAG);
31048 }
31049
31050 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31051 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31052 int ScalarAmtIdx = -1;
31053 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31054 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31055 if (EltSizeInBits == 16)
31056 return SDValue();
31057
31058 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31059 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31060 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31061 ScalarAmtIdx, Subtarget, DAG);
31062 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31063 ScalarAmtIdx, Subtarget, DAG);
31064 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31065 }
31066 }
31067
31068 MVT WideSVT = MVT::getIntegerVT(
31069 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31070 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31071
31072 // If per-element shifts are legal, fallback to generic expansion.
31073 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31074 return SDValue();
31075
31076 // Attempt to fold as:
31077 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31078 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31079 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31080 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31081 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31082 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31083 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31084 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31085 EltSizeInBits, DAG);
31086 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31087 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31088 if (!IsFSHR)
31089 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31090 EltSizeInBits, DAG);
31091 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31092 }
31093
31094 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31095 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31096 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31097 SDValue Z = DAG.getConstant(0, DL, VT);
31098 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31099 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31100 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31101 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31102 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31103 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31104 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31105 }
31106
31107 // Fallback to generic expansion.
31108 return SDValue();
31109 }
31110   assert(
31111       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31112       "Unexpected funnel shift type!");
31113
31114 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31115 bool OptForSize = DAG.shouldOptForSize();
31116 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31117
31118 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31119 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31120 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31121 !isa<ConstantSDNode>(Amt)) {
31122 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31123 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31124 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31125 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31126 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31127 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31128 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31129 if (IsFSHR) {
31130 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31131 } else {
31132 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31133 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31134 }
31135 return DAG.getZExtOrTrunc(Res, DL, VT);
31136 }
31137
31138 if (VT == MVT::i8 || ExpandFunnel)
31139 return SDValue();
31140
31141 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31142 if (VT == MVT::i16) {
31143 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31144 DAG.getConstant(15, DL, Amt.getValueType()));
31145 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31146 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31147 }
31148
31149 return Op;
31150}
31151
31152static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31153 SelectionDAG &DAG) {
31154 MVT VT = Op.getSimpleValueType();
31155   assert(VT.isVector() && "Custom lowering only for vector rotates!");
31156
31157 SDLoc DL(Op);
31158 SDValue R = Op.getOperand(0);
31159 SDValue Amt = Op.getOperand(1);
31160 unsigned Opcode = Op.getOpcode();
31161 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31162 int NumElts = VT.getVectorNumElements();
31163 bool IsROTL = Opcode == ISD::ROTL;
31164
31165 // Check for constant splat rotation amount.
31166 APInt CstSplatValue;
31167 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31168
31169 // Check for splat rotate by zero.
31170 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31171 return R;
31172
31173 // AVX512 implicitly uses modulo rotation amounts.
31174 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31175 // Attempt to rotate by immediate.
31176 if (IsCstSplat) {
31177 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31178 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31179 return DAG.getNode(RotOpc, DL, VT, R,
31180 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31181 }
31182
31183 // Else, fall-back on VPROLV/VPRORV.
31184 return Op;
31185 }
31186
31187 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31188 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31189 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31190 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31191 }
31192
31193 SDValue Z = DAG.getConstant(0, DL, VT);
31194
31195 if (!IsROTL) {
31196     // If the ISD::ROTR amount is constant, we're always better off converting to
31197 // ISD::ROTL.
31198 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31199 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31200
31201     // XOP targets always prefer ISD::ROTL.
31202 if (Subtarget.hasXOP())
31203 return DAG.getNode(ISD::ROTL, DL, VT, R,
31204 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31205 }
31206
31207 // Split 256-bit integers on XOP/pre-AVX2 targets.
31208 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31209 return splitVectorIntBinary(Op, DAG);
31210
31211 // XOP has 128-bit vector variable + immediate rotates.
31212 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31213 // XOP implicitly uses modulo rotation amounts.
31214 if (Subtarget.hasXOP()) {
31215     assert(IsROTL && "Only ROTL expected");
31216     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31217
31218 // Attempt to rotate by immediate.
31219 if (IsCstSplat) {
31220 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31221 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31222 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31223 }
31224
31225 // Use general rotate by variable (per-element).
31226 return Op;
31227 }
31228
31229   // Rotate by a uniform constant - expand back to shifts.
31230 if (IsCstSplat)
31231 return SDValue();
31232
31233 // Split 512-bit integers on non 512-bit BWI targets.
31234 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31235 return splitVectorIntBinary(Op, DAG);
31236
31237   assert(
31238       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31239        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31240         Subtarget.hasAVX2()) ||
31241        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31242       "Only vXi32/vXi16/vXi8 vector rotates supported");
31243
31244 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31245 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31246
31247 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31248 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31249
31250 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31251 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31252 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
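  // (A rotate is a funnel shift of a value with itself: each lane is doubled
  // up as (x:x), shifted by the masked amount, and the relevant half is kept.
  // e.g. i8 rotl(0xAB, 4): (0xABAB << 4) >> 8 has low byte 0xBA, the rotl result.)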
31253 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31254 int BaseRotAmtIdx = -1;
31255 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31256 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31257 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31258 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31259 }
31260 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31261 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31262 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31263 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31264 BaseRotAmtIdx, Subtarget, DAG);
31265 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31266 BaseRotAmtIdx, Subtarget, DAG);
31267 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31268 }
31269 }
31270
31271 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31272 // the amount bit.
31273 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31274 if (EltSizeInBits == 8) {
31275 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31276 MVT WideVT =
31277 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31278 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31279
31280 // Attempt to fold as:
31281 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31282 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31283 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31284 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31285 // If we're rotating by constant, just use default promotion.
31286 if (IsConstAmt)
31287 return SDValue();
31288 // See if we can perform this by widening to vXi16 or vXi32.
31289 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31290 R = DAG.getNode(
31291 ISD::OR, DL, WideVT, R,
31292 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31293 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31294 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31295 if (IsROTL)
31296 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31297 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31298 }
31299
31300 // Attempt to fold as unpack(x,x) << zext(y):
31301 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31302 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31303 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31304 // See if we can perform this by unpacking to lo/hi vXi16.
31305 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31306 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31307 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31308 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31309 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31310 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31311 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31312 }
31313     assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31314
31315 // We don't need ModuloAmt here as we just peek at individual bits.
31316 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31317 if (Subtarget.hasSSE41()) {
31318 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31319 // on the sign bit.
31320 V0 = DAG.getBitcast(VT, V0);
31321 V1 = DAG.getBitcast(VT, V1);
31322 Sel = DAG.getBitcast(VT, Sel);
31323 return DAG.getBitcast(SelVT,
31324 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31325 }
31326 // On pre-SSE41 targets we test for the sign bit by comparing to
31327 // zero - a negative value will set all bits of the lanes to true
31328 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31329 SDValue Z = DAG.getConstant(0, DL, SelVT);
31330 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31331 return DAG.getSelect(DL, SelVT, C, V0, V1);
31332 };
31333
31334 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31335 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31336 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31337 IsROTL = true;
31338 }
31339
31340 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31341 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31342
31343 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31344 // We can safely do this using i16 shifts as we're only interested in
31345 // the 3 lower bits of each byte.
31346 Amt = DAG.getBitcast(ExtVT, Amt);
31347 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31348 Amt = DAG.getBitcast(VT, Amt);
31349
31350 // r = VSELECT(r, rot(r, 4), a);
31351 SDValue M;
31352 M = DAG.getNode(
31353 ISD::OR, DL, VT,
31354 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31355 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31356 R = SignBitSelect(VT, Amt, M, R);
31357
31358 // a += a
31359 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31360
31361 // r = VSELECT(r, rot(r, 2), a);
31362 M = DAG.getNode(
31363 ISD::OR, DL, VT,
31364 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31365 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31366 R = SignBitSelect(VT, Amt, M, R);
31367
31368 // a += a
31369 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31370
31371 // return VSELECT(r, rot(r, 1), a);
31372 M = DAG.getNode(
31373 ISD::OR, DL, VT,
31374 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31375 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31376 return SignBitSelect(VT, Amt, M, R);
31377 }
31378
31379 bool IsSplatAmt = DAG.isSplatValue(Amt);
31380 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31381 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31382 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31383
31384 // Fallback for splats + all supported variable shifts.
31385   // Fallback for non-constant AVX2 vXi16 as well.
31386 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31387 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31388 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31389 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31390 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31391 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31392 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31393 }
31394
31395 // Everything below assumes ISD::ROTL.
31396 if (!IsROTL) {
31397 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31398 IsROTL = true;
31399 }
31400
31401 // ISD::ROT* uses modulo rotate amounts.
31402 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31403
31404   assert(IsROTL && "Only ROTL supported");
31405
31406 // As with shifts, attempt to convert the rotation amount to a multiplication
31407 // factor, fallback to general expansion.
31408 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31409 if (!Scale)
31410 return SDValue();
31411
31412 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31413 if (EltSizeInBits == 16) {
31414 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31415 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31416 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31417 }
31418
31419 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31420 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31421 // that can then be OR'd with the lower 32-bits.
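  // i.e. rotl(x, y) == lo32(x * 2^y) | hi32(x * 2^y) for 0 <= y < 32.
  // e.g. x = 0x80000001, y = 1: the 64-bit product is 0x100000002, so the
  // result is 0x00000002 | 0x00000001 == 0x00000003 == rotl(x, 1).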
31422   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31423 static const int OddMask[] = {1, -1, 3, -1};
31424 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31425 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31426
31427 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31428 DAG.getBitcast(MVT::v2i64, R),
31429 DAG.getBitcast(MVT::v2i64, Scale));
31430 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31431 DAG.getBitcast(MVT::v2i64, R13),
31432 DAG.getBitcast(MVT::v2i64, Scale13));
31433 Res02 = DAG.getBitcast(VT, Res02);
31434 Res13 = DAG.getBitcast(VT, Res13);
31435
31436 return DAG.getNode(ISD::OR, DL, VT,
31437 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31438 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31439}
31440
31441/// Returns true if the operand type is exactly twice the native width, and
31442/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31443/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31444/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31445bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31446 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31447
31448 if (OpWidth == 64)
31449 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31450 if (OpWidth == 128)
31451 return Subtarget.canUseCMPXCHG16B();
31452
31453 return false;
31454}
31455
31456TargetLoweringBase::AtomicExpansionKind
31457X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31458 Type *MemType = SI->getValueOperand()->getType();
31459
31460 bool NoImplicitFloatOps =
31461 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
31462 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31463 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
31464 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31465 return AtomicExpansionKind::None;
31466
31467 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31468 : AtomicExpansionKind::None;
31469}
31470
31471// Note: this turns large loads into lock cmpxchg8b/16b.
31472// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
31473TargetLowering::AtomicExpansionKind
31474X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31475 Type *MemType = LI->getType();
31476
31477 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31478 // can use movq to do the load. If we have X87 we can load into an 80-bit
31479 // X87 register and store it to a stack temporary.
31480 bool NoImplicitFloatOps =
31481 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
31482 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31483 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
31484 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31485 return AtomicExpansionKind::None;
31486
31487 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31488 : AtomicExpansionKind::None;
31489}
31490
31491enum BitTestKind : unsigned {
31492 UndefBit,
31493 ConstantBit,
31494 NotConstantBit,
31495 ShiftBit,
31496 NotShiftBit
31497};
31498
31499static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31500 using namespace llvm::PatternMatch;
31501 BitTestKind BTK = UndefBit;
31502 auto *C = dyn_cast<ConstantInt>(V);
31503 if (C) {
31504 // Check if V is a power of 2 or the NOT of a power of 2.
31505 if (isPowerOf2_64(C->getZExtValue()))
31506 BTK = ConstantBit;
31507 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31508 BTK = NotConstantBit;
31509 return {V, BTK};
31510 }
31511
31512 // Check if V is some power of 2 pattern known to be non-zero
31513 auto *I = dyn_cast<Instruction>(V);
31514 if (I) {
31515 bool Not = false;
31516 // Check if we have a NOT
31517 Value *PeekI;
31518 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
31519 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31520 Not = true;
31521 I = dyn_cast<Instruction>(PeekI);
31522
31523 // If I is constant, it will fold and we can evaluate later. If it's an
31524 // argument or something of that nature, we can't analyze.
31525 if (I == nullptr)
31526 return {nullptr, UndefBit};
31527 }
31528 // We can only use 1 << X without more sophisticated analysis. C << X where
31529 // C is a power of 2 but not 1 can result in zero which cannot be translated
31530 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31531 if (I->getOpcode() == Instruction::Shl) {
31532 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31533 // -X` and some other provable power of 2 patterns that we can use CTZ on
31534 // may be profitable.
31535 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31536 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31537 // be provably a non-zero power of 2.
31538 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31539 // transformable to bittest.
31540 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31541 if (!ShiftVal)
31542 return {nullptr, UndefBit};
31543 if (ShiftVal->equalsInt(1))
31544 BTK = Not ? NotShiftBit : ShiftBit;
31545
31546 if (BTK == UndefBit)
31547 return {nullptr, UndefBit};
31548
31549 Value *BitV = I->getOperand(1);
31550
31551 Value *AndOp;
31552 const APInt *AndC;
31553 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
31554 // Read past a shiftmask instruction to find the count.
31555 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
31556 BitV = AndOp;
31557 }
31558 return {BitV, BTK};
31559 }
31560 }
31561 return {nullptr, UndefBit};
31562}
31563
31564TargetLowering::AtomicExpansionKind
31565X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31566 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31567 // prefix to a normal instruction for these operations.
31568 if (AI->use_empty())
31569 return AtomicExpansionKind::None;
31570
31571 // If the atomicrmw's result is used by a single bit AND, we may use
31572 // bts/btr/btc instruction for these operations.
31573 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31574 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31575 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31576 // detect it.
31577 Instruction *I = AI->user_back();
31578 auto BitChange = FindSingleBitChange(AI->getValOperand());
31579 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31580 I->getOpcode() != Instruction::And ||
31581 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31582 AI->getParent() != I->getParent())
31583 return AtomicExpansionKind::CmpXChg;
31584
31585 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31586
31587 // This is a redundant AND; it should get cleaned up elsewhere.
31588 if (AI == I->getOperand(OtherIdx))
31589 return AtomicExpansionKind::CmpXChg;
31590
31591 // The following instruction must be an AND of a single bit.
31592 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31593 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31594 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31595 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31596 return AtomicExpansionKind::CmpXChg;
31597 }
31598 if (AI->getOperation() == AtomicRMWInst::And) {
31599 return ~C1->getValue() == C2->getValue()
31600 ? AtomicExpansionKind::BitTestIntrinsic
31601 : AtomicExpansionKind::CmpXChg;
31602 }
31603 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31604 : AtomicExpansionKind::CmpXChg;
31605 }
31606
31607 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31608
31609 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31610 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31611 return AtomicExpansionKind::CmpXChg;
31612
31613 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31614
31615 // If shift amounts are not the same we can't use BitTestIntrinsic.
31616 if (BitChange.first != BitTested.first)
31617 return AtomicExpansionKind::CmpXChg;
31618
31619 // An atomic AND must mask all but one bit and test the one bit that is
31620 // unset in the mask.
31621 if (AI->getOperation() == AtomicRMWInst::And)
31622 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31623 ? AtomicExpansionKind::BitTestIntrinsic
31624 : AtomicExpansionKind::CmpXChg;
31625
31626 // An atomic XOR/OR must set and test the same bit.
31627 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31628 ? AtomicExpansionKind::BitTestIntrinsic
31629 : AtomicExpansionKind::CmpXChg;
31630}
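
For orientation, this is roughly the source-level shape the checks above look for; a hedged sketch, not taken from this file, and whether it really becomes a bit-test intrinsic still depends on every condition checked above (single use, matching shift amounts, non-i8 width, same basic block):

#include <atomic>

// An atomicrmw OR of a single (possibly variable) bit whose only use is an
// AND against that same bit: a candidate for BitTestIntrinsic (lock bts)
// rather than a cmpxchg loop.
bool TestAndSetBit(std::atomic<unsigned> &Flags, unsigned Bit) {
  unsigned Mask = 1u << Bit;            // ShiftBit pattern: 1 << X
  unsigned Old = Flags.fetch_or(Mask);  // AtomicRMWInst::Or
  return (Old & Mask) != 0;             // single-bit AND of the result
}
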
31631
31632void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31633 IRBuilder<> Builder(AI);
31634 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
31635 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
31636 switch (AI->getOperation()) {
31637 default:
31638 llvm_unreachable("Unknown atomic operation");
31639 case AtomicRMWInst::Or:
31640 IID_C = Intrinsic::x86_atomic_bts;
31641 IID_I = Intrinsic::x86_atomic_bts_rm;
31642 break;
31643 case AtomicRMWInst::Xor:
31644 IID_C = Intrinsic::x86_atomic_btc;
31645 IID_I = Intrinsic::x86_atomic_btc_rm;
31646 break;
31647 case AtomicRMWInst::And:
31648 IID_C = Intrinsic::x86_atomic_btr;
31649 IID_I = Intrinsic::x86_atomic_btr_rm;
31650 break;
31651 }
31652 Instruction *I = AI->user_back();
31653 LLVMContext &Ctx = AI->getContext();
31654 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31655 Type::getInt8PtrTy(Ctx));
31656 Function *BitTest = nullptr;
31657 Value *Result = nullptr;
31658 auto BitTested = FindSingleBitChange(AI->getValOperand());
31659 assert(BitTested.first != nullptr);
31660
31661 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31662 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31663
31664 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
31665
31666 unsigned Imm = countTrailingZeros(C->getZExtValue());
31667 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31668 } else {
31669 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
31670
31671 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31672
31673 Value *SI = BitTested.first;
31674 assert(SI != nullptr);
31675
31676 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need to
31677 // mask it.
31678 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31679 Value *BitPos =
31680 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31681 // Todo(1): In many cases it may be provable that SI is less than
31682 // ShiftBits in which case this mask is unnecessary
31683 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31684 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31685 // favor of just a raw BT{S|R|C}.
31686
31687 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
31688 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31689
31690 // If the result is only used for zero/non-zero status then we don't need to
31691 // shift the value back. Otherwise do so.
31692 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31693 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31694 if (ICmp->isEquality()) {
31695 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31696 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31697 if (C0 || C1) {
31698 assert(C0 == nullptr || C1 == nullptr);
31699 if ((C0 ? C0 : C1)->isZero())
31700 continue;
31701 }
31702 }
31703 }
31704 Result = Builder.CreateShl(Result, BitPos);
31705 break;
31706 }
31707 }
31708
31709 I->replaceAllUsesWith(Result);
31710 I->eraseFromParent();
31711 AI->eraseFromParent();
31712}
31713
31714static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31715 using namespace llvm::PatternMatch;
31716 if (!AI->hasOneUse())
31717 return false;
31718
31719 Value *Op = AI->getOperand(1);
31720 ICmpInst::Predicate Pred;
31721 Instruction *I = AI->user_back();
31722 AtomicRMWInst::BinOp Opc = AI->getOperation();
31723 if (Opc == AtomicRMWInst::Add) {
31724 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31725 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31726 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31727 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31728 return Pred == CmpInst::ICMP_SLT;
31729 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31730 return Pred == CmpInst::ICMP_SGT;
31731 }
31732 return false;
31733 }
31734 if (Opc == AtomicRMWInst::Sub) {
31735 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31736 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31737 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31738 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31739 return Pred == CmpInst::ICMP_SLT;
31740 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31741 return Pred == CmpInst::ICMP_SGT;
31742 }
31743 return false;
31744 }
31745 if ((Opc == AtomicRMWInst::Or &&
31746 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
31747 (Opc == AtomicRMWInst::And &&
31748 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
31749 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31750 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
31751 Pred == CmpInst::ICMP_SLT;
31752 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31753 return Pred == CmpInst::ICMP_SGT;
31754 return false;
31755 }
31756 if (Opc == AtomicRMWInst::Xor) {
31757 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31758 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31759 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
31760 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31761 return Pred == CmpInst::ICMP_SLT;
31762 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31763 return Pred == CmpInst::ICMP_SGT;
31764 }
31765 return false;
31766 }
31767
31768 return false;
31769}
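
One concrete shape accepted by the Sub case above, shown as an illustrative sketch: comparing the value returned by fetch_sub against the amount subtracted asks whether the new value is zero, which the ZF of a `lock sub` already reports, so no cmpxchg loop or separate compare is needed.

#include <atomic>

// Matches the AtomicRMWInst::Sub + ICMP_EQ pattern: old == Dec holds
// exactly when the decremented value is zero.
bool ReleaseRef(std::atomic<int> &RefCount, int Dec) {
  return RefCount.fetch_sub(Dec, std::memory_order_acq_rel) == Dec;
}
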
31770
31771void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
31772 AtomicRMWInst *AI) const {
31773 IRBuilder<> Builder(AI);
31774 Instruction *TempI = nullptr;
31775 LLVMContext &Ctx = AI->getContext();
31776 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31777 if (!ICI) {
31778 TempI = AI->user_back();
31779 assert(TempI->hasOneUse() && "Must have one use");
31780 ICI = cast<ICmpInst>(TempI->user_back());
31781 }
31782 X86::CondCode CC = X86::COND_INVALID;
31783 ICmpInst::Predicate Pred = ICI->getPredicate();
31784 switch (Pred) {
31785 default:
31786 llvm_unreachable("Not supported Pred");
31787 case CmpInst::ICMP_EQ:
31788 CC = X86::COND_E;
31789 break;
31790 case CmpInst::ICMP_NE:
31791 CC = X86::COND_NE;
31792 break;
31793 case CmpInst::ICMP_SLT:
31794 CC = X86::COND_S;
31795 break;
31796 case CmpInst::ICMP_SGT:
31797 CC = X86::COND_NS;
31798 break;
31799 }
31800 Intrinsic::ID IID = Intrinsic::not_intrinsic;
31801 switch (AI->getOperation()) {
31802 default:
31803 llvm_unreachable("Unknown atomic operation");
31804 case AtomicRMWInst::Add:
31805 IID = Intrinsic::x86_atomic_add_cc;
31806 break;
31807 case AtomicRMWInst::Sub:
31808 IID = Intrinsic::x86_atomic_sub_cc;
31809 break;
31810 case AtomicRMWInst::Or:
31811 IID = Intrinsic::x86_atomic_or_cc;
31812 break;
31813 case AtomicRMWInst::And:
31814 IID = Intrinsic::x86_atomic_and_cc;
31815 break;
31816 case AtomicRMWInst::Xor:
31817 IID = Intrinsic::x86_atomic_xor_cc;
31818 break;
31819 }
31820 Function *CmpArith =
31821 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
31822 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31823 Type::getInt8PtrTy(Ctx));
31824 Value *Call = Builder.CreateCall(
31825 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31826 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
31827 ICI->replaceAllUsesWith(Result);
31828 ICI->eraseFromParent();
31829 if (TempI)
31830 TempI->eraseFromParent();
31831 AI->eraseFromParent();
31832}
31833
31834TargetLowering::AtomicExpansionKind
31835X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
31836 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31837 Type *MemType = AI->getType();
31838
31839 // If the operand is too big, we must see if cmpxchg8/16b is available
31840 // and default to library calls otherwise.
31841 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31842 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31843 : AtomicExpansionKind::None;
31844 }
31845
31846 AtomicRMWInst::BinOp Op = AI->getOperation();
31847 switch (Op) {
31848 case AtomicRMWInst::Xchg:
31849 return AtomicExpansionKind::None;
31850 case AtomicRMWInst::Add:
31851 case AtomicRMWInst::Sub:
31852 if (shouldExpandCmpArithRMWInIR(AI))
31853 return AtomicExpansionKind::CmpArithIntrinsic;
31854 // It's better to use xadd, xsub or xchg for these in other cases.
31855 return AtomicExpansionKind::None;
31856 case AtomicRMWInst::Or:
31857 case AtomicRMWInst::And:
31858 case AtomicRMWInst::Xor:
31859 if (shouldExpandCmpArithRMWInIR(AI))
31860 return AtomicExpansionKind::CmpArithIntrinsic;
31861 return shouldExpandLogicAtomicRMWInIR(AI);
31862 case AtomicRMWInst::Nand:
31863 case AtomicRMWInst::Max:
31864 case AtomicRMWInst::Min:
31865 case AtomicRMWInst::UMax:
31866 case AtomicRMWInst::UMin:
31867 case AtomicRMWInst::FAdd:
31868 case AtomicRMWInst::FSub:
31869 case AtomicRMWInst::FMax:
31870 case AtomicRMWInst::FMin:
31871 case AtomicRMWInst::UIncWrap:
31872 case AtomicRMWInst::UDecWrap:
31873 default:
31874 // These always require a non-trivial set of data operations on x86. We must
31875 // use a cmpxchg loop.
31876 return AtomicExpansionKind::CmpXChg;
31877 }
31878}
31879
31880LoadInst *
31881X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
31882 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31883 Type *MemType = AI->getType();
31884 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
31885 // there is no benefit in turning such RMWs into loads, and it is actually
31886 // harmful as it introduces an mfence.
31887 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31888 return nullptr;
31889
31890 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
31891 // lowering available in lowerAtomicArith.
31892 // TODO: push more cases through this path.
31893 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31894 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31895 AI->use_empty())
31896 return nullptr;
31897
31898 IRBuilder<> Builder(AI);
31899 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31900 auto SSID = AI->getSyncScopeID();
31901 // We must restrict the ordering to avoid generating loads with Release or
31902 // ReleaseAcquire orderings.
31903 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
31904
31905 // Before the load we need a fence. Here is an example lifted from
31906 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31907 // is required:
31908 // Thread 0:
31909 // x.store(1, relaxed);
31910 // r1 = y.fetch_add(0, release);
31911 // Thread 1:
31912 // y.fetch_add(42, acquire);
31913 // r2 = x.load(relaxed);
31914 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
31915 // lowered to just a load without a fence. A mfence flushes the store buffer,
31916 // making the optimization clearly correct.
31917 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
31918 // otherwise; we might be able to be more aggressive on relaxed idempotent
31919 // rmw. In practice, they do not look useful, so we don't try to be
31920 // especially clever.
31921 if (SSID == SyncScope::SingleThread)
31922 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
31923 // the IR level, so we must wrap it in an intrinsic.
31924 return nullptr;
31925
31926 if (!Subtarget.hasMFence())
31927 // FIXME: it might make sense to use a locked operation here but on a
31928 // different cache-line to prevent cache-line bouncing. In practice it
31929 // is probably a small win, and x86 processors without mfence are rare
31930 // enough that we do not bother.
31931 return nullptr;
31932
31933 Function *MFence =
31934 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
31935 Builder.CreateCall(MFence, {});
31936
31937 // Finally we can emit the atomic load.
31938 LoadInst *Loaded = Builder.CreateAlignedLoad(
31939 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31940 Loaded->setAtomic(Order, SSID);
31941 AI->replaceAllUsesWith(Loaded);
31942 AI->eraseFromParent();
31943 return Loaded;
31944}
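
As a hedged illustration of the inputs this targets (not from this file): an idempotent RMW whose value is used, such as a fetch_add of zero, may be rewritten by the code above into an mfence followed by a plain atomic load instead of a lock xadd or a cmpxchg loop.

#include <atomic>

// Idempotent atomicrmw whose result is used; a candidate for the
// fence-plus-load lowering built above on subtargets with MFENCE.
int ReadWithFence(std::atomic<int> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);
}
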
31945
31946bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
31947 if (!SI.isUnordered())
31948 return false;
31949 return ExperimentalUnorderedISEL;
31950}
31951bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
31952 if (!LI.isUnordered())
31953 return false;
31954 return ExperimentalUnorderedISEL;
31955}
31956
31957
31958/// Emit a locked operation on a stack location which does not change any
31959/// memory location, but does involve a lock prefix. Location is chosen to be
31960/// a) very likely accessed only by a single thread to minimize cache traffic,
31961/// and b) definitely dereferenceable. Returns the new Chain result.
31962static SDValue emitLockedStackOp(SelectionDAG &DAG,
31963 const X86Subtarget &Subtarget, SDValue Chain,
31964 const SDLoc &DL) {
31965 // Implementation notes:
31966 // 1) LOCK prefix creates a full read/write reordering barrier for memory
31967 // operations issued by the current processor. As such, the location
31968 // referenced is not relevant for the ordering properties of the instruction.
31969 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31970 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
31971 // 2) Using an immediate operand appears to be the best encoding choice
31972 // here since it doesn't require an extra register.
31973 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
31974 // is small enough it might just be measurement noise.)
31975 // 4) When choosing offsets, there are several contributing factors:
31976 // a) If there's no redzone, we default to TOS. (We could allocate a cache
31977 // line aligned stack object to improve this case.)
31978 // b) To minimize our chances of introducing a false dependence, we prefer
31979 // to offset the stack usage from TOS slightly.
31980 // c) To minimize concerns about cross thread stack usage - in particular,
31981 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31982 // captures state in the TOS frame and accesses it from many threads -
31983 // we want to use an offset such that the offset is in a distinct cache
31984 // line from the TOS frame.
31985 //
31986 // For a general discussion of the tradeoffs and benchmark results, see:
31987 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
31988
31989 auto &MF = DAG.getMachineFunction();
31990 auto &TFL = *Subtarget.getFrameLowering();
31991 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31992
31993 if (Subtarget.is64Bit()) {
31994 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31995 SDValue Ops[] = {
31996 DAG.getRegister(X86::RSP, MVT::i64), // Base
31997 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31998 DAG.getRegister(0, MVT::i64), // Index
31999 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32000 DAG.getRegister(0, MVT::i16), // Segment.
32001 Zero,
32002 Chain};
32003 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32004 MVT::Other, Ops);
32005 return SDValue(Res, 1);
32006 }
32007
32008 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32009 SDValue Ops[] = {
32010 DAG.getRegister(X86::ESP, MVT::i32), // Base
32011 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32012 DAG.getRegister(0, MVT::i32), // Index
32013 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32014 DAG.getRegister(0, MVT::i16), // Segment.
32015 Zero,
32016 Chain
32017 };
32018 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32019 MVT::Other, Ops);
32020 return SDValue(Res, 1);
32021}
32022
32023static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32024 SelectionDAG &DAG) {
32025 SDLoc dl(Op);
32026 AtomicOrdering FenceOrdering =
32027 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32028 SyncScope::ID FenceSSID =
32029 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32030
32031 // The only fence that needs an instruction is a sequentially-consistent
32032 // cross-thread fence.
32033 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32034 FenceSSID == SyncScope::System) {
32035 if (Subtarget.hasMFence())
32036 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32037
32038 SDValue Chain = Op.getOperand(0);
32039 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32040 }
32041
32042 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32043 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32044}
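
In source terms, a hedged illustration: only a sequentially consistent, cross-thread fence produces an instruction here, either MFENCE or the locked stack OR from emitLockedStackOp; weaker fences become a compiler-only MEMBARRIER.

#include <atomic>

// The one fence shape that needs real code on x86.
void FullFence() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}
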
32045
32046static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32047 SelectionDAG &DAG) {
32048 MVT T = Op.getSimpleValueType();
32049 SDLoc DL(Op);
32050 unsigned Reg = 0;
32051 unsigned size = 0;
32052 switch(T.SimpleTy) {
32053 default: llvm_unreachable("Invalid value type!");
32054 case MVT::i8: Reg = X86::AL; size = 1; break;
32055 case MVT::i16: Reg = X86::AX; size = 2; break;
32056 case MVT::i32: Reg = X86::EAX; size = 4; break;
32057 case MVT::i64:
32058 assert(Subtarget.is64Bit() && "Node not type legal!");
32059 Reg = X86::RAX; size = 8;
32060 break;
32061 }
32062 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32063 Op.getOperand(2), SDValue());
32064 SDValue Ops[] = { cpIn.getValue(0),
32065 Op.getOperand(1),
32066 Op.getOperand(3),
32067 DAG.getTargetConstant(size, DL, MVT::i8),
32068 cpIn.getValue(1) };
32069 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32070 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32071 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32072 Ops, T, MMO);
32073
32074 SDValue cpOut =
32075 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32076 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32077 MVT::i32, cpOut.getValue(2));
32078 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32079
32080 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32081 cpOut, Success, EFLAGS.getValue(1));
32082}
32083
32084// Create MOVMSKB, taking into account whether we need to split for AVX1.
32085static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32086 const X86Subtarget &Subtarget) {
32087 MVT InVT = V.getSimpleValueType();
32088
32089 if (InVT == MVT::v64i8) {
32090 SDValue Lo, Hi;
32091 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32092 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32093 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32094 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32095 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32096 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32097 DAG.getConstant(32, DL, MVT::i8));
32098 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32099 }
32100 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32101 SDValue Lo, Hi;
32102 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32103 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32104 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32105 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32106 DAG.getConstant(16, DL, MVT::i8));
32107 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32108 }
32109
32110 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32111}
32112
32113static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32114 SelectionDAG &DAG) {
32115 SDValue Src = Op.getOperand(0);
32116 MVT SrcVT = Src.getSimpleValueType();
32117 MVT DstVT = Op.getSimpleValueType();
32118
32119 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32120 // half to v32i1 and concatenating the result.
32121 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32122 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32123 assert(Subtarget.hasBWI() && "Expected BWI target");
32124 SDLoc dl(Op);
32125 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
32126 DAG.getIntPtrConstant(0, dl));
32127 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32128 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
32129 DAG.getIntPtrConstant(1, dl));
32130 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32131 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32132 }
32133
32134 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32135 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32136 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32137 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32138 SDLoc DL(Op);
32139 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32140 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32141 return DAG.getZExtOrTrunc(V, DL, DstVT);
32142 }
32143
32144 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32145 SrcVT == MVT::i64) && "Unexpected VT!");
32146
32147 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32148 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32149 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32150 // This conversion needs to be expanded.
32151 return SDValue();
32152
32153 SDLoc dl(Op);
32154 if (SrcVT.isVector()) {
32155 // Widen the input vector in the case of MVT::v2i32.
32156 // Example: from MVT::v2i32 to MVT::v4i32.
32157 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32158 SrcVT.getVectorNumElements() * 2);
32159 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32160 DAG.getUNDEF(SrcVT));
32161 } else {
32162 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32163 "Unexpected source type in LowerBITCAST");
32164 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32165 }
32166
32167 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32168 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32169
32170 if (DstVT == MVT::x86mmx)
32171 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32172
32173 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32174 DAG.getIntPtrConstant(0, dl));
32175}
32176
32177/// Compute the horizontal sum of bytes in V for the elements of VT.
32178///
32179/// Requires V to be a byte vector and VT to be an integer vector type with
32180/// wider elements than V's type. The width of the elements of VT determines
32181/// how many bytes of V are summed horizontally to produce each element of the
32182/// result.
32183static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32184 const X86Subtarget &Subtarget,
32185 SelectionDAG &DAG) {
32186 SDLoc DL(V);
32187 MVT ByteVecVT = V.getSimpleValueType();
32188 MVT EltVT = VT.getVectorElementType();
32189 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32190 "Expected value to have byte element type.");
32191 assert(EltVT != MVT::i8 &&
32192 "Horizontal byte sum only makes sense for wider elements!");
32193 unsigned VecSize = VT.getSizeInBits();
32194 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32195
32196 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32197 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32198 if (EltVT == MVT::i64) {
32199 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32200 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32201 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32202 return DAG.getBitcast(VT, V);
32203 }
32204
32205 if (EltVT == MVT::i32) {
32206 // We unpack the low half and high half into i32s interleaved with zeros so
32207 // that we can use PSADBW to horizontally sum them. The most useful part of
32208 // this is that it lines up the results of two PSADBW instructions to be
32209 // two v2i64 vectors which concatenated are the 4 population counts. We can
32210 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32211 SDValue Zeros = DAG.getConstant(0, DL, VT);
32212 SDValue V32 = DAG.getBitcast(VT, V);
32213 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32214 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32215
32216 // Do the horizontal sums into two v2i64s.
32217 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32218 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32219 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32220 DAG.getBitcast(ByteVecVT, Low), Zeros);
32221 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32222 DAG.getBitcast(ByteVecVT, High), Zeros);
32223
32224 // Merge them together.
32225 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32226 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32227 DAG.getBitcast(ShortVecVT, Low),
32228 DAG.getBitcast(ShortVecVT, High));
32229
32230 return DAG.getBitcast(VT, V);
32231 }
32232
32233 // The only element type left is i16.
32234 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32235
32236 // To obtain pop count for each i16 element starting from the pop count for
32237 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32238 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32239 // directly supported.
32240 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32241 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32242 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32243 DAG.getBitcast(ByteVecVT, V));
32244 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32245}
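
The i16 tail above is the least obvious step. A scalar model of one 16-bit lane, assuming the lane already holds two independent per-byte popcounts (names are illustrative):

#include <cstdint>

// Model of the SHL-by-8 / bytewise-ADD / SRL-by-8 sequence: the high byte
// of the bytewise sum ends up holding loCnt + hiCnt, and the final shift
// moves it down as the per-i16 popcount.
static uint16_t ByteSumToI16(uint16_t V) {           // V = (hiCnt << 8) | loCnt
  uint16_t Shl = uint16_t(V << 8);                    // ISD::SHL within the lane
  uint8_t LoB = uint8_t(Shl) + uint8_t(V);            // bytewise ADD, low byte
  uint8_t HiB = uint8_t(Shl >> 8) + uint8_t(V >> 8);  // bytewise ADD, high byte
  return uint16_t(((unsigned(HiB) << 8) | LoB) >> 8); // ISD::SRL by 8
}
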
32246
32247static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32248 const X86Subtarget &Subtarget,
32249 SelectionDAG &DAG) {
32250 MVT VT = Op.getSimpleValueType();
32251 MVT EltVT = VT.getVectorElementType();
32252 int NumElts = VT.getVectorNumElements();
32253 (void)EltVT;
32254 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32255
32256 // Implement a lookup table in register by using an algorithm based on:
32257 // http://wm.ite.pl/articles/sse-popcount.html
32258 //
32259 // The general idea is that every lower byte nibble in the input vector is an
32260 // index into an in-register pre-computed pop count table. We then split up the
32261 // input vector into two new ones: (1) a vector with only the shifted-right
32262 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32263 // masked out higher ones) for each byte. PSHUFB is used separately with both
32264 // to index the in-register table. Next, both are added and the result is an
32265 // i8 vector where each element contains the pop count for its input byte.
32266 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32267 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32268 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32269 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32270
32271 SmallVector<SDValue, 64> LUTVec;
32272 for (int i = 0; i < NumElts; ++i)
32273 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32274 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32275 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32276
32277 // High nibbles
32278 SDValue FourV = DAG.getConstant(4, DL, VT);
32279 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32280
32281 // Low nibbles
32282 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32283
32284 // The input vector is used as the shuffle mask that indexes elements into the
32285 // LUT. After counting low and high nibbles, add the results to obtain the
32286 // final pop count per i8 element.
32287 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32288 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32289 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32290}
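
Per byte, the algorithm above reduces to two nibble lookups in a 16-entry table. An illustrative scalar sketch of one byte:

#include <cstdint>

// PSHUFB plays the role of the table lookup: the low nibble and the
// shifted-down high nibble are each looked up and the counts are added.
static uint8_t PopCountByteViaLUT(uint8_t B) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[B & 0x0F] + LUT[B >> 4];
}
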
32291
32292// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32293// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32294static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32295 SelectionDAG &DAG) {
32296 MVT VT = Op.getSimpleValueType();
32297 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32298 "Unknown CTPOP type to handle");
32299 SDLoc DL(Op.getNode());
32300 SDValue Op0 = Op.getOperand(0);
32301
32302 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32303 if (Subtarget.hasVPOPCNTDQ()) {
32304 unsigned NumElems = VT.getVectorNumElements();
32305 assert((VT.getVectorElementType() == MVT::i8 ||
32306 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32307 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32308 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32309 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32310 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32311 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32312 }
32313 }
32314
32315 // Decompose 256-bit ops into smaller 128-bit ops.
32316 if (VT.is256BitVector() && !Subtarget.hasInt256())
32317 return splitVectorIntUnary(Op, DAG);
32318
32319 // Decompose 512-bit ops into smaller 256-bit ops.
32320 if (VT.is512BitVector() && !Subtarget.hasBWI())
32321 return splitVectorIntUnary(Op, DAG);
32322
32323 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32324 if (VT.getScalarType() != MVT::i8) {
32325 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32326 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32327 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32328 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32329 }
32330
32331 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32332 if (!Subtarget.hasSSSE3())
32333 return SDValue();
32334
32335 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32336}
32337
32338static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32339 SelectionDAG &DAG) {
32340 assert(Op.getSimpleValueType().isVector() &&
32341 "We only do custom lowering for vector population count.");
32342 return LowerVectorCTPOP(Op, Subtarget, DAG);
32343}
32344
32345static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32346 MVT VT = Op.getSimpleValueType();
32347 SDValue In = Op.getOperand(0);
32348 SDLoc DL(Op);
32349
32350 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32351 // perform the BITREVERSE.
32352 if (!VT.isVector()) {
32353 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32354 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32355 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32356 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32357 DAG.getIntPtrConstant(0, DL));
32358 }
32359
32360 int NumElts = VT.getVectorNumElements();
32361 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32362
32363 // Decompose 256-bit ops into smaller 128-bit ops.
32364 if (VT.is256BitVector())
32365 return splitVectorIntUnary(Op, DAG);
32366
32367 assert(VT.is128BitVector() &&
32368 "Only 128-bit vector bitreverse lowering supported.");
32369
32370 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32371 // perform the BSWAP in the shuffle.
32372 // It's best to shuffle using the second operand, as this will implicitly allow
32373 // memory folding for multiple vectors.
32374 SmallVector<SDValue, 16> MaskElts;
32375 for (int i = 0; i != NumElts; ++i) {
32376 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32377 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32378 int PermuteByte = SourceByte | (2 << 5);
32379 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32380 }
32381 }
32382
32383 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32384 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32385 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32386 Res, Mask);
32387 return DAG.getBitcast(VT, Res);
32388}
32389
32390static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32391 SelectionDAG &DAG) {
32392 MVT VT = Op.getSimpleValueType();
32393
32394 if (Subtarget.hasXOP() && !VT.is512BitVector())
32395 return LowerBITREVERSE_XOP(Op, DAG);
32396
32397 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
32398
32399 SDValue In = Op.getOperand(0);
32400 SDLoc DL(Op);
32401
32402 assert(VT.getScalarType() == MVT::i8 &&
32403 "Only byte vector BITREVERSE supported");
32404
32405 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
32406 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
32407 return splitVectorIntUnary(Op, DAG);
32408
32409 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32410 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
32411 return splitVectorIntUnary(Op, DAG);
32412
32413 unsigned NumElts = VT.getVectorNumElements();
32414
32415 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32416 if (Subtarget.hasGFNI()) {
32417 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
32418 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
32419 Matrix = DAG.getBitcast(VT, Matrix);
32420 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32421 DAG.getTargetConstant(0, DL, MVT::i8));
32422 }
32423
32424 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32425 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
32426 // 0-15 value (moved to the other nibble).
32427 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32428 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32429 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32430
32431 const int LoLUT[16] = {
32432 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32433 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32434 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32435 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32436 const int HiLUT[16] = {
32437 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32438 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32439 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32440 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32441
32442 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32443 for (unsigned i = 0; i < NumElts; ++i) {
32444 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32445 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32446 }
32447
32448 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32449 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32450 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32451 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32452 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32453}
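
The PSHUFB path reverses each byte with the two nibble tables above. An illustrative scalar model of a single byte, using the same LoLUT/HiLUT contents:

#include <cstdint>

// The low nibble indexes a table of values bit-reversed into the high
// nibble, the high nibble a table reversed into the low nibble, and the
// two lookups are OR'd to give the bit-reversed byte.
static uint8_t BitReverseByte(uint8_t B) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                    0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0x0F] | HiLUT[B >> 4];
}
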
32454
32455static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32456 SelectionDAG &DAG) {
32457 SDLoc DL(Op);
32458 SDValue X = Op.getOperand(0);
32459 MVT VT = Op.getSimpleValueType();
32460
32461 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32462 if (VT == MVT::i8 ||
32463 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32464 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32465 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32466 DAG.getConstant(0, DL, MVT::i8));
32467 // Copy the inverse of the parity flag into a register with setcc.
32468 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32469 // Extend to the original type.
32470 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32471 }
32472
32473 // If we have POPCNT, use the default expansion.
32474 if (Subtarget.hasPOPCNT())
32475 return SDValue();
32476
32477 if (VT == MVT::i64) {
32478 // Xor the high and low 32-bit halves together using a 32-bit operation.
32479 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32480 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32481 DAG.getConstant(32, DL, MVT::i8)));
32482 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32483 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32484 }
32485
32486 if (VT != MVT::i16) {
32487 // Xor the high and low 16-bits together using a 32-bit operation.
32488 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32489 DAG.getConstant(16, DL, MVT::i8));
32490 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32491 } else {
32492 // If the input is 16-bits, we need to extend to use an i32 shift below.
32493 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32494 }
32495
32496 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32497 // This should allow an h-reg to be used to save a shift.
32498 SDValue Hi = DAG.getNode(
32499 ISD::TRUNCATE, DL, MVT::i8,
32500 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32501 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32502 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32503 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32504
32505 // Copy the inverse of the parity flag into a register with setcc.
32506 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32507 // Extend to the original type.
32508 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32509}
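
The folding above works because XOR preserves parity, so repeatedly folding the value in half leaves the answer unchanged. An illustrative scalar sketch; __builtin_parity stands in for the SETNP flag read at the end:

#include <cstdint>

static unsigned Parity32(uint32_t X) {
  X ^= X >> 16;                        // fold the high half into the low half
  X ^= X >> 8;                         // fold again down to a single byte
  return __builtin_parity(X & 0xFFu);  // equals the inverse of PF, i.e. SETNP
}
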
32510
32511static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32512 const X86Subtarget &Subtarget) {
32513 unsigned NewOpc = 0;
32514 switch (N->getOpcode()) {
32515 case ISD::ATOMIC_LOAD_ADD:
32516 NewOpc = X86ISD::LADD;
32517 break;
32518 case ISD::ATOMIC_LOAD_SUB:
32519 NewOpc = X86ISD::LSUB;
32520 break;
32521 case ISD::ATOMIC_LOAD_OR:
32522 NewOpc = X86ISD::LOR;
32523 break;
32524 case ISD::ATOMIC_LOAD_XOR:
32525 NewOpc = X86ISD::LXOR;
32526 break;
32527 case ISD::ATOMIC_LOAD_AND:
32528 NewOpc = X86ISD::LAND;
32529 break;
32530 default:
32531 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32532 }
32533
32534 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32535
32536 return DAG.getMemIntrinsicNode(
32537 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32538 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32539 /*MemVT=*/N->getSimpleValueType(0), MMO);
32540}
32541
32542/// Lower atomic_load_ops into LOCK-prefixed operations.
32543static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32544 const X86Subtarget &Subtarget) {
32545 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32546 SDValue Chain = N->getOperand(0);
32547 SDValue LHS = N->getOperand(1);
32548 SDValue RHS = N->getOperand(2);
32549 unsigned Opc = N->getOpcode();
32550 MVT VT = N->getSimpleValueType(0);
32551 SDLoc DL(N);
32552
32553 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32554 // can only be lowered when the result is unused. They should have already
32555 // been transformed into a cmpxchg loop in AtomicExpand.
32556 if (N->hasAnyUseOfValue(0)) {
32557 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32558 // select LXADD if LOCK_SUB can't be selected.
32559 if (Opc == ISD::ATOMIC_LOAD_SUB) {
32560 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
32561 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32562 RHS, AN->getMemOperand());
32563 }
32564 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32565 "Used AtomicRMW ops other than Add should have been expanded!");
32566 return N;
32567 }
32568
32569   // Specialized lowering for the canonical form of an idempotent atomicrmw.
32570 // The core idea here is that since the memory location isn't actually
32571 // changing, all we need is a lowering for the *ordering* impacts of the
32572   // atomicrmw. As such, we can choose a different operation and memory
32573 // location to minimize impact on other code.
32574 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
32575 // On X86, the only ordering which actually requires an instruction is
32576     // seq_cst which isn't SingleThread; everything else just needs to be
32577     // preserved during codegen and then dropped. Note that we expect (but don't assume)
32578 // that orderings other than seq_cst and acq_rel have been canonicalized to
32579 // a store or load.
32580 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32581 AN->getSyncScopeID() == SyncScope::System) {
32582 // Prefer a locked operation against a stack location to minimize cache
32583 // traffic. This assumes that stack locations are very likely to be
32584 // accessed only by the owning thread.
32585 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32586       assert(!N->hasAnyUseOfValue(0));
32587 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32588 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32589 DAG.getUNDEF(VT), NewChain);
32590 }
32591 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32592 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32593     assert(!N->hasAnyUseOfValue(0));
32594 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32595 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32596 DAG.getUNDEF(VT), NewChain);
32597 }
32598
32599 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32600 // RAUW the chain, but don't worry about the result, as it's unused.
32601   assert(!N->hasAnyUseOfValue(0));
32602 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32603 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32604 DAG.getUNDEF(VT), LockOp.getValue(1));
32605}
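// Editor's note: a hedged sketch of the "idempotent atomicrmw" special case
// handled above. An atomicrmw or with 0 whose result is unused changes no
// memory, so only its ordering matters; for seq_cst the lowering prefers a
// locked RMW of a stack slot (emitLockedStackOp) over a full fence, and for
// weaker orderings it degenerates to a compiler-only MEMBARRIER.
#include <atomic>
static void idempotentRmwSketch(std::atomic<unsigned> &Word) {
  // Result ignored and RHS == 0: behaves purely as an ordering constraint.
  Word.fetch_or(0u, std::memory_order_seq_cst);
}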
32606
32607static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32608 const X86Subtarget &Subtarget) {
32609 auto *Node = cast<AtomicSDNode>(Op.getNode());
32610 SDLoc dl(Node);
32611 EVT VT = Node->getMemoryVT();
32612
32613 bool IsSeqCst =
32614 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32615 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32616
32617 // If this store is not sequentially consistent and the type is legal
32618 // we can just keep it.
32619 if (!IsSeqCst && IsTypeLegal)
32620 return Op;
32621
32622 if (VT == MVT::i64 && !IsTypeLegal) {
32623 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32624 // is enabled.
32625 bool NoImplicitFloatOps =
32626 DAG.getMachineFunction().getFunction().hasFnAttribute(
32627 Attribute::NoImplicitFloat);
32628 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32629 SDValue Chain;
32630 if (Subtarget.hasSSE1()) {
32631 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
32632 Node->getOperand(2));
32633 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32634 SclToVec = DAG.getBitcast(StVT, SclToVec);
32635 SDVTList Tys = DAG.getVTList(MVT::Other);
32636 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32637 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32638 MVT::i64, Node->getMemOperand());
32639 } else if (Subtarget.hasX87()) {
32640 // First load this into an 80-bit X87 register using a stack temporary.
32641 // This will put the whole integer into the significand.
32642 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32643 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32644 MachinePointerInfo MPI =
32645 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32646 Chain =
32647 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
32648 MPI, MaybeAlign(), MachineMemOperand::MOStore);
32649 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32650 SDValue LdOps[] = {Chain, StackPtr};
32651 SDValue Value = DAG.getMemIntrinsicNode(
32652 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
32653 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
32654 Chain = Value.getValue(1);
32655
32656 // Now use an FIST to do the atomic store.
32657 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32658 Chain =
32659 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
32660 StoreOps, MVT::i64, Node->getMemOperand());
32661 }
32662
32663 if (Chain) {
32664 // If this is a sequentially consistent store, also emit an appropriate
32665 // barrier.
32666 if (IsSeqCst)
32667 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
32668
32669 return Chain;
32670 }
32671 }
32672 }
32673
32674 // Convert seq_cst store -> xchg
32675 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32676 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32677 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
32678 Node->getMemoryVT(),
32679 Node->getOperand(0),
32680 Node->getOperand(1), Node->getOperand(2),
32681 Node->getMemOperand());
32682 return Swap.getValue(1);
32683}
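// Editor's note: illustrative only (a source-level analogue, not taken from
// this file). A sequentially consistent atomic store is rewritten above into
// an ATOMIC_SWAP whose loaded result is dropped, so it is typically selected
// as a single implicitly locked XCHG instead of a MOV followed by a fence;
// weaker stores of legal types are left untouched.
#include <atomic>
static void atomicStoreSketch(std::atomic<long> &Slot, long V) {
  Slot.store(V, std::memory_order_seq_cst);  // commonly becomes an XCHG
  Slot.store(V, std::memory_order_release);  // stays a plain MOV store
}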
32684
32685static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
32686 SDNode *N = Op.getNode();
32687 MVT VT = N->getSimpleValueType(0);
32688 unsigned Opc = Op.getOpcode();
32689
32690 // Let legalize expand this if it isn't a legal type yet.
32691 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32692 return SDValue();
32693
32694 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
32695 SDLoc DL(N);
32696
32697 // Set the carry flag.
32698 SDValue Carry = Op.getOperand(2);
32699 EVT CarryVT = Carry.getValueType();
32700 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
32701 Carry, DAG.getAllOnesConstant(DL, CarryVT));
32702
32703 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
32704 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
32705 Op.getOperand(0), Op.getOperand(1),
32706 Carry.getValue(1));
32707
32708 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
32709 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
32710 Sum.getValue(1), DL, DAG);
32711 if (N->getValueType(1) == MVT::i1)
32712 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
32713
32714 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32715}
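// Editor's note: a scalar sketch (type and helper names are mine) of the
// carry chain that the ADDCARRY/SUBCARRY lowering enables -- the low halves
// add with ADD, and the high halves consume the carry flag with ADC (or the
// borrow with SBB for subtraction).
#include <cstdint>
struct U128Sketch { uint64_t Lo, Hi; };
static U128Sketch add128Sketch(U128Sketch A, U128Sketch B) {
  U128Sketch R;
  R.Lo = A.Lo + B.Lo;
  uint64_t CarryOut = R.Lo < A.Lo;  // carry out of the low 64-bit add
  R.Hi = A.Hi + B.Hi + CarryOut;    // the ADC-shaped add of the high halves
  return R;
}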
32716
32717static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
32718 SelectionDAG &DAG) {
32719   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
32720
32721 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
32722 // which returns the values as { float, float } (in XMM0) or
32723 // { double, double } (which is returned in XMM0, XMM1).
32724 SDLoc dl(Op);
32725 SDValue Arg = Op.getOperand(0);
32726 EVT ArgVT = Arg.getValueType();
32727 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
32728
32729 TargetLowering::ArgListTy Args;
32730 TargetLowering::ArgListEntry Entry;
32731
32732 Entry.Node = Arg;
32733 Entry.Ty = ArgTy;
32734 Entry.IsSExt = false;
32735 Entry.IsZExt = false;
32736 Args.push_back(Entry);
32737
32738 bool isF64 = ArgVT == MVT::f64;
32739 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
32740 // the small struct {f32, f32} is returned in (eax, edx). For f64,
32741 // the results are returned via SRet in memory.
32742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32743 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
32744 const char *LibcallName = TLI.getLibcallName(LC);
32745 SDValue Callee =
32746 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
32747
32748 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
32749 : (Type *)FixedVectorType::get(ArgTy, 4);
32750
32751 TargetLowering::CallLoweringInfo CLI(DAG);
32752 CLI.setDebugLoc(dl)
32753 .setChain(DAG.getEntryNode())
32754 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
32755
32756 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
32757
32758 if (isF64)
32759 // Returned in xmm0 and xmm1.
32760 return CallResult.first;
32761
32762   // Returned in bits 0:31 and 32:63 of xmm0.
32763 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
32764 CallResult.first, DAG.getIntPtrConstant(0, dl));
32765 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
32766 CallResult.first, DAG.getIntPtrConstant(1, dl));
32767 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
32768 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
32769}
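// Editor's note: a hedged source-level sketch of what reaches this lowering.
// When both the sine and cosine of the same argument are needed, the pair of
// libcalls can be merged into the single __sincos_stret entry point described
// in the comments above, which hands back both results in vector registers.
#include <cmath>
static void sinCosSketch(float X, float &S, float &C) {
  S = std::sin(X);  // sin and cos of the same X ...
  C = std::cos(X);  // ... are candidates for one combined __sincos_stret call
}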
32770
32771/// Widen a vector input to a vector of NVT. The
32772/// input vector must have the same element type as NVT.
32773static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
32774 bool FillWithZeroes = false) {
32775 // Check if InOp already has the right width.
32776 MVT InVT = InOp.getSimpleValueType();
32777 if (InVT == NVT)
32778 return InOp;
32779
32780 if (InOp.isUndef())
32781 return DAG.getUNDEF(NVT);
32782
32783   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
32784          "input and widen element type must match");
32785
32786 unsigned InNumElts = InVT.getVectorNumElements();
32787 unsigned WidenNumElts = NVT.getVectorNumElements();
32788   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
32789          "Unexpected request for vector widening");
32790
32791 SDLoc dl(InOp);
32792 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
32793 InOp.getNumOperands() == 2) {
32794 SDValue N1 = InOp.getOperand(1);
32795 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
32796 N1.isUndef()) {
32797 InOp = InOp.getOperand(0);
32798 InVT = InOp.getSimpleValueType();
32799 InNumElts = InVT.getVectorNumElements();
32800 }
32801 }
32802 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
32803 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
32804 SmallVector<SDValue, 16> Ops;
32805 for (unsigned i = 0; i < InNumElts; ++i)
32806 Ops.push_back(InOp.getOperand(i));
32807
32808 EVT EltVT = InOp.getOperand(0).getValueType();
32809
32810 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
32811 DAG.getUNDEF(EltVT);
32812 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
32813 Ops.push_back(FillVal);
32814 return DAG.getBuildVector(NVT, dl, Ops);
32815 }
32816 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
32817 DAG.getUNDEF(NVT);
32818 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
32819 InOp, DAG.getIntPtrConstant(0, dl));
32820}
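// Editor's note: a scalar-flavoured sketch (an array stands in for a vector)
// of what ExtendToType produces for a constant build_vector -- keep the
// original lanes and pad up to the wider element count, with zeroes when
// FillWithZeroes is set and with don't-care values otherwise (zero is just
// one legal choice for "undef" here).
#include <array>
#include <cstdint>
static std::array<uint32_t, 16> widenV4ToV16Sketch(const std::array<uint32_t, 4> &In) {
  std::array<uint32_t, 16> Out{};       // padding lanes value-initialized to 0
  for (unsigned I = 0; I != In.size(); ++I)
    Out[I] = In[I];                     // original lanes stay at the same index
  return Out;
}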
32821
32822static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
32823 SelectionDAG &DAG) {
32824   assert(Subtarget.hasAVX512() &&
32825          "MGATHER/MSCATTER are supported on AVX-512 arch only");
32826
32827 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
32828 SDValue Src = N->getValue();
32829 MVT VT = Src.getSimpleValueType();
32830   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
32831 SDLoc dl(Op);
32832
32833 SDValue Scale = N->getScale();
32834 SDValue Index = N->getIndex();
32835 SDValue Mask = N->getMask();
32836 SDValue Chain = N->getChain();
32837 SDValue BasePtr = N->getBasePtr();
32838
32839 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
32840     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32841 // If the index is v2i64 and we have VLX we can use xmm for data and index.
32842 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
32843 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32844 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
32845 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
32846 SDVTList VTs = DAG.getVTList(MVT::Other);
32847 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32848 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32849 N->getMemoryVT(), N->getMemOperand());
32850 }
32851 return SDValue();
32852 }
32853
32854 MVT IndexVT = Index.getSimpleValueType();
32855
32856 // If the index is v2i32, we're being called by type legalization and we
32857 // should just let the default handling take care of it.
32858 if (IndexVT == MVT::v2i32)
32859 return SDValue();
32860
32861   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
32862 // need to widen until one is.
32863 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32864 !Index.getSimpleValueType().is512BitVector()) {
32865 // Determine how much we need to widen by to get a 512-bit type.
32866 unsigned Factor = std::min(512/VT.getSizeInBits(),
32867 512/IndexVT.getSizeInBits());
32868 unsigned NumElts = VT.getVectorNumElements() * Factor;
32869
32870 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32871 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32872 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32873
32874 Src = ExtendToType(Src, VT, DAG);
32875 Index = ExtendToType(Index, IndexVT, DAG);
32876 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32877 }
32878
32879 SDVTList VTs = DAG.getVTList(MVT::Other);
32880 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32881 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32882 N->getMemoryVT(), N->getMemOperand());
32883}
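// Editor's note: a worked example (the concrete types are my own picks) of
// the widening factor used above for scatters without VLX. With v4f32 data
// (128 bits) and v4i64 indices (256 bits), Factor = min(512/128, 512/256) = 2,
// so both widen to 8 elements: v8f32 data, 512-bit v8i64 indices, and a v8i1
// mask whose extra lanes are zeroed so no extra stores happen.
#include <algorithm>
static unsigned scatterWidenFactorSketch(unsigned DataBits, unsigned IndexBits) {
  return std::min(512u / DataBits, 512u / IndexBits);
}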
32884
32885static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32886 SelectionDAG &DAG) {
32887
32888 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32889 MVT VT = Op.getSimpleValueType();
32890 MVT ScalarVT = VT.getScalarType();
32891 SDValue Mask = N->getMask();
32892 MVT MaskVT = Mask.getSimpleValueType();
32893 SDValue PassThru = N->getPassThru();
32894 SDLoc dl(Op);
32895
32896 // Handle AVX masked loads which don't support passthru other than 0.
32897 if (MaskVT.getVectorElementType() != MVT::i1) {
32898 // We also allow undef in the isel pattern.
32899 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32900 return Op;
32901
32902 SDValue NewLoad = DAG.getMaskedLoad(
32903 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32904 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32905 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32906 N->isExpandingLoad());
32907 // Emit a blend.
32908 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32909 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32910 }
32911
32912   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32913          "Expanding masked load is supported on AVX-512 target only!");
32914
32915   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32916          "Expanding masked load is supported for 32 and 64-bit types only!");
32917
32918   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32919          "Cannot lower masked load op.");
32920
32921   assert((ScalarVT.getSizeInBits() >= 32 ||
32922           (Subtarget.hasBWI() &&
32923            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32924          "Unsupported masked load op.");
32925
32926   // This operation is legal for targets with VLX, but without
32927   // VLX the vector should be widened to 512 bits.
32928 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32929 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32930 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32931
32932 // Mask element has to be i1.
32933   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32934          "Unexpected mask type");
32935
32936 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32937
32938 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32939 SDValue NewLoad = DAG.getMaskedLoad(
32940 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32941 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32942 N->getExtensionType(), N->isExpandingLoad());
32943
32944 SDValue Extract =
32945 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32946 DAG.getIntPtrConstant(0, dl));
32947 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32948 return DAG.getMergeValues(RetOps, dl);
32949}
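// Editor's note: an intrinsic-level sketch (my assumption about equivalent
// user code, not taken from this file) of the AVX1/AVX2 path above: the
// hardware masked load writes zero into disabled lanes, so a non-zero
// passthru has to be blended back in with the same mask afterwards, which is
// what the VSELECT emitted above does.
#include <immintrin.h>
static __m256 maskedLoadWithPassThruSketch(const float *Ptr, __m256i Mask,
                                           __m256 PassThru) {
  __m256 Loaded = _mm256_maskload_ps(Ptr, Mask);  // disabled lanes come back 0
  return _mm256_blendv_ps(PassThru, Loaded, _mm256_castsi256_ps(Mask));
}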
32950
32951static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32952 SelectionDAG &DAG) {
32953 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32954 SDValue DataToStore = N->getValue();
32955 MVT VT = DataToStore.getSimpleValueType();
32956 MVT ScalarVT = VT.getScalarType();
32957 SDValue Mask = N->getMask();
32958 SDLoc dl(Op);
32959
32960   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32961          "Expanding masked load is supported on AVX-512 target only!");
32962
32963   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32964          "Expanding masked load is supported for 32 and 64-bit types only!");
32965
32966   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32967          "Cannot lower masked store op.");
32968
32969   assert((ScalarVT.getSizeInBits() >= 32 ||
32970           (Subtarget.hasBWI() &&
32971            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32972          "Unsupported masked store op.");
32973
32974   // This operation is legal for targets with VLX, but without
32975   // VLX the vector should be widened to 512 bits.
32976 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32977 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32978
32979 // Mask element has to be i1.
32980   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32981          "Unexpected mask type");
32982
32983 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32984
32985 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32986 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32987 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32988 N->getOffset(), Mask, N->getMemoryVT(),
32989 N->getMemOperand(), N->getAddressingMode(),
32990 N->isTruncatingStore(), N->isCompressingStore());
32991}
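// Editor's note: a tiny worked example (assumed element types) of the
// widening arithmetic above for masked stores without VLX; the mask is
// widened with zeroes so the padding lanes are never written to memory.
static constexpr unsigned wideStoreEltsSketch(unsigned ScalarBits) {
  return 512 / ScalarBits;
}
static_assert(wideStoreEltsSketch(32) == 16, "v8i32 data widens to v16i32");
static_assert(wideStoreEltsSketch(64) == 8, "v4i64 data widens to v8i64");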
32992
32993static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32994 SelectionDAG &DAG) {
32995   assert(Subtarget.hasAVX2() &&
32996          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32997
32998 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32999 SDLoc dl(Op);
33000 MVT VT = Op.getSimpleValueType();
33001 SDValue Index = N->getIndex();
33002 SDValue Mask = N->getMask();
33003 SDValue PassThru = N->getPassThru();
33004 MVT IndexVT = Index.getSimpleValueType();
33005
33006   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33007
33008 // If the index is v2i32, we're being called by type legalization.
33009 if (IndexVT == MVT::v2i32)
33010 return SDValue();
33011
33012   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33013 // need to widen until one is.
33014 MVT OrigVT = VT;
33015 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33016 !IndexVT.is512BitVector()) {
33017 // Determine how much we need to widen by to get a 512-bit type.
33018 unsigned Factor = std::min(512/VT.getSizeInBits(),
33019 512/IndexVT.getSizeInBits());
33020
33021 unsigned NumElts = VT.getVectorNumElements() * Factor;
33022
33023 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33024 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33025 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33026
33027 PassThru = ExtendToType(PassThru, VT, DAG);
33028 Index = ExtendToType(Index, IndexVT, DAG);
33029 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33030 }
33031
33032 // Break dependency on the data register.
33033 if (PassThru.isUndef())
33034 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33035
33036 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33037 N->getScale() };
33038 SDValue NewGather = DAG.getMemIntrinsicNode(
33039 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33040 N->getMemOperand());
33041 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33042 NewGather, DAG.getIntPtrConstant(0, dl));
33043 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33044}
33045
33046static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33047 SDLoc dl(Op);
33048 SDValue Src = Op.getOperand(0);
33049 MVT DstVT = Op.getSimpleValueType();
33050
33051 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33052 unsigned SrcAS = N->getSrcAddressSpace();
33053
33054   assert(SrcAS != N->getDestAddressSpace() &&
33055          "addrspacecast must be between different address spaces");
33056
33057 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33058 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33059 } else if (DstVT == MVT::i64) {
33060 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33061 } else if (DstVT == MVT::i32) {
33062 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33063 } else {
33064 report_fatal_error("Bad address space in addrspacecast");
33065 }
33066 return Op;
33067}
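// Editor's note: a hedged scalar sketch (helper names are mine) of the three
// cases above for casts between 32-bit and 64-bit pointer address spaces: an
// unsigned 32-bit pointer widens with a zero extension, a signed one with a
// sign extension, and narrowing to 32 bits is a plain truncation.
#include <cstdint>
static uint64_t widenUnsignedPtrSketch(uint32_t P) { return P; }        // zext
static uint64_t widenSignedPtrSketch(uint32_t P) {
  return (uint64_t)(int64_t)(int32_t)P;                                 // sext
}
static uint32_t narrowPtrSketch(uint64_t P) { return (uint32_t)P; }     // trunc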
33068
33069SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33070 SelectionDAG &DAG) const {
33071 // TODO: Eventually, the lowering of these nodes should be informed by or
33072 // deferred to the GC strategy for the function in which they appear. For
33073 // now, however, they must be lowered to something. Since they are logically
33074 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33075 // require special handling for these nodes), lower them as literal NOOPs for
33076 // the time being.
33077 SmallVector<SDValue, 2> Ops;
33078 Ops.push_back(Op.getOperand(0));
33079 if (Op->getGluedNode())
33080 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33081
33082 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33083 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33084}
33085
33086// Custom split CVTPS2PH with wide types.
33087static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33088 SDLoc dl(Op);
33089 EVT VT = Op.getValueType();
33090 SDValue Lo, Hi;
33091 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33092 EVT LoVT, HiVT;
33093 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33094 SDValue RC = Op.getOperand(1);
33095 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33096 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33097 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33098}
33099
33100static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33101 unsigned OpNo) {
33102 const APInt Operand(32, OpNo);
33103 std::string OpNoStr = llvm::toString(Operand, 10, false);
33104 std::string Str(" $");
33105
33106 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33107 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33108
33109 auto I = StringRef::npos;
33110 for (auto &AsmStr : AsmStrs) {
33111     // Match the OpNo string exactly, so that we don't match a sub-string;
33112     // e.g. "$12" contains "$1".
33113 if (AsmStr.endswith(OpNoStr1))
33114 I = AsmStr.size() - OpNoStr1.size();
33115
33116 // Get the index of operand in AsmStr.
33117 if (I == StringRef::npos)
33118 I = AsmStr.find(OpNoStr1 + ",");
33119 if (I == StringRef::npos)
33120 I = AsmStr.find(OpNoStr2);
33121
33122 if (I == StringRef::npos)
33123 continue;
33124
33125     assert(I > 0 && "Unexpected inline asm string!");
33126     // Remove the operand string and label (if it exists).
33127 // For example:
33128 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33129 // ==>
33130 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33131 // ==>
33132 // "call dword ptr "
33133 auto TmpStr = AsmStr.substr(0, I);
33134 I = TmpStr.rfind(':');
33135 if (I == StringRef::npos)
33136 return TmpStr;
33137
33138     assert(I < TmpStr.size() && "Unexpected inline asm string!");
33139 auto Asm = TmpStr.drop_front(I + 1);
33140 return Asm;
33141 }
33142
33143 return StringRef();
33144}
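// Editor's note: a simplified walk-through (std::string stands in for
// StringRef, and the boundary handling is approximate) of what the function
// above extracts. For OpNo == 0 and the asm string
//   ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
// it locates the " ${0:" operand reference, drops it and everything after it,
// then drops the MS-asm label up to the last ':' and returns roughly
// "call dword ptr", which isInlineAsmTargetBranch below checks for "call".
#include <string>
static std::string instrPrefixSketch(const std::string &AsmStr,
                                     const std::string &OpTok) {  // e.g. " ${0:"
  std::string::size_type I = AsmStr.find(OpTok);
  if (I == std::string::npos)
    return std::string();
  std::string Tmp = AsmStr.substr(0, I);          // strip the operand reference
  std::string::size_type Colon = Tmp.rfind(':');  // strip any leading label
  return Colon == std::string::npos ? Tmp : Tmp.substr(Colon + 1);
}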
33145
33146bool X86TargetLowering::isInlineAsmTargetBranch(
33147 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33148 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33149
33150 if (InstrStr.contains("call"))
33151 return true;
33152
33153 return false;
33154}
33155
33156/// Provide custom lowering hooks for some operations.
33157SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33158 switch (Op.getOpcode()) {
33159   default: llvm_unreachable("Should not custom lower this!");
33160 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33161 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33162 return LowerCMP_SWAP(Op, Subtarget, DAG);
33163 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33164 case ISD::ATOMIC_LOAD_ADD:
33165 case ISD::ATOMIC_LOAD_SUB:
33166 case ISD::ATOMIC_LOAD_OR:
33167 case ISD::ATOMIC_LOAD_XOR:
33168 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33169 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33170 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33171 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33172 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33173 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33174 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33175 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33176 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33177 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33178 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33179 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33180 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33181 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33182 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33183 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33184 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33185 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33186 case ISD::SHL_PARTS:
33187 case ISD::SRA_PARTS:
33188 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33189 case ISD::FSHL:
33190 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33191 case ISD::STRICT_SINT_TO_FP:
33192 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33193 case ISD::STRICT_UINT_TO_FP:
33194 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33195 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33196 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33197 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33198 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33199 case ISD::ZERO_EXTEND_VECTOR_INREG:
33200 case ISD::SIGN_EXTEND_VECTOR_INREG:
33201 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33202 case ISD::FP_TO_SINT:
33203 case ISD::STRICT_FP_TO_SINT:
33204 case ISD::FP_TO_UINT:
33205 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33206 case ISD::FP_TO_SINT_SAT:
33207 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33208 case ISD::FP_EXTEND:
33209 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33210 case ISD::FP_ROUND:
33211 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33212 case ISD::FP16_TO_FP:
33213 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33214 case ISD::FP_TO_FP16:
33215 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33216 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33217 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33218 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33219 case ISD::FADD:
33220 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33221 case ISD::FROUND: return LowerFROUND(Op, DAG);
33222 case ISD::FABS:
33223 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33224 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33225 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33226 case ISD::LRINT:
33227 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33228 case ISD::SETCC:
33229 case ISD::STRICT_FSETCC:
33230 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33231 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33232 case ISD::SELECT: return LowerSELECT(Op, DAG);
33233 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33234 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33235 case ISD::VASTART: return LowerVASTART(Op, DAG);
33236 case ISD::VAARG: return LowerVAARG(Op, DAG);
33237 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33238 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33239 case ISD::INTRINSIC_VOID:
33240 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33241 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33242 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33243 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33244 case ISD::FRAME_TO_ARGS_OFFSET:
33245 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33246 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33247 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33248 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33249 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33250 case ISD::EH_SJLJ_SETUP_DISPATCH:
33251 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33252 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33253 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33254 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33255 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33256 case ISD::CTLZ:
33257 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33258 case ISD::CTTZ:
33259 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33260 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33261 case ISD::MULHS:
33262 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33263 case ISD::ROTL:
33264 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33265 case ISD::SRA:
33266 case ISD::SRL:
33267 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33268 case ISD::SADDO:
33269 case ISD::UADDO:
33270 case ISD::SSUBO:
33271 case ISD::USUBO: return LowerXALUO(Op, DAG);
33272 case ISD::SMULO:
33273 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33274 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33275 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33276 case ISD::SADDO_CARRY:
33277 case ISD::SSUBO_CARRY:
33278 case ISD::ADDCARRY:
33279 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33280 case ISD::ADD:
33281 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33282 case ISD::UADDSAT:
33283 case ISD::SADDSAT:
33284 case ISD::USUBSAT:
33285 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33286 case ISD::SMAX:
33287 case ISD::SMIN:
33288 case ISD::UMAX:
33289 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33290 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33291 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33292 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33293 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33294 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33295 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33296 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33297 case ISD::GC_TRANSITION_START:
33298 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33299 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33300 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33301 }
33302}
33303
33304/// Replace a node with an illegal result type with a new node built out of
33305/// custom code.
33306void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33307 SmallVectorImpl<SDValue>&Results,
33308 SelectionDAG &DAG) const {
33309 SDLoc dl(N);
33310 switch (N->getOpcode()) {
33311 default:
33312#ifndef NDEBUG
33313 dbgs() << "ReplaceNodeResults: ";
33314 N->dump(&DAG);
33315#endif
33316     llvm_unreachable("Do not know how to custom type legalize this operation!");
33317 case X86ISD::CVTPH2PS: {
33318 EVT VT = N->getValueType(0);
33319 SDValue Lo, Hi;
33320 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33321 EVT LoVT, HiVT;
33322 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33323 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33324 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33325 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33326 Results.push_back(Res);
33327 return;
33328 }
33329 case X86ISD::STRICT_CVTPH2PS: {
33330 EVT VT = N->getValueType(0);
33331 SDValue Lo, Hi;
33332 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33333 EVT LoVT, HiVT;
33334 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33335 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33336 {N->getOperand(0), Lo});
33337 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33338 {N->getOperand(0), Hi});
33339 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33340 Lo.getValue(1), Hi.getValue(1));
33341 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33342 Results.push_back(Res);
33343 Results.push_back(Chain);
33344 return;
33345 }
33346 case X86ISD::CVTPS2PH:
33347 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33348 return;
33349 case ISD::CTPOP: {
33350     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33351 // Use a v2i64 if possible.
33352 bool NoImplicitFloatOps =
33353 DAG.getMachineFunction().getFunction().hasFnAttribute(
33354 Attribute::NoImplicitFloat);
33355 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33356 SDValue Wide =
33357 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33358 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33359       // Bit count should fit in 32 bits; extract it as that and then zero
33360 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33361 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33362 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33363 DAG.getIntPtrConstant(0, dl));
33364 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33365 Results.push_back(Wide);
33366 }
33367 return;
33368 }
33369 case ISD::MUL: {
33370 EVT VT = N->getValueType(0);
33371     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33372            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33373 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33374 // elements are needed.
33375 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33376 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33377 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33378 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33379 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33380 unsigned NumConcats = 16 / VT.getVectorNumElements();
33381 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33382 ConcatOps[0] = Res;
33383 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33384 Results.push_back(Res);
33385 return;
33386 }
33387 case ISD::SMULO:
33388 case ISD::UMULO: {
33389 EVT VT = N->getValueType(0);
33390     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33391            VT == MVT::v2i32 && "Unexpected VT!");
33392 bool IsSigned = N->getOpcode() == ISD::SMULO;
33393 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33394 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33395 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33396 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33397 // Extract the high 32 bits from each result using PSHUFD.
33398 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33399 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33400 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33401 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33402 DAG.getIntPtrConstant(0, dl));
33403
33404 // Truncate the low bits of the result. This will become PSHUFD.
33405 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33406
33407 SDValue HiCmp;
33408 if (IsSigned) {
33409 // SMULO overflows if the high bits don't match the sign of the low.
33410 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33411 } else {
33412 // UMULO overflows if the high bits are non-zero.
33413 HiCmp = DAG.getConstant(0, dl, VT);
33414 }
33415 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33416
33417     // Widen the result by padding with undef.
33418 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33419 DAG.getUNDEF(VT));
33420 Results.push_back(Res);
33421 Results.push_back(Ovf);
33422 return;
33423 }
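// Editor's note: a scalar sketch (helper names are mine) of the overflow
// tests used above after the widened 64-bit multiplies: unsigned
// multiplication overflows when the high 32 bits of the product are non-zero,
// and signed multiplication overflows when the high 32 bits are not the sign
// extension of the low 32 bits.
#include <cstdint>
static bool umulo32Sketch(uint32_t A, uint32_t B, uint32_t &Lo) {
  uint64_t P = (uint64_t)A * B;
  Lo = (uint32_t)P;
  return (uint32_t)(P >> 32) != 0;          // HiCmp == 0 in the DAG code above
}
static bool smulo32Sketch(int32_t A, int32_t B, int32_t &Lo) {
  int64_t P = (int64_t)A * B;
  Lo = (int32_t)P;
  return (int32_t)(P >> 32) != (Lo >> 31);  // high half vs. sign of the low half
}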
33424 case X86ISD::VPMADDWD: {
33425 // Legalize types for X86ISD::VPMADDWD by widening.
33426     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33427
33428 EVT VT = N->getValueType(0);
33429 EVT InVT = N->getOperand(0).getValueType();
33430     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33431            "Expected a VT that divides into 128 bits.");
33432     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33433            "Unexpected type action!");
33434 unsigned NumConcat = 128 / InVT.getSizeInBits();
33435
33436 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33437 InVT.getVectorElementType(),
33438 NumConcat * InVT.getVectorNumElements());
33439 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33440 VT.getVectorElementType(),
33441 NumConcat * VT.getVectorNumElements());
33442
33443 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33444 Ops[0] = N->getOperand(0);
33445 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33446 Ops[0] = N->getOperand(1);
33447 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33448
33449 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
33450 Results.push_back(Res);
33451 return;
33452 }
33453 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33454 case X86ISD::FMINC:
33455 case X86ISD::FMIN:
33456 case X86ISD::FMAXC:
33457 case X86ISD::FMAX: {
33458 EVT VT = N->getValueType(0);
33459 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33460 SDValue UNDEF = DAG.getUNDEF(VT);
33461 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33462 N->getOperand(0), UNDEF);
33463 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33464 N->getOperand(1), UNDEF);
33465 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
33466 return;
33467 }
33468 case ISD::SDIV:
33469 case ISD::UDIV:
33470 case ISD::SREM:
33471 case ISD::UREM: {
33472 EVT VT = N->getValueType(0);
33473 if (VT.isVector()) {
33474 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33475 "Unexpected type action!");
33476 // If this RHS is a constant splat vector we can widen this and let
33477 // division/remainder by constant optimize it.
33478 // TODO: Can we do something for non-splat?
33479 APInt SplatVal;
33480 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33481 unsigned NumConcats = 128 / VT.getSizeInBits();
33482 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33483 Ops0[0] = N->getOperand(0);
33484 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33485 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33486 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33487 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
33488 Results.push_back(Res);
33489 }
33490 return;
33491 }
33492
33493 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33494 Results.push_back(V);
33495 return;
33496 }
33497 case ISD::TRUNCATE: {
33498 MVT VT = N->getSimpleValueType(0);
33499 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33500 return;
33501
33502 // The generic legalizer will try to widen the input type to the same
33503 // number of elements as the widened result type. But this isn't always
33504 // the best thing so do some custom legalization to avoid some cases.
33505 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33506 SDValue In = N->getOperand(0);
33507 EVT InVT = In.getValueType();
33508
33509 unsigned InBits = InVT.getSizeInBits();
33510 if (128 % InBits == 0) {
33511 // 128 bit and smaller inputs should avoid truncate altogether and
33512 // just use a build_vector that will become a shuffle.
33513 // TODO: Widen and use a shuffle directly?
33514 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
33515 EVT EltVT = VT.getVectorElementType();
33516 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33517 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
33518 // Use the original element count so we don't do more scalar opts than
33519 // necessary.
33520 unsigned MinElts = VT.getVectorNumElements();
33521 for (unsigned i=0; i < MinElts; ++i) {
33522 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
33523 DAG.getIntPtrConstant(i, dl));
33524 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
33525 }
33526 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
33527 return;
33528 }
33529 // With AVX512 there are some cases that can use a target specific
33530 // truncate node to go from 256/512 to less than 128 with zeros in the
33531 // upper elements of the 128 bit result.
33532 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
33533 // We can use VTRUNC directly for 256 bits with VLX or for any 512.
33534 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
33535 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33536 return;
33537 }
33538 // There's one case we can widen to 512 bits and use VTRUNC.
33539 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
33540 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
33541 DAG.getUNDEF(MVT::v4i64));
33542 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33543 return;
33544 }
33545 }
33546 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
33547 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
33548 isTypeLegal(MVT::v4i64)) {
33549 // Input needs to be split and output needs to be widened. Let's use two
33550 // VTRUNCs, and shuffle their results together into the wider type.
33551 SDValue Lo, Hi;
33552 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
33553
33554 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
33555 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
33556 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
33557 { 0, 1, 2, 3, 16, 17, 18, 19,
33558 -1, -1, -1, -1, -1, -1, -1, -1 });
33559 Results.push_back(Res);
33560 return;
33561 }
33562
33563 return;
33564 }
33565 case ISD::ANY_EXTEND:
33566 // Right now, only MVT::v8i8 has Custom action for an illegal type.
33567 // It's intended to custom handle the input type.
33568 assert(N->getValueType(0) == MVT::v8i8 &&
33569 "Do not know how to legalize this Node");
33570 return;
33571 case ISD::SIGN_EXTEND:
33572 case ISD::ZERO_EXTEND: {
33573 EVT VT = N->getValueType(0);
33574 SDValue In = N->getOperand(0);
33575 EVT InVT = In.getValueType();
33576 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
33577 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
33578 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33579 "Unexpected type action!");
33580 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
33581 // Custom split this so we can extend i8/i16->i32 invec. This is better
33582 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33583 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
33584 // we allow the sra from the extend to i32 to be shared by the split.
33585 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
33586
33587 // Fill a vector with sign bits for each element.
33588 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
33589 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
33590
33591 // Create an unpackl and unpackh to interleave the sign bits then bitcast
33592 // to v2i64.
33593 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33594 {0, 4, 1, 5});
33595 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
33596 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33597 {2, 6, 3, 7});
33598 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
33599
33600 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33601 Results.push_back(Res);
33602 return;
33603 }
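// Illustrative aside (not part of the LLVM sources): the unpackl/unpackh of the value with its
// SETGT sign mask above simply pairs each 32-bit lane with a copy of its sign word to form the
// sign-extended 64-bit lanes. A scalar sketch of that idea, with a hypothetical helper name and
// assuming arithmetic right shift of signed values:

#include <cstdint>

static int64_t SExt32To64ViaSignWord(int32_t V) {
  // 0 when V >= 0, 0xFFFFFFFF when V < 0 -- the analogue of the SETGT(0, V) mask.
  uint32_t SignWord = (uint32_t)(V >> 31);
  // Interleave: low word is the value, high word is the sign mask.
  return (int64_t)(((uint64_t)SignWord << 32) | (uint32_t)V);
}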
33604
33605 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
33606 if (!InVT.is128BitVector()) {
33607 // Not a 128 bit vector, but maybe type legalization will promote
33608 // it to 128 bits.
33609 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
33610 return;
33611 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
33612 if (!InVT.is128BitVector())
33613 return;
33614
33615 // Promote the input to 128 bits. Type legalization will turn this into
33616 // zext_inreg/sext_inreg.
33617 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
33618 }
33619
33620 // Perform custom splitting instead of the two stage extend we would get
33621 // by default.
33622 EVT LoVT, HiVT;
33623 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33624 assert(isTypeLegal(LoVT) && "Split VT not legal?");
33625
33626 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
33627
33628 // We need to shift the input over by half the number of elements.
33629 unsigned NumElts = InVT.getVectorNumElements();
33630 unsigned HalfNumElts = NumElts / 2;
33631 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
33632 for (unsigned i = 0; i != HalfNumElts; ++i)
33633 ShufMask[i] = i + HalfNumElts;
33634
33635 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
33636 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
33637
33638 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33639 Results.push_back(Res);
33640 }
33641 return;
33642 }
33643 case ISD::FP_TO_SINT:
33644 case ISD::STRICT_FP_TO_SINT:
33645 case ISD::FP_TO_UINT:
33646 case ISD::STRICT_FP_TO_UINT: {
33647 bool IsStrict = N->isStrictFPOpcode();
33648 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
33649 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
33650 EVT VT = N->getValueType(0);
33651 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33652 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33653 EVT SrcVT = Src.getValueType();
33654
33655 SDValue Res;
33656 if (isSoftFP16(SrcVT)) {
33657 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
33658 if (IsStrict) {
33659 Res =
33660 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
33661 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
33662 {NVT, MVT::Other}, {Chain, Src})});
33663 Chain = Res.getValue(1);
33664 } else {
33665 Res = DAG.getNode(N->getOpcode(), dl, VT,
33666 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
33667 }
33668 Results.push_back(Res);
33669 if (IsStrict)
33670 Results.push_back(Chain);
33671
33672 return;
33673 }
33674
33675 if (VT.isVector() && Subtarget.hasFP16() &&
33676 SrcVT.getVectorElementType() == MVT::f16) {
33677 EVT EleVT = VT.getVectorElementType();
33678 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
33679
33680 if (SrcVT != MVT::v8f16) {
33681 SDValue Tmp =
33682 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
33683 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
33684 Ops[0] = Src;
33685 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
33686 }
33687
33688 if (IsStrict) {
33689 unsigned Opc =
33690 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33691 Res =
33692 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33693 Chain = Res.getValue(1);
33694 } else {
33695 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33696 Res = DAG.getNode(Opc, dl, ResVT, Src);
33697 }
33698
33699 // TODO: Need to add exception check code for strict FP.
33700 if (EleVT.getSizeInBits() < 16) {
33701 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
33702 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
33703
33704 // Now widen to 128 bits.
33705 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
33706 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
33707 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
33708 ConcatOps[0] = Res;
33709 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33710 }
33711
33712 Results.push_back(Res);
33713 if (IsStrict)
33714 Results.push_back(Chain);
33715
33716 return;
33717 }
33718
33719 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
33720 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33721 "Unexpected type action!");
33722
33723 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
33724 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
33725 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
33726 VT.getVectorNumElements());
33727 SDValue Res;
33728 SDValue Chain;
33729 if (IsStrict) {
33730 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
33731 {N->getOperand(0), Src});
33732 Chain = Res.getValue(1);
33733 } else
33734 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
33735
33736 // Preserve what we know about the size of the original result. If the
33737 // result is v2i32, we have to manually widen the assert.
33738 if (PromoteVT == MVT::v2i32)
33739 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33740 DAG.getUNDEF(MVT::v2i32));
33741
33742 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33743 Res.getValueType(), Res,
33744 DAG.getValueType(VT.getVectorElementType()));
33745
33746 if (PromoteVT == MVT::v2i32)
33747 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33748 DAG.getIntPtrConstant(0, dl));
33749
33750 // Truncate back to the original width.
33751 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33752
33753 // Now widen to 128 bits.
33754 unsigned NumConcats = 128 / VT.getSizeInBits();
33755 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
33756 VT.getVectorNumElements() * NumConcats);
33757 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33758 ConcatOps[0] = Res;
33759 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33760 Results.push_back(Res);
33761 if (IsStrict)
33762 Results.push_back(Chain);
33763 return;
33764 }
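// Illustrative aside (not part of the LLVM sources): the promote-convert-truncate shape above
// mirrors what a scalar lowering of fp->i8/i16 does when the hardware only converts to 32-bit
// integers. A minimal sketch with a hypothetical helper name:

#include <cstdint>

static int8_t FpToI8ViaI32(float F) {
  int32_t Wide = (int32_t)F; // hardware-friendly fp -> i32 convert (the PromoteVT step above)
  return (int8_t)Wide;       // truncate back down to the requested narrow result type
}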
33765
33766
33767 if (VT == MVT::v2i32) {
33768 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33769 "Strict unsigned conversion requires AVX512");
33770 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33771 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33772 "Unexpected type action!");
33773 if (Src.getValueType() == MVT::v2f64) {
33774 if (!IsSigned && !Subtarget.hasAVX512()) {
33775 SDValue Res =
33776 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33777 Results.push_back(Res);
33778 return;
33779 }
33780
33781 unsigned Opc;
33782 if (IsStrict)
33783 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33784 else
33785 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33786
33787 // If we have VLX we can emit a target specific FP_TO_UINT node.
33788 if (!IsSigned && !Subtarget.hasVLX()) {
33789 // Otherwise we can defer to the generic legalizer which will widen
33790 // the input as well. This will be further widened during op
33791 // legalization to v8i32<-v8f64.
33792 // For strict nodes we'll need to widen ourselves.
33793 // FIXME: Fix the type legalizer to safely widen strict nodes?
33794 if (!IsStrict)
33795 return;
33796 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33797 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33798 Opc = N->getOpcode();
33799 }
33800 SDValue Res;
33801 SDValue Chain;
33802 if (IsStrict) {
33803 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33804 {N->getOperand(0), Src});
33805 Chain = Res.getValue(1);
33806 } else {
33807 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33808 }
33809 Results.push_back(Res);
33810 if (IsStrict)
33811 Results.push_back(Chain);
33812 return;
33813 }
33814
33815 // Custom widen strict v2f32->v2i32 by padding with zeros.
33816 // FIXME: Should generic type legalizer do this?
33817 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33818 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33819 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33820 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
33821 {N->getOperand(0), Src});
33822 Results.push_back(Res);
33823 Results.push_back(Res.getValue(1));
33824 return;
33825 }
33826
33827 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33828 // so early out here.
33829 return;
33830 }
33831
33832 assert(!VT.isVector() && "Vectors should have been handled above!");
33833
33834 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33835 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33836 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33837 assert(!Subtarget.is64Bit() && "i64 should be legal");
33838 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33839 // If we use a 128-bit result we might need to use a target specific node.
33840 unsigned SrcElts =
33841 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33842 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33843 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33844 unsigned Opc = N->getOpcode();
33845 if (NumElts != SrcElts) {
33846 if (IsStrict)
33847 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33848 else
33849 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33850 }
33851
33852 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
33853 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33854 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33855 ZeroIdx);
33856 SDValue Chain;
33857 if (IsStrict) {
33858 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33859 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33860 Chain = Res.getValue(1);
33861 } else
33862 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33863 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33864 Results.push_back(Res);
33865 if (IsStrict)
33866 Results.push_back(Chain);
33867 return;
33868 }
33869
33870 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
33871 SDValue Chain;
33872 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
33873 Results.push_back(V);
33874 if (IsStrict)
33875 Results.push_back(Chain);
33876 return;
33877 }
33878
33879 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
33880 Results.push_back(V);
33881 if (IsStrict)
33882 Results.push_back(Chain);
33883 }
33884 return;
33885 }
33886 case ISD::LRINT:
33887 case ISD::LLRINT: {
33888 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
33889 Results.push_back(V);
33890 return;
33891 }
33892
33893 case ISD::SINT_TO_FP:
33894 case ISD::STRICT_SINT_TO_FP:
33895 case ISD::UINT_TO_FP:
33896 case ISD::STRICT_UINT_TO_FP: {
33897 bool IsStrict = N->isStrictFPOpcode();
33898 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
33899 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
33900 EVT VT = N->getValueType(0);
33901 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33902 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
33903 Subtarget.hasVLX()) {
33904 if (Src.getValueType().getVectorElementType() == MVT::i16)
33905 return;
33906
33907 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
33908 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33909 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
33910 : DAG.getUNDEF(MVT::v2i32));
33911 if (IsStrict) {
33912 unsigned Opc =
33913 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
33914 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
33915 {N->getOperand(0), Src});
33916 Results.push_back(Res);
33917 Results.push_back(Res.getValue(1));
33918 } else {
33919 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33920 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
33921 }
33922 return;
33923 }
33924 if (VT != MVT::v2f32)
33925 return;
33926 EVT SrcVT = Src.getValueType();
33927 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
33928 if (IsStrict) {
33929 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
33930 : X86ISD::STRICT_CVTUI2P;
33931 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33932 {N->getOperand(0), Src});
33933 Results.push_back(Res);
33934 Results.push_back(Res.getValue(1));
33935 } else {
33936 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33937 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33938 }
33939 return;
33940 }
33941 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33942 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
33943 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33944 SDValue One = DAG.getConstant(1, dl, SrcVT);
33945 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33946 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33947 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33948 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33949 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
33950 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33951 for (int i = 0; i != 2; ++i) {
33952 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33953 SignSrc, DAG.getIntPtrConstant(i, dl));
33954 if (IsStrict)
33955 SignCvts[i] =
33956 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33957 {N->getOperand(0), Elt});
33958 else
33959 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33960 };
33961 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33962 SDValue Slow, Chain;
33963 if (IsStrict) {
33964 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33965 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33966 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33967 {Chain, SignCvt, SignCvt});
33968 Chain = Slow.getValue(1);
33969 } else {
33970 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33971 }
33972 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33973 IsNeg =
33974 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33975 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33976 Results.push_back(Cvt);
33977 if (IsStrict)
33978 Results.push_back(Chain);
33979 return;
33980 }
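// Illustrative aside (not part of the LLVM sources): the Sign/IsNeg/Slow dance above is the
// classic halve-and-double trick for converting an unsigned 64-bit value to float when only
// signed conversions are available. A scalar sketch of the per-element logic, with a
// hypothetical helper name, assuming two's-complement arithmetic:

#include <cstdint>

static float U64ToF32ViaSignedConvert(uint64_t V) {
  if ((int64_t)V >= 0)
    return (float)(int64_t)V;           // fits in a signed convert directly
  uint64_t Halved = (V >> 1) | (V & 1); // shift right, keep the low bit "sticky" so rounding stays correct
  float F = (float)(int64_t)Halved;     // now non-negative, so the signed convert is safe
  return F + F;                         // doubling restores the magnitude (the FADD of SignCvt above)
}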
33981
33982 if (SrcVT != MVT::v2i32)
33983 return;
33984
33985 if (IsSigned || Subtarget.hasAVX512()) {
33986 if (!IsStrict)
33987 return;
33988
33989 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33990 // FIXME: Should generic type legalizer do this?
33991 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33992 DAG.getConstant(0, dl, MVT::v2i32));
33993 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33994 {N->getOperand(0), Src});
33995 Results.push_back(Res);
33996 Results.push_back(Res.getValue(1));
33997 return;
33998 }
33999
34000 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34001 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34002 SDValue VBias =
34003 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
34004 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34005 DAG.getBitcast(MVT::v2i64, VBias));
34006 Or = DAG.getBitcast(MVT::v2f64, Or);
34007 if (IsStrict) {
34008 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34009 {N->getOperand(0), Or, VBias});
34010 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34011 {MVT::v4f32, MVT::Other},
34012 {Sub.getValue(1), Sub});
34013 Results.push_back(Res);
34014 Results.push_back(Res.getValue(1));
34015 } else {
34016 // TODO: Are there any fast-math-flags to propagate here?
34017 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34018 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34019 }
34020 return;
34021 }
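// Illustrative aside (not part of the LLVM sources): the VBias constant just above is the bit
// pattern of 2^52; OR-ing a 32-bit unsigned value into the low mantissa bits and subtracting
// 2^52 yields that value exactly as a double. A standalone scalar sketch of the same trick,
// with a hypothetical helper name:

#include <cstdint>
#include <cstring>

static double Uint32ToDoubleViaBias(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 with X placed in the low 32 mantissa bits
  double D;
  std::memcpy(&D, &Bits, sizeof(D));         // reinterpret the bit pattern as a double
  return D - 4503599627370496.0;             // subtract 2^52 to recover (double)X exactly
}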
34022 case ISD::STRICT_FP_ROUND:
34023 case ISD::FP_ROUND: {
34024 bool IsStrict = N->isStrictFPOpcode();
34025 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34026 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34027 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34028 EVT SrcVT = Src.getValueType();
34029 EVT VT = N->getValueType(0);
34030 SDValue V;
34031 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34032 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34033 : DAG.getUNDEF(MVT::v2f32);
34034 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34035 }
34036 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34037 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34038 if (SrcVT.getVectorElementType() != MVT::f32)
34039 return;
34040
34041 if (IsStrict)
34042 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34043 {Chain, Src, Rnd});
34044 else
34045 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34046
34047 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34048 if (IsStrict)
34049 Results.push_back(V.getValue(1));
34050 return;
34051 }
34052 if (!isTypeLegal(Src.getValueType()))
34053 return;
34054 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34055 if (IsStrict)
34056 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34057 {Chain, Src});
34058 else
34059 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34060 Results.push_back(V);
34061 if (IsStrict)
34062 Results.push_back(V.getValue(1));
34063 return;
34064 }
34065 case ISD::FP_EXTEND:
34066 case ISD::STRICT_FP_EXTEND: {
34067 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34068 // No other ValueType for FP_EXTEND should reach this point.
34069 assert(N->getValueType(0) == MVT::v2f32 &&
34070 "Do not know how to legalize this Node");
34071 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34072 return;
34073 bool IsStrict = N->isStrictFPOpcode();
34074 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34075 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34076 : DAG.getUNDEF(MVT::v2f16);
34077 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34078 if (IsStrict)
34079 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34080 {N->getOperand(0), V});
34081 else
34082 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34083 Results.push_back(V);
34084 if (IsStrict)
34085 Results.push_back(V.getValue(1));
34086 return;
34087 }
34088 case ISD::INTRINSIC_W_CHAIN: {
34089 unsigned IntNo = N->getConstantOperandVal(1);
34090 switch (IntNo) {
34091 default : llvm_unreachable("Do not know how to custom type "
34092 "legalize this intrinsic operation!");
34093 case Intrinsic::x86_rdtsc:
34094 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34095 Results);
34096 case Intrinsic::x86_rdtscp:
34097 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34098 Results);
34099 case Intrinsic::x86_rdpmc:
34100 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34101 Results);
34102 return;
34103 case Intrinsic::x86_rdpru:
34104 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34105 Results);
34106 return;
34107 case Intrinsic::x86_xgetbv:
34108 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34109 Results);
34110 return;
34111 }
34112 }
34113 case ISD::READCYCLECOUNTER: {
34114 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34115 }
34116 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34117 EVT T = N->getValueType(0);
34118 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34119 bool Regs64bit = T == MVT::i128;
34120 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34121 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34122 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34123 SDValue cpInL, cpInH;
34124 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
34125 DAG.getConstant(0, dl, HalfT));
34126 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
34127 DAG.getConstant(1, dl, HalfT));
34128 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34129 Regs64bit ? X86::RAX : X86::EAX,
34130 cpInL, SDValue());
34131 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
34132 Regs64bit ? X86::RDX : X86::EDX,
34133 cpInH, cpInL.getValue(1));
34134 SDValue swapInL, swapInH;
34135 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
34136 DAG.getConstant(0, dl, HalfT));
34137 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
34138 DAG.getConstant(1, dl, HalfT));
34139 swapInH =
34140 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34141 swapInH, cpInH.getValue(1));
34142
34143 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34144 // until later. So we keep the RBX input in a vreg and use a custom
34145 // inserter.
34146 // Since RBX will be a reserved register the register allocator will not
34147 // make sure its value will be properly saved and restored around this
34148 // live-range.
34149 SDValue Result;
34150 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34151 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34152 if (Regs64bit) {
34153 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34154 swapInH.getValue(1)};
34155 Result =
34156 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34157 } else {
34158 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34159 swapInH.getValue(1));
34160 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34161 swapInL.getValue(1)};
34162 Result =
34163 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34164 }
34165
34166 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34167 Regs64bit ? X86::RAX : X86::EAX,
34168 HalfT, Result.getValue(1));
34169 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34170 Regs64bit ? X86::RDX : X86::EDX,
34171 HalfT, cpOutL.getValue(2));
34172 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34173
34174 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34175 MVT::i32, cpOutH.getValue(2));
34176 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34177 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34178
34179 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34180 Results.push_back(Success);
34181 Results.push_back(EFLAGS.getValue(1));
34182 return;
34183 }
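// Illustrative aside (not part of the LLVM sources): the EXTRACT_ELEMENT / BUILD_PAIR plumbing
// above only splits the wide compare-exchange operands into register-sized halves (EDX:EAX /
// ECX:EBX for CMPXCHG8B, RDX:RAX / RCX:RBX for CMPXCHG16B) and reassembles the result. A scalar
// sketch of that split and rebuild, with hypothetical helper names:

#include <cstdint>

static void SplitU64(uint64_t V, uint32_t &Lo, uint32_t &Hi) {
  Lo = (uint32_t)V;         // EXTRACT_ELEMENT ..., 0 -> the low-half register
  Hi = (uint32_t)(V >> 32); // EXTRACT_ELEMENT ..., 1 -> the high-half register
}

static uint64_t BuildPairU64(uint32_t Lo, uint32_t Hi) {
  return ((uint64_t)Hi << 32) | Lo; // BUILD_PAIR of the two copied-out halves
}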
34184 case ISD::ATOMIC_LOAD: {
34185 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34186 bool NoImplicitFloatOps =
34187 DAG.getMachineFunction().getFunction().hasFnAttribute(
34188 Attribute::NoImplicitFloat);
34189 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34190 auto *Node = cast<AtomicSDNode>(N);
34191 if (Subtarget.hasSSE1()) {
34192 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34193 // Then extract the lower 64-bits.
34194 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34195 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34196 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34197 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34198 MVT::i64, Node->getMemOperand());
34199 if (Subtarget.hasSSE2()) {
34200 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34201 DAG.getIntPtrConstant(0, dl));
34202 Results.push_back(Res);
34203 Results.push_back(Ld.getValue(1));
34204 return;
34205 }
34206 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34207 // then casts to i64. This avoids a 128-bit stack temporary being
34208 // created by type legalization if we were to cast v4f32->v2i64.
34209 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34210 DAG.getIntPtrConstant(0, dl));
34211 Res = DAG.getBitcast(MVT::i64, Res);
34212 Results.push_back(Res);
34213 Results.push_back(Ld.getValue(1));
34214 return;
34215 }
34216 if (Subtarget.hasX87()) {
34217 // First load this into an 80-bit X87 register. This will put the whole
34218 // integer into the significand.
34219 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34220 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34221 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34222 dl, Tys, Ops, MVT::i64,
34223 Node->getMemOperand());
34224 SDValue Chain = Result.getValue(1);
34225
34226 // Now store the X87 register to a stack temporary and convert to i64.
34227 // This store is not atomic and doesn't need to be.
34228 // FIXME: We don't need a stack temporary if the result of the load
34229 // is already being stored. We could just directly store there.
34230 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34231 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34232 MachinePointerInfo MPI =
34233 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34234 SDValue StoreOps[] = { Chain, Result, StackPtr };
34235 Chain = DAG.getMemIntrinsicNode(
34236 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34237 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34238
34239 // Finally load the value back from the stack temporary and return it.
34240 // This load is not atomic and doesn't need to be.
34241 // This load will be further type legalized.
34242 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34243 Results.push_back(Result);
34244 Results.push_back(Result.getValue(1));
34245 return;
34246 }
34247 }
34248 // TODO: Use MOVLPS when SSE1 is available?
34249 // Delegate to generic TypeLegalization. Situations we can really handle
34250 // should have already been dealt with by AtomicExpandPass.cpp.
34251 break;
34252 }
34253 case ISD::ATOMIC_SWAP:
34254 case ISD::ATOMIC_LOAD_ADD:
34255 case ISD::ATOMIC_LOAD_SUB:
34256 case ISD::ATOMIC_LOAD_AND:
34257 case ISD::ATOMIC_LOAD_OR:
34258 case ISD::ATOMIC_LOAD_XOR:
34259 case ISD::ATOMIC_LOAD_NAND:
34260 case ISD::ATOMIC_LOAD_MIN:
34261 case ISD::ATOMIC_LOAD_MAX:
34262 case ISD::ATOMIC_LOAD_UMIN:
34263 case ISD::ATOMIC_LOAD_UMAX:
34264 // Delegate to generic TypeLegalization. Situations we can really handle
34265 // should have already been dealt with by AtomicExpandPass.cpp.
34266 break;
34267
34268 case ISD::BITCAST: {
34269 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34270 EVT DstVT = N->getValueType(0);
34271 EVT SrcVT = N->getOperand(0).getValueType();
34272
34273 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34274 // we can split using the k-register rather than memory.
34275 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34276 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34277 SDValue Lo, Hi;
34278 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34279 Lo = DAG.getBitcast(MVT::i32, Lo);
34280 Hi = DAG.getBitcast(MVT::i32, Hi);
34281 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34282 Results.push_back(Res);
34283 return;
34284 }
34285
34286 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34287 // FIXME: Use v4f32 for SSE1?
34288 assert(Subtarget.hasSSE2() && "Requires SSE2");
34289 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34290 "Unexpected type action!");
34291 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34292 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34293 N->getOperand(0));
34294 Res = DAG.getBitcast(WideVT, Res);
34295 Results.push_back(Res);
34296 return;
34297 }
34298
34299 return;
34300 }
34301 case ISD::MGATHER: {
34302 EVT VT = N->getValueType(0);
34303 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34304 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34305 auto *Gather = cast<MaskedGatherSDNode>(N);
34306 SDValue Index = Gather->getIndex();
34307 if (Index.getValueType() != MVT::v2i64)
34308 return;
34309 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34310 "Unexpected type action!");
34311 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34312 SDValue Mask = Gather->getMask();
34313 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34314 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34315 Gather->getPassThru(),
34316 DAG.getUNDEF(VT));
34317 if (!Subtarget.hasVLX()) {
34318 // We need to widen the mask, but the instruction will only use 2
34319 // of its elements. So we can use undef.
34320 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34321 DAG.getUNDEF(MVT::v2i1));
34322 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34323 }
34324 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34325 Gather->getBasePtr(), Index, Gather->getScale() };
34326 SDValue Res = DAG.getMemIntrinsicNode(
34327 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34328 Gather->getMemoryVT(), Gather->getMemOperand());
34329 Results.push_back(Res);
34330 Results.push_back(Res.getValue(1));
34331 return;
34332 }
34333 return;
34334 }
34335 case ISD::LOAD: {
34336 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34337 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34338 // cast since type legalization will try to use an i64 load.
34339 MVT VT = N->getSimpleValueType(0);
34340 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34341 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34342 "Unexpected type action!");
34343 if (!ISD::isNON_EXTLoad(N))
34344 return;
34345 auto *Ld = cast<LoadSDNode>(N);
34346 if (Subtarget.hasSSE2()) {
34347 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34348 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34349 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34350 Ld->getMemOperand()->getFlags());
34351 SDValue Chain = Res.getValue(1);
34352 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34353 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34354 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34355 Res = DAG.getBitcast(WideVT, Res);
34356 Results.push_back(Res);
34357 Results.push_back(Chain);
34358 return;
34359 }
34360 assert(Subtarget.hasSSE1() && "Expected SSE");
34361 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34362 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34363 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34364 MVT::i64, Ld->getMemOperand());
34365 Results.push_back(Res);
34366 Results.push_back(Res.getValue(1));
34367 return;
34368 }
34369 case ISD::ADDRSPACECAST: {
34370 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34371 Results.push_back(V);
34372 return;
34373 }
34374 case ISD::BITREVERSE: {
34375 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34376 assert(Subtarget.hasXOP() && "Expected XOP");
34377 // We can use VPPERM by copying to a vector register and back. We'll need
34378 // to move the scalar in two i32 pieces.
34379 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34380 return;
34381 }
34382 case ISD::EXTRACT_VECTOR_ELT: {
34383 // f16 = extract vXf16 %vec, i64 %idx
34384 assert(N->getSimpleValueType(0) == MVT::f16 &&
34385 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34386 assert(Subtarget.hasFP16() && "Expected FP16");
34387 SDValue VecOp = N->getOperand(0);
34388 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34389 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34390 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34391 N->getOperand(1));
34392 Split = DAG.getBitcast(MVT::f16, Split);
34393 Results.push_back(Split);
34394 return;
34395 }
34396 }
34397}
34398
34399const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34400 switch ((X86ISD::NodeType)Opcode) {
34401 case X86ISD::FIRST_NUMBER: break;
34402#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34403 NODE_NAME_CASE(BSF)
34404 NODE_NAME_CASE(BSR)
34405 NODE_NAME_CASE(FSHL)
34406 NODE_NAME_CASE(FSHR)
34407 NODE_NAME_CASE(FAND)
34408 NODE_NAME_CASE(FANDN)
34409 NODE_NAME_CASE(FOR)
34410 NODE_NAME_CASE(FXOR)
34411 NODE_NAME_CASE(FILD)
34412 NODE_NAME_CASE(FIST)
34413 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34414 NODE_NAME_CASE(FLD)
34415 NODE_NAME_CASE(FST)
34416 NODE_NAME_CASE(CALL)
34417 NODE_NAME_CASE(CALL_RVMARKER)
34418 NODE_NAME_CASE(BT)
34419 NODE_NAME_CASE(CMP)
34420 NODE_NAME_CASE(FCMP)
34421 NODE_NAME_CASE(STRICT_FCMP)
34422 NODE_NAME_CASE(STRICT_FCMPS)
34423 NODE_NAME_CASE(COMI)
34424 NODE_NAME_CASE(UCOMI)
34425 NODE_NAME_CASE(CMPM)
34426 NODE_NAME_CASE(CMPMM)
34427 NODE_NAME_CASE(STRICT_CMPM)
34428 NODE_NAME_CASE(CMPMM_SAE)
34429 NODE_NAME_CASE(SETCC)
34430 NODE_NAME_CASE(SETCC_CARRY)
34431 NODE_NAME_CASE(FSETCC)
34432 NODE_NAME_CASE(FSETCCM)
34433 NODE_NAME_CASE(FSETCCM_SAE)
34434 NODE_NAME_CASE(CMOV)
34435 NODE_NAME_CASE(BRCOND)
34436 NODE_NAME_CASE(RET_FLAG)
34437 NODE_NAME_CASE(IRET)
34438 NODE_NAME_CASE(REP_STOS)
34439 NODE_NAME_CASE(REP_MOVS)
34440 NODE_NAME_CASE(GlobalBaseReg)
34441 NODE_NAME_CASE(Wrapper)
34442 NODE_NAME_CASE(WrapperRIP)
34443 NODE_NAME_CASE(MOVQ2DQ)
34444 NODE_NAME_CASE(MOVDQ2Q)
34445 NODE_NAME_CASE(MMX_MOVD2W)
34446 NODE_NAME_CASE(MMX_MOVW2D)
34447 NODE_NAME_CASE(PEXTRB)
34448 NODE_NAME_CASE(PEXTRW)
34449 NODE_NAME_CASE(INSERTPS)
34450 NODE_NAME_CASE(PINSRB)
34451 NODE_NAME_CASE(PINSRW)
34452 NODE_NAME_CASE(PSHUFB)
34453 NODE_NAME_CASE(ANDNP)
34454 NODE_NAME_CASE(BLENDI)
34455 NODE_NAME_CASE(BLENDV)
34456 NODE_NAME_CASE(HADD)
34457 NODE_NAME_CASE(HSUB)
34458 NODE_NAME_CASE(FHADD)
34459 NODE_NAME_CASE(FHSUB)
34460 NODE_NAME_CASE(CONFLICT)
34461 NODE_NAME_CASE(FMAX)
34462 NODE_NAME_CASE(FMAXS)
34463 NODE_NAME_CASE(FMAX_SAE)
34464 NODE_NAME_CASE(FMAXS_SAE)
34465 NODE_NAME_CASE(FMIN)
34466 NODE_NAME_CASE(FMINS)
34467 NODE_NAME_CASE(FMIN_SAE)
34468 NODE_NAME_CASE(FMINS_SAE)
34469 NODE_NAME_CASE(FMAXC)
34470 NODE_NAME_CASE(FMINC)
34471 NODE_NAME_CASE(FRSQRT)
34472 NODE_NAME_CASE(FRCP)
34473 NODE_NAME_CASE(EXTRQI)
34474 NODE_NAME_CASE(INSERTQI)
34475 NODE_NAME_CASE(TLSADDR)
34476 NODE_NAME_CASE(TLSBASEADDR)
34477 NODE_NAME_CASE(TLSCALL)
34478 NODE_NAME_CASE(EH_SJLJ_SETJMP)
34479 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
34480 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
34481 NODE_NAME_CASE(EH_RETURN)
34482 NODE_NAME_CASE(TC_RETURN)
34483 NODE_NAME_CASE(FNSTCW16m)
34484 NODE_NAME_CASE(FLDCW16m)
34485 NODE_NAME_CASE(LCMPXCHG_DAG)
34486 NODE_NAME_CASE(LCMPXCHG8_DAG)
34487 NODE_NAME_CASE(LCMPXCHG16_DAG)
34488 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
34489 NODE_NAME_CASE(LADD)
34490 NODE_NAME_CASE(LSUB)
34491 NODE_NAME_CASE(LOR)
34492 NODE_NAME_CASE(LXOR)
34493 NODE_NAME_CASE(LAND)
34494 NODE_NAME_CASE(LBTS)
34495 NODE_NAME_CASE(LBTC)
34496 NODE_NAME_CASE(LBTR)
34497 NODE_NAME_CASE(LBTS_RM)
34498 NODE_NAME_CASE(LBTC_RM)
34499 NODE_NAME_CASE(LBTR_RM)
34500 NODE_NAME_CASE(AADD)
34501 NODE_NAME_CASE(AOR)
34502 NODE_NAME_CASE(AXOR)
34503 NODE_NAME_CASE(AAND)
34504 NODE_NAME_CASE(VZEXT_MOVL)
34505 NODE_NAME_CASE(VZEXT_LOAD)
34506 NODE_NAME_CASE(VEXTRACT_STORE)
34507 NODE_NAME_CASE(VTRUNC)
34508 NODE_NAME_CASE(VTRUNCS)
34509 NODE_NAME_CASE(VTRUNCUS)
34510 NODE_NAME_CASE(VMTRUNC)
34511 NODE_NAME_CASE(VMTRUNCS)
34512 NODE_NAME_CASE(VMTRUNCUS)
34513 NODE_NAME_CASE(VTRUNCSTORES)
34514 NODE_NAME_CASE(VTRUNCSTOREUS)
34515 NODE_NAME_CASE(VMTRUNCSTORES)
34516 NODE_NAME_CASE(VMTRUNCSTOREUS)
34517 NODE_NAME_CASE(VFPEXT)
34518 NODE_NAME_CASE(STRICT_VFPEXT)
34519 NODE_NAME_CASE(VFPEXT_SAE)
34520 NODE_NAME_CASE(VFPEXTS)
34521 NODE_NAME_CASE(VFPEXTS_SAE)
34522 NODE_NAME_CASE(VFPROUND)
34523 NODE_NAME_CASE(STRICT_VFPROUND)
34524 NODE_NAME_CASE(VMFPROUND)
34525 NODE_NAME_CASE(VFPROUND_RND)
34526 NODE_NAME_CASE(VFPROUNDS)
34527 NODE_NAME_CASE(VFPROUNDS_RND)
34528 NODE_NAME_CASE(VSHLDQ)
34529 NODE_NAME_CASE(VSRLDQ)
34530 NODE_NAME_CASE(VSHL)
34531 NODE_NAME_CASE(VSRL)
34532 NODE_NAME_CASE(VSRA)
34533 NODE_NAME_CASE(VSHLI)
34534 NODE_NAME_CASE(VSRLI)
34535 NODE_NAME_CASE(VSRAI)
34536 NODE_NAME_CASE(VSHLV)
34537 NODE_NAME_CASE(VSRLV)
34538 NODE_NAME_CASE(VSRAV)
34539 NODE_NAME_CASE(VROTLI)
34540 NODE_NAME_CASE(VROTRI)
34541 NODE_NAME_CASE(VPPERM)
34542 NODE_NAME_CASE(CMPP)
34543 NODE_NAME_CASE(STRICT_CMPP)
34544 NODE_NAME_CASE(PCMPEQ)
34545 NODE_NAME_CASE(PCMPGT)
34546 NODE_NAME_CASE(PHMINPOS)
34547 NODE_NAME_CASE(ADD)
34548 NODE_NAME_CASE(SUB)
34549 NODE_NAME_CASE(ADC)
34550 NODE_NAME_CASE(SBB)
34551 NODE_NAME_CASE(SMUL)
34552 NODE_NAME_CASE(UMUL)
34553 NODE_NAME_CASE(OR)
34554 NODE_NAME_CASE(XOR)
34555 NODE_NAME_CASE(AND)
34556 NODE_NAME_CASE(BEXTR)
34557 NODE_NAME_CASE(BEXTRI)
34558 NODE_NAME_CASE(BZHI)
34559 NODE_NAME_CASE(PDEP)
34560 NODE_NAME_CASE(PEXT)
34561 NODE_NAME_CASE(MUL_IMM)
34562 NODE_NAME_CASE(MOVMSK)
34563 NODE_NAME_CASE(PTEST)
34564 NODE_NAME_CASE(TESTP)
34565 NODE_NAME_CASE(KORTEST)
34566 NODE_NAME_CASE(KTEST)
34567 NODE_NAME_CASE(KADD)
34568 NODE_NAME_CASE(KSHIFTL)
34569 NODE_NAME_CASE(KSHIFTR)
34570 NODE_NAME_CASE(PACKSS)
34571 NODE_NAME_CASE(PACKUS)
34572 NODE_NAME_CASE(PALIGNR)
34573 NODE_NAME_CASE(VALIGN)
34574 NODE_NAME_CASE(VSHLD)
34575 NODE_NAME_CASE(VSHRD)
34576 NODE_NAME_CASE(VSHLDV)
34577 NODE_NAME_CASE(VSHRDV)
34578 NODE_NAME_CASE(PSHUFD)
34579 NODE_NAME_CASE(PSHUFHW)
34580 NODE_NAME_CASE(PSHUFLW)
34581 NODE_NAME_CASE(SHUFP)
34582 NODE_NAME_CASE(SHUF128)
34583 NODE_NAME_CASE(MOVLHPS)
34584 NODE_NAME_CASE(MOVHLPS)
34585 NODE_NAME_CASE(MOVDDUP)
34586 NODE_NAME_CASE(MOVSHDUP)
34587 NODE_NAME_CASE(MOVSLDUP)
34588 NODE_NAME_CASE(MOVSD)
34589 NODE_NAME_CASE(MOVSS)
34590 NODE_NAME_CASE(MOVSH)
34591 NODE_NAME_CASE(UNPCKL)
34592 NODE_NAME_CASE(UNPCKH)
34593 NODE_NAME_CASE(VBROADCAST)
34594 NODE_NAME_CASE(VBROADCAST_LOAD)
34595 NODE_NAME_CASE(VBROADCASTM)
34596 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
34597 NODE_NAME_CASE(VPERMILPV)
34598 NODE_NAME_CASE(VPERMILPI)
34599 NODE_NAME_CASE(VPERM2X128)
34600 NODE_NAME_CASE(VPERMV)
34601 NODE_NAME_CASE(VPERMV3)
34602 NODE_NAME_CASE(VPERMI)
34603 NODE_NAME_CASE(VPTERNLOG)
34604 NODE_NAME_CASE(VFIXUPIMM)
34605 NODE_NAME_CASE(VFIXUPIMM_SAE)
34606 NODE_NAME_CASE(VFIXUPIMMS)
34607 NODE_NAME_CASE(VFIXUPIMMS_SAE)
34608 NODE_NAME_CASE(VRANGE)
34609 NODE_NAME_CASE(VRANGE_SAE)
34610 NODE_NAME_CASE(VRANGES)
34611 NODE_NAME_CASE(VRANGES_SAE)
34612 NODE_NAME_CASE(PMULUDQ)
34613 NODE_NAME_CASE(PMULDQ)
34614 NODE_NAME_CASE(PSADBW)
34615 NODE_NAME_CASE(DBPSADBW)
34616 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
34617 NODE_NAME_CASE(VAARG_64)
34618 NODE_NAME_CASE(VAARG_X32)
34619 NODE_NAME_CASE(DYN_ALLOCA)
34620 NODE_NAME_CASE(MFENCE)
34621 NODE_NAME_CASE(SEG_ALLOCA)
34622 NODE_NAME_CASE(PROBED_ALLOCA)
34623 NODE_NAME_CASE(RDRAND)
34624 NODE_NAME_CASE(RDSEED)
34625 NODE_NAME_CASE(RDPKRU)
34626 NODE_NAME_CASE(WRPKRU)
34627 NODE_NAME_CASE(VPMADDUBSW)
34628 NODE_NAME_CASE(VPMADDWD)
34629 NODE_NAME_CASE(VPSHA)
34630 NODE_NAME_CASE(VPSHL)
34631 NODE_NAME_CASE(VPCOM)
34632 NODE_NAME_CASE(VPCOMU)
34633 NODE_NAME_CASE(VPERMIL2)
34634 NODE_NAME_CASE(FMSUB)
34635 NODE_NAME_CASE(STRICT_FMSUB)
34636 NODE_NAME_CASE(FNMADD)
34637 NODE_NAME_CASE(STRICT_FNMADD)
34638 NODE_NAME_CASE(FNMSUB)
34639 NODE_NAME_CASE(STRICT_FNMSUB)
34640 NODE_NAME_CASE(FMADDSUB)
34641 NODE_NAME_CASE(FMSUBADD)
34642 NODE_NAME_CASE(FMADD_RND)
34643 NODE_NAME_CASE(FNMADD_RND)
34644 NODE_NAME_CASE(FMSUB_RND)
34645 NODE_NAME_CASE(FNMSUB_RND)
34646 NODE_NAME_CASE(FMADDSUB_RND)
34647 NODE_NAME_CASE(FMSUBADD_RND)
34648 NODE_NAME_CASE(VFMADDC)
34649 NODE_NAME_CASE(VFMADDC_RND)
34650 NODE_NAME_CASE(VFCMADDC)
34651 NODE_NAME_CASE(VFCMADDC_RND)
34652 NODE_NAME_CASE(VFMULC)
34653 NODE_NAME_CASE(VFMULC_RND)
34654 NODE_NAME_CASE(VFCMULC)
34655 NODE_NAME_CASE(VFCMULC_RND)
34656 NODE_NAME_CASE(VFMULCSH)
34657 NODE_NAME_CASE(VFMULCSH_RND)
34658 NODE_NAME_CASE(VFCMULCSH)
34659 NODE_NAME_CASE(VFCMULCSH_RND)
34660 NODE_NAME_CASE(VFMADDCSH)
34661 NODE_NAME_CASE(VFMADDCSH_RND)
34662 NODE_NAME_CASE(VFCMADDCSH)
34663 NODE_NAME_CASE(VFCMADDCSH_RND)
34664 NODE_NAME_CASE(VPMADD52H)
34665 NODE_NAME_CASE(VPMADD52L)
34666 NODE_NAME_CASE(VRNDSCALE)
34667 NODE_NAME_CASE(STRICT_VRNDSCALE)
34668 NODE_NAME_CASE(VRNDSCALE_SAE)
34669 NODE_NAME_CASE(VRNDSCALES)
34670 NODE_NAME_CASE(VRNDSCALES_SAE)
34671 NODE_NAME_CASE(VREDUCE)
34672 NODE_NAME_CASE(VREDUCE_SAE)
34673 NODE_NAME_CASE(VREDUCES)
34674 NODE_NAME_CASE(VREDUCES_SAE)
34675 NODE_NAME_CASE(VGETMANT)
34676 NODE_NAME_CASE(VGETMANT_SAE)
34677 NODE_NAME_CASE(VGETMANTS)
34678 NODE_NAME_CASE(VGETMANTS_SAE)
34679 NODE_NAME_CASE(PCMPESTR)
34680 NODE_NAME_CASE(PCMPISTR)
34681 NODE_NAME_CASE(XTEST)
34682 NODE_NAME_CASE(COMPRESS)
34683 NODE_NAME_CASE(EXPAND)
34684 NODE_NAME_CASE(SELECTS)
34685 NODE_NAME_CASE(ADDSUB)
34686 NODE_NAME_CASE(RCP14)
34687 NODE_NAME_CASE(RCP14S)
34688 NODE_NAME_CASE(RCP28)
34689 NODE_NAME_CASE(RCP28_SAE)
34690 NODE_NAME_CASE(RCP28S)
34691 NODE_NAME_CASE(RCP28S_SAE)
34692 NODE_NAME_CASE(EXP2)
34693 NODE_NAME_CASE(EXP2_SAE)
34694 NODE_NAME_CASE(RSQRT14)
34695 NODE_NAME_CASE(RSQRT14S)
34696 NODE_NAME_CASE(RSQRT28)
34697 NODE_NAME_CASE(RSQRT28_SAE)
34698 NODE_NAME_CASE(RSQRT28S)
34699 NODE_NAME_CASE(RSQRT28S_SAE)
34700 NODE_NAME_CASE(FADD_RND)
34701 NODE_NAME_CASE(FADDS)
34702 NODE_NAME_CASE(FADDS_RND)
34703 NODE_NAME_CASE(FSUB_RND)
34704 NODE_NAME_CASE(FSUBS)
34705 NODE_NAME_CASE(FSUBS_RND)
34706 NODE_NAME_CASE(FMUL_RND)
34707 NODE_NAME_CASE(FMULS)
34708 NODE_NAME_CASE(FMULS_RND)
34709 NODE_NAME_CASE(FDIV_RND)
34710 NODE_NAME_CASE(FDIVS)
34711 NODE_NAME_CASE(FDIVS_RND)
34712 NODE_NAME_CASE(FSQRT_RND)
34713 NODE_NAME_CASE(FSQRTS)
34714 NODE_NAME_CASE(FSQRTS_RND)
34715 NODE_NAME_CASE(FGETEXP)
34716 NODE_NAME_CASE(FGETEXP_SAE)
34717 NODE_NAME_CASE(FGETEXPS)
34718 NODE_NAME_CASE(FGETEXPS_SAE)
34719 NODE_NAME_CASE(SCALEF)
34720 NODE_NAME_CASE(SCALEF_RND)
34721 NODE_NAME_CASE(SCALEFS)
34722 NODE_NAME_CASE(SCALEFS_RND)
34723 NODE_NAME_CASE(MULHRS)
34724 NODE_NAME_CASE(SINT_TO_FP_RND)
34725 NODE_NAME_CASE(UINT_TO_FP_RND)
34726 NODE_NAME_CASE(CVTTP2SI)
34727 NODE_NAME_CASE(CVTTP2UI)
34728 NODE_NAME_CASE(STRICT_CVTTP2SI)
34729 NODE_NAME_CASE(STRICT_CVTTP2UI)
34730 NODE_NAME_CASE(MCVTTP2SI)
34731 NODE_NAME_CASE(MCVTTP2UI)
34732 NODE_NAME_CASE(CVTTP2SI_SAE)
34733 NODE_NAME_CASE(CVTTP2UI_SAE)
34734 NODE_NAME_CASE(CVTTS2SI)
34735 NODE_NAME_CASE(CVTTS2UI)
34736 NODE_NAME_CASE(CVTTS2SI_SAE)
34737 NODE_NAME_CASE(CVTTS2UI_SAE)
34738 NODE_NAME_CASE(CVTSI2P)
34739 NODE_NAME_CASE(CVTUI2P)
34740 NODE_NAME_CASE(STRICT_CVTSI2P)
34741 NODE_NAME_CASE(STRICT_CVTUI2P)
34742 NODE_NAME_CASE(MCVTSI2P)
34743 NODE_NAME_CASE(MCVTUI2P)
34744 NODE_NAME_CASE(VFPCLASS)
34745 NODE_NAME_CASE(VFPCLASSS)
34746 NODE_NAME_CASE(MULTISHIFT)
34747 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34748 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34749 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34750 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34751 NODE_NAME_CASE(CVTPS2PH)
34752 NODE_NAME_CASE(STRICT_CVTPS2PH)
34753 NODE_NAME_CASE(CVTPS2PH_SAE)
34754 NODE_NAME_CASE(MCVTPS2PH)
34755 NODE_NAME_CASE(MCVTPS2PH_SAE)
34756 NODE_NAME_CASE(CVTPH2PS)
34757 NODE_NAME_CASE(STRICT_CVTPH2PS)
34758 NODE_NAME_CASE(CVTPH2PS_SAE)
34759 NODE_NAME_CASE(CVTP2SI)
34760 NODE_NAME_CASE(CVTP2UI)
34761 NODE_NAME_CASE(MCVTP2SI)
34762 NODE_NAME_CASE(MCVTP2UI)
34763 NODE_NAME_CASE(CVTP2SI_RND)
34764 NODE_NAME_CASE(CVTP2UI_RND)
34765 NODE_NAME_CASE(CVTS2SI)
34766 NODE_NAME_CASE(CVTS2UI)
34767 NODE_NAME_CASE(CVTS2SI_RND)
34768 NODE_NAME_CASE(CVTS2UI_RND)
34769 NODE_NAME_CASE(CVTNE2PS2BF16)
34770 NODE_NAME_CASE(CVTNEPS2BF16)
34771 NODE_NAME_CASE(MCVTNEPS2BF16)
34772 NODE_NAME_CASE(DPBF16PS)
34773 NODE_NAME_CASE(LWPINS)
34774 NODE_NAME_CASE(MGATHER)
34775 NODE_NAME_CASE(MSCATTER)
34776 NODE_NAME_CASE(VPDPBUSD)
34777 NODE_NAME_CASE(VPDPBUSDS)
34778 NODE_NAME_CASE(VPDPWSSD)
34779 NODE_NAME_CASE(VPDPWSSDS)
34780 NODE_NAME_CASE(VPSHUFBITQMB)
34781 NODE_NAME_CASE(GF2P8MULB)
34782 NODE_NAME_CASE(GF2P8AFFINEQB)
34783 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34784 NODE_NAME_CASE(NT_CALL)
34785 NODE_NAME_CASE(NT_BRIND)
34786 NODE_NAME_CASE(UMWAIT)
34787 NODE_NAME_CASE(TPAUSE)
34788 NODE_NAME_CASE(ENQCMD)
34789 NODE_NAME_CASE(ENQCMDS)
34790 NODE_NAME_CASE(VP2INTERSECT)
34791 NODE_NAME_CASE(VPDPBSUD)
34792 NODE_NAME_CASE(VPDPBSUDS)
34793 NODE_NAME_CASE(VPDPBUUD)
34794 NODE_NAME_CASE(VPDPBUUDS)
34795 NODE_NAME_CASE(VPDPBSSD)
34796 NODE_NAME_CASE(VPDPBSSDS)
34797 NODE_NAME_CASE(AESENC128KL)
34798 NODE_NAME_CASE(AESDEC128KL)
34799 NODE_NAME_CASE(AESENC256KL)
34800 NODE_NAME_CASE(AESDEC256KL)
34801 NODE_NAME_CASE(AESENCWIDE128KL)
34802 NODE_NAME_CASE(AESDECWIDE128KL)
34803 NODE_NAME_CASE(AESENCWIDE256KL)
34804 NODE_NAME_CASE(AESDECWIDE256KL)
34805 NODE_NAME_CASE(CMPCCXADD)
34806 NODE_NAME_CASE(TESTUI)
34807 }
34808 return nullptr;
34809#undef NODE_NAME_CASE
34810}
34811
34812/// Return true if the addressing mode represented by AM is legal for this
34813/// target, for a load/store of the specified type.
34814bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
34815 const AddrMode &AM, Type *Ty,
34816 unsigned AS,
34817 Instruction *I) const {
34818 // X86 supports extremely general addressing modes.
34819 CodeModel::Model M = getTargetMachine().getCodeModel();
34820
34821 // X86 allows a sign-extended 32-bit immediate field as a displacement.
34822 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
34823 return false;
34824
34825 if (AM.BaseGV) {
34826 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
34827
34828 // If a reference to this global requires an extra load, we can't fold it.
34829 if (isGlobalStubReference(GVFlags))
34830 return false;
34831
34832 // If BaseGV requires a register for the PIC base, we cannot also have a
34833 // BaseReg specified.
34834 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
34835 return false;
34836
34837 // If lower 4G is not available, then we must use rip-relative addressing.
34838 if ((M != CodeModel::Small || isPositionIndependent()) &&
34839 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
34840 return false;
34841 }
34842
34843 switch (AM.Scale) {
34844 case 0:
34845 case 1:
34846 case 2:
34847 case 4:
34848 case 8:
34849 // These scales always work.
34850 break;
34851 case 3:
34852 case 5:
34853 case 9:
34854 // These scales are formed with basereg+scalereg. Only accept if there is
34855 // no basereg yet.
34856 if (AM.HasBaseReg)
34857 return false;
34858 break;
34859 default: // Other stuff never works.
34860 return false;
34861 }
34862
34863 return true;
34864}
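
// For illustration, the scale handling above boils down to the following
// predicate (a standalone sketch; isLegalX86Scale is a hypothetical helper,
// not part of this file). Scales 1, 2, 4 and 8 are directly encodable in a
// SIB byte, while 3, 5 and 9 are only reachable as Base + Index*{2,4,8},
// which consumes the base-register slot.
static bool isLegalX86Scale(unsigned Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;        // No index, or a directly encodable SIB scale.
  case 3: case 5: case 9:
    return !HasBaseReg; // Lowered as Base + Index*(Scale-1); base must be free.
  default:
    return false;
  }
}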
34865
34866bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
34867 unsigned Bits = Ty->getScalarSizeInBits();
34868
34869 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
34870 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
34871 if (Subtarget.hasXOP() &&
34872 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
34873 return false;
34874
34875 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
34876 // shifts just as cheap as scalar ones.
34877 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
34878 return false;
34879
34880 // AVX512BW has shifts such as vpsllvw.
34881 if (Subtarget.hasBWI() && Bits == 16)
34882 return false;
34883
34884 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
34885 // fully general vector.
34886 return true;
34887}
34888
34889bool X86TargetLowering::isBinOp(unsigned Opcode) const {
34890 switch (Opcode) {
34891 // These are non-commutative binops.
34892 // TODO: Add more X86ISD opcodes once we have test coverage.
34893 case X86ISD::ANDNP:
34894 case X86ISD::PCMPGT:
34895 case X86ISD::FMAX:
34896 case X86ISD::FMIN:
34897 case X86ISD::FANDN:
34898 case X86ISD::VPSHA:
34899 case X86ISD::VPSHL:
34900 case X86ISD::VSHLV:
34901 case X86ISD::VSRLV:
34902 case X86ISD::VSRAV:
34903 return true;
34904 }
34905
34906 return TargetLoweringBase::isBinOp(Opcode);
34907}
34908
34909bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
34910 switch (Opcode) {
34911 // TODO: Add more X86ISD opcodes once we have test coverage.
34912 case X86ISD::PCMPEQ:
34913 case X86ISD::PMULDQ:
34914 case X86ISD::PMULUDQ:
34915 case X86ISD::FMAXC:
34916 case X86ISD::FMINC:
34917 case X86ISD::FAND:
34918 case X86ISD::FOR:
34919 case X86ISD::FXOR:
34920 return true;
34921 }
34922
34923 return TargetLoweringBase::isCommutativeBinOp(Opcode);
34924}
34925
34926bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
34927 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34928 return false;
34929 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
34930 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
34931 return NumBits1 > NumBits2;
34932}
34933
34934bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34935 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34936 return false;
34937
34938 if (!isTypeLegal(EVT::getEVT(Ty1)))
34939 return false;
34940
34941 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34942
34943 // Assuming the caller doesn't have a zeroext or signext return parameter,
34944 // truncation all the way down to i1 is valid.
34945 return true;
34946}
34947
34948bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
34949 return isInt<32>(Imm);
34950}
34951
34952bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34953 // Can also use sub to handle negated immediates.
34954 return isInt<32>(Imm);
34955}
34956
34957bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34958 return isInt<32>(Imm);
34959}
34960
34961bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34962 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34963 return false;
34964 unsigned NumBits1 = VT1.getSizeInBits();
34965 unsigned NumBits2 = VT2.getSizeInBits();
34966 return NumBits1 > NumBits2;
34967}
34968
34969bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34970 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34971 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34972}
34973
34974bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34975 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34976 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34977}
34978
34979bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34980 EVT VT1 = Val.getValueType();
34981 if (isZExtFree(VT1, VT2))
34982 return true;
34983
34984 if (Val.getOpcode() != ISD::LOAD)
34985 return false;
34986
34987 if (!VT1.isSimple() || !VT1.isInteger() ||
34988 !VT2.isSimple() || !VT2.isInteger())
34989 return false;
34990
34991 switch (VT1.getSimpleVT().SimpleTy) {
34992 default: break;
34993 case MVT::i8:
34994 case MVT::i16:
34995 case MVT::i32:
34996 // X86 has 8, 16, and 32-bit zero-extending loads.
34997 return true;
34998 }
34999
35000 return false;
35001}
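
// Concretely, any 32-bit register write on x86-64 clears bits 63:32, so the
// i32 -> i64 zero extension reported as free above needs at most a plain
// 32-bit register move (a sketch of the effect; zextIsFreeSketch is an
// illustrative name only):
static uint64_t zextIsFreeSketch(uint32_t X) {
  return X; // No explicit extension instruction; the upper half is zeroed by HW.
}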
35002
35003bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35004 SmallVectorImpl<Use *> &Ops) const {
35005 using namespace llvm::PatternMatch;
35006
35007 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35008 if (!VTy)
35009 return false;
35010
35011 if (I->getOpcode() == Instruction::Mul &&
35012 VTy->getElementType()->isIntegerTy(64)) {
35013 for (auto &Op : I->operands()) {
35014 // Make sure we are not already sinking this operand
35015 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35016 continue;
35017
35018 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35019 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35020 if (Subtarget.hasSSE41() &&
35021 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35022 m_SpecificInt(32)))) {
35023 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35024 Ops.push_back(&Op);
35025 } else if (Subtarget.hasSSE2() &&
35026 match(Op.get(),
35027 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35028 Ops.push_back(&Op);
35029 }
35030 }
35031
35032 return !Ops.empty();
35033 }
35034
35035 // A uniform shift amount in a vector shift or funnel shift may be much
35036 // cheaper than a generic variable vector shift, so make that pattern visible
35037 // to SDAG by sinking the shuffle instruction next to the shift.
35038 int ShiftAmountOpNum = -1;
35039 if (I->isShift())
35040 ShiftAmountOpNum = 1;
35041 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35042 if (II->getIntrinsicID() == Intrinsic::fshl ||
35043 II->getIntrinsicID() == Intrinsic::fshr)
35044 ShiftAmountOpNum = 2;
35045 }
35046
35047 if (ShiftAmountOpNum == -1)
35048 return false;
35049
35050 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35051 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35052 isVectorShiftByScalarCheap(I->getType())) {
35053 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35054 return true;
35055 }
35056
35057 return false;
35058}
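
// The two operand shapes matched above correspond, element-wise, to these
// scalar identities (a sketch; the helpers are illustrative and assume the
// usual <cstdint>/<stdint.h> definitions):
static int64_t sextLow32(int64_t X) {   // What (ashr (shl X, 32), 32) computes;
  return (int64_t)(int32_t)X;           // this shape feeds the PMULDQ pattern.
}
static uint64_t zextLow32(uint64_t X) { // What (and X, 0xffffffff) computes;
  return X & UINT64_C(0xffffffff);      // this shape feeds the PMULUDQ pattern.
}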
35059
35060bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35061 if (!Subtarget.is64Bit())
35062 return false;
35063 return TargetLowering::shouldConvertPhiType(From, To);
35064}
35065
35066bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35067 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35068 return false;
35069
35070 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35071
35072 // There is no extending load for vXi1.
35073 if (SrcVT.getScalarType() == MVT::i1)
35074 return false;
35075
35076 return true;
35077}
35078
35079bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35080 EVT VT) const {
35081 if (!Subtarget.hasAnyFMA())
35082 return false;
35083
35084 VT = VT.getScalarType();
35085
35086 if (!VT.isSimple())
35087 return false;
35088
35089 switch (VT.getSimpleVT().SimpleTy) {
35090 case MVT::f16:
35091 return Subtarget.hasFP16();
35092 case MVT::f32:
35093 case MVT::f64:
35094 return true;
35095 default:
35096 break;
35097 }
35098
35099 return false;
35100}
35101
35102bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
35103 // i16 instructions are longer (0x66 prefix) and potentially slower.
35104 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
35105}
35106
35107bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35108 EVT VT) const {
35109 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35110 // benefit. The transform may also be profitable for scalar code.
35111 if (!Subtarget.hasAVX512())
35112 return false;
35113 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35114 return false;
35115 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35116 return false;
35117
35118 return true;
35119}
35120
35121/// Targets can use this to indicate that they only support *some*
35122/// VECTOR_SHUFFLE operations, those with specific masks.
35123/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35124/// are assumed to be legal.
35125bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35126 if (!VT.isSimple())
35127 return false;
35128
35129 // Not for i1 vectors
35130 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35131 return false;
35132
35133 // Very little shuffling can be done for 64-bit vectors right now.
35134 if (VT.getSimpleVT().getSizeInBits() == 64)
35135 return false;
35136
35137 // We only care that the types being shuffled are legal. The lowering can
35138 // handle any possible shuffle mask that results.
35139 return isTypeLegal(VT.getSimpleVT());
35140}
35141
35142bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35143 EVT VT) const {
35144 // Don't convert an 'and' into a shuffle that we don't directly support.
35145 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35146 if (!Subtarget.hasAVX2())
35147 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35148 return false;
35149
35150 // Just delegate to the generic legality, clear masks aren't special.
35151 return isShuffleMaskLegal(Mask, VT);
35152}
35153
35154bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35155 // If the subtarget is using thunks, we need to not generate jump tables.
35156 if (Subtarget.useIndirectThunkBranches())
35157 return false;
35158
35159 // Otherwise, fallback on the generic logic.
35160 return TargetLowering::areJTsAllowed(Fn);
35161}
35162
35163MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35164 EVT ConditionVT) const {
35165 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35166 // zero-extensions.
35167 if (ConditionVT.getSizeInBits() < 32)
35168 return MVT::i32;
35169 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35170 ConditionVT);
35171}
35172
35173//===----------------------------------------------------------------------===//
35174// X86 Scheduler Hooks
35175//===----------------------------------------------------------------------===//
35176
35177 // Returns true if EFLAGS is consumed after this iterator in the rest of the
35178// basic block or any successors of the basic block.
35179static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35180 MachineBasicBlock *BB) {
35181 // Scan forward through BB for a use/def of EFLAGS.
35182 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35183 if (mi.readsRegister(X86::EFLAGS))
35184 return true;
35185 // If we found a def, we can stop searching.
35186 if (mi.definesRegister(X86::EFLAGS))
35187 return false;
35188 }
35189
35190 // If we hit the end of the block, check whether EFLAGS is live into a
35191 // successor.
35192 for (MachineBasicBlock *Succ : BB->successors())
35193 if (Succ->isLiveIn(X86::EFLAGS))
35194 return true;
35195
35196 return false;
35197}
35198
35199/// Utility function to emit xbegin specifying the start of an RTM region.
35200static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35201 const TargetInstrInfo *TII) {
35202 const DebugLoc &DL = MI.getDebugLoc();
35203
35204 const BasicBlock *BB = MBB->getBasicBlock();
35205 MachineFunction::iterator I = ++MBB->getIterator();
35206
35207 // For the v = xbegin(), we generate
35208 //
35209 // thisMBB:
35210 // xbegin sinkMBB
35211 //
35212 // mainMBB:
35213 // s0 = -1
35214 //
35215 // fallBB:
35216 // eax = # XABORT_DEF
35217 // s1 = eax
35218 //
35219 // sinkMBB:
35220 // v = phi(s0/mainBB, s1/fallBB)
35221
35222 MachineBasicBlock *thisMBB = MBB;
35223 MachineFunction *MF = MBB->getParent();
35224 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35225 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35226 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35227 MF->insert(I, mainMBB);
35228 MF->insert(I, fallMBB);
35229 MF->insert(I, sinkMBB);
35230
35231 if (isEFLAGSLiveAfter(MI, MBB)) {
35232 mainMBB->addLiveIn(X86::EFLAGS);
35233 fallMBB->addLiveIn(X86::EFLAGS);
35234 sinkMBB->addLiveIn(X86::EFLAGS);
35235 }
35236
35237 // Transfer the remainder of BB and its successor edges to sinkMBB.
35238 sinkMBB->splice(sinkMBB->begin(), MBB,
35239 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35240 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35241
35242 MachineRegisterInfo &MRI = MF->getRegInfo();
35243 Register DstReg = MI.getOperand(0).getReg();
35244 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35245 Register mainDstReg = MRI.createVirtualRegister(RC);
35246 Register fallDstReg = MRI.createVirtualRegister(RC);
35247
35248 // thisMBB:
35249 // xbegin fallMBB
35250 // # fallthrough to mainMBB
35251 // # abortion to fallMBB
35252 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35253 thisMBB->addSuccessor(mainMBB);
35254 thisMBB->addSuccessor(fallMBB);
35255
35256 // mainMBB:
35257 // mainDstReg := -1
35258 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35259 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35260 mainMBB->addSuccessor(sinkMBB);
35261
35262 // fallMBB:
35263 // ; pseudo instruction to model hardware's definition from XABORT
35264 // EAX := XABORT_DEF
35265 // fallDstReg := EAX
35266 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35267 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35268 .addReg(X86::EAX);
35269 fallMBB->addSuccessor(sinkMBB);
35270
35271 // sinkMBB:
35272 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35273 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35274 .addReg(mainDstReg).addMBB(mainMBB)
35275 .addReg(fallDstReg).addMBB(fallMBB);
35276
35277 MI.eraseFromParent();
35278 return sinkMBB;
35279}
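
// For reference, the value this pseudo produces follows the RTM intrinsic
// contract from <immintrin.h> (a sketch, assuming RTM support; the helper
// name is illustrative and not part of the lowering itself):
static unsigned xbeginResultSketch() {
  unsigned Status = _xbegin();     // mainMBB path: returns _XBEGIN_STARTED.
  if (Status == _XBEGIN_STARTED) {
    _xend();                       // Commit the (empty) transaction right away.
    return -1u;                    // Matches the MOV32ri -1 in mainMBB.
  }
  return Status;                   // fallMBB path: abort code the HW left in EAX.
}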
35280
35281MachineBasicBlock *
35282X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35283 MachineBasicBlock *MBB) const {
35284 // Emit va_arg instruction on X86-64.
35285
35286 // Operands to this pseudo-instruction:
35287 // 0 ) Output : destination address (reg)
35288 // 1-5) Input : va_list address (addr, i64mem)
35289 // 6 ) ArgSize : Size (in bytes) of vararg type
35290 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35291 // 8 ) Align : Alignment of type
35292 // 9 ) EFLAGS (implicit-def)
35293
35294 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35295 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35296
35297 Register DestReg = MI.getOperand(0).getReg();
35298 MachineOperand &Base = MI.getOperand(1);
35299 MachineOperand &Scale = MI.getOperand(2);
35300 MachineOperand &Index = MI.getOperand(3);
35301 MachineOperand &Disp = MI.getOperand(4);
35302 MachineOperand &Segment = MI.getOperand(5);
35303 unsigned ArgSize = MI.getOperand(6).getImm();
35304 unsigned ArgMode = MI.getOperand(7).getImm();
35305 Align Alignment = Align(MI.getOperand(8).getImm());
35306
35307 MachineFunction *MF = MBB->getParent();
35308
35309 // Memory Reference
35310 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35311
35312 MachineMemOperand *OldMMO = MI.memoperands().front();
35313
35314 // Clone the MMO into two separate MMOs for loading and storing
35315 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35316 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35317 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35318 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35319
35320 // Machine Information
35321 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35322 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35323 const TargetRegisterClass *AddrRegClass =
35324 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35325 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35326 const DebugLoc &DL = MI.getDebugLoc();
35327
35328 // struct va_list {
35329 // i32 gp_offset
35330 // i32 fp_offset
35331 // i64 overflow_area (address)
35332 // i64 reg_save_area (address)
35333 // }
35334 // sizeof(va_list) = 24
35335 // alignment(va_list) = 8
35336
35337 unsigned TotalNumIntRegs = 6;
35338 unsigned TotalNumXMMRegs = 8;
35339 bool UseGPOffset = (ArgMode == 1);
35340 bool UseFPOffset = (ArgMode == 2);
35341 unsigned MaxOffset = TotalNumIntRegs * 8 +
35342 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35343
35344 /* Align ArgSize to a multiple of 8 */
35345 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35346 bool NeedsAlign = (Alignment > 8);
35347
35348 MachineBasicBlock *thisMBB = MBB;
35349 MachineBasicBlock *overflowMBB;
35350 MachineBasicBlock *offsetMBB;
35351 MachineBasicBlock *endMBB;
35352
35353 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35354 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35355 unsigned OffsetReg = 0;
35356
35357 if (!UseGPOffset && !UseFPOffset) {
35358 // If we only pull from the overflow region, we don't create a branch.
35359 // We don't need to alter control flow.
35360 OffsetDestReg = 0; // unused
35361 OverflowDestReg = DestReg;
35362
35363 offsetMBB = nullptr;
35364 overflowMBB = thisMBB;
35365 endMBB = thisMBB;
35366 } else {
35367 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35368 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35369 // If not, pull from overflow_area. (branch to overflowMBB)
35370 //
35371 // thisMBB
35372 // | .
35373 // | .
35374 // offsetMBB overflowMBB
35375 // | .
35376 // | .
35377 // endMBB
35378
35379 // Registers for the PHI in endMBB
35380 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35381 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35382
35383 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35384 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35385 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35386 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35387
35388 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35389
35390 // Insert the new basic blocks
35391 MF->insert(MBBIter, offsetMBB);
35392 MF->insert(MBBIter, overflowMBB);
35393 MF->insert(MBBIter, endMBB);
35394
35395 // Transfer the remainder of MBB and its successor edges to endMBB.
35396 endMBB->splice(endMBB->begin(), thisMBB,
35397 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35398 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35399
35400 // Make offsetMBB and overflowMBB successors of thisMBB
35401 thisMBB->addSuccessor(offsetMBB);
35402 thisMBB->addSuccessor(overflowMBB);
35403
35404 // endMBB is a successor of both offsetMBB and overflowMBB
35405 offsetMBB->addSuccessor(endMBB);
35406 overflowMBB->addSuccessor(endMBB);
35407
35408 // Load the offset value into a register
35409 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35410 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
35411 .add(Base)
35412 .add(Scale)
35413 .add(Index)
35414 .addDisp(Disp, UseFPOffset ? 4 : 0)
35415 .add(Segment)
35416 .setMemRefs(LoadOnlyMMO);
35417
35418 // Check if there is enough room left to pull this argument.
35419 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
35420 .addReg(OffsetReg)
35421 .addImm(MaxOffset + 8 - ArgSizeA8);
35422
35423 // Branch to "overflowMBB" if offset >= max
35424 // Fall through to "offsetMBB" otherwise
35425 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
35426 .addMBB(overflowMBB).addImm(X86::COND_AE);
35427 }
35428
35429 // In offsetMBB, emit code to use the reg_save_area.
35430 if (offsetMBB) {
35431 assert(OffsetReg != 0);
35432
35433 // Read the reg_save_area address.
35434 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35435 BuildMI(
35436 offsetMBB, DL,
35437 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35438 RegSaveReg)
35439 .add(Base)
35440 .add(Scale)
35441 .add(Index)
35442 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35443 .add(Segment)
35444 .setMemRefs(LoadOnlyMMO);
35445
35446 if (Subtarget.isTarget64BitLP64()) {
35447 // Zero-extend the offset
35448 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35449 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35450 .addImm(0)
35451 .addReg(OffsetReg)
35452 .addImm(X86::sub_32bit);
35453
35454 // Add the offset to the reg_save_area to get the final address.
35455 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
35456 .addReg(OffsetReg64)
35457 .addReg(RegSaveReg);
35458 } else {
35459 // Add the offset to the reg_save_area to get the final address.
35460 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
35461 .addReg(OffsetReg)
35462 .addReg(RegSaveReg);
35463 }
35464
35465 // Compute the offset for the next argument
35466 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35467 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
35468 .addReg(OffsetReg)
35469 .addImm(UseFPOffset ? 16 : 8);
35470
35471 // Store it back into the va_list.
35472 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
35473 .add(Base)
35474 .add(Scale)
35475 .add(Index)
35476 .addDisp(Disp, UseFPOffset ? 4 : 0)
35477 .add(Segment)
35478 .addReg(NextOffsetReg)
35479 .setMemRefs(StoreOnlyMMO);
35480
35481 // Jump to endMBB
35482 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
35483 .addMBB(endMBB);
35484 }
35485
35486 //
35487 // Emit code to use overflow area
35488 //
35489
35490 // Load the overflow_area address into a register.
35491 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
35492 BuildMI(overflowMBB, DL,
35493 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35494 OverflowAddrReg)
35495 .add(Base)
35496 .add(Scale)
35497 .add(Index)
35498 .addDisp(Disp, 8)
35499 .add(Segment)
35500 .setMemRefs(LoadOnlyMMO);
35501
35502 // If we need to align it, do so. Otherwise, just copy the address
35503 // to OverflowDestReg.
35504 if (NeedsAlign) {
35505 // Align the overflow address
35506 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
35507
35508 // aligned_addr = (addr + (align-1)) & ~(align-1)
35509 BuildMI(
35510 overflowMBB, DL,
35511 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35512 TmpReg)
35513 .addReg(OverflowAddrReg)
35514 .addImm(Alignment.value() - 1);
35515
35516 BuildMI(
35517 overflowMBB, DL,
35518 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35519 OverflowDestReg)
35520 .addReg(TmpReg)
35521 .addImm(~(uint64_t)(Alignment.value() - 1));
35522 } else {
35523 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
35524 .addReg(OverflowAddrReg);
35525 }
35526
35527 // Compute the next overflow address after this argument.
35528 // (the overflow address should be kept 8-byte aligned)
35529 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
35530 BuildMI(
35531 overflowMBB, DL,
35532 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35533 NextAddrReg)
35534 .addReg(OverflowDestReg)
35535 .addImm(ArgSizeA8);
35536
35537 // Store the new overflow address.
35538 BuildMI(overflowMBB, DL,
35539 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35540 .add(Base)
35541 .add(Scale)
35542 .add(Index)
35543 .addDisp(Disp, 8)
35544 .add(Segment)
35545 .addReg(NextAddrReg)
35546 .setMemRefs(StoreOnlyMMO);
35547
35548 // If we branched, emit the PHI to the front of endMBB.
35549 if (offsetMBB) {
35550 BuildMI(*endMBB, endMBB->begin(), DL,
35551 TII->get(X86::PHI), DestReg)
35552 .addReg(OffsetDestReg).addMBB(offsetMBB)
35553 .addReg(OverflowDestReg).addMBB(overflowMBB);
35554 }
35555
35556 // Erase the pseudo instruction
35557 MI.eraseFromParent();
35558
35559 return endMBB;
35560}
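
// The control flow assembled above is the System V AMD64 va_arg algorithm.
// A simplified sketch for an 8-byte, 8-aligned integer argument (ArgMode == 1);
// every name below is illustrative, not part of this file:
struct VaListSketch {
  unsigned GPOffset;     // i32 gp_offset
  unsigned FPOffset;     // i32 fp_offset
  char *OverflowArgArea; // i64 overflow_area
  char *RegSaveArea;     // i64 reg_save_area
};
static void *vaArgGPSketch(VaListSketch &AP) {
  if (AP.GPOffset <= 6 * 8 - 8) {              // Same bound as the CMP32ri above.
    void *Addr = AP.RegSaveArea + AP.GPOffset; // offsetMBB: pull from reg_save_area.
    AP.GPOffset += 8;                          // Bump gp_offset for the next arg.
    return Addr;
  }
  void *Addr = AP.OverflowArgArea;             // overflowMBB: pull from the stack.
  AP.OverflowArgArea += 8;                     // Keep overflow_area 8-byte aligned.
  return Addr;
}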
35561
35562// The EFLAGS operand of SelectItr might be missing a kill marker
35563// because there were multiple uses of EFLAGS, and ISel didn't know
35564// which to mark. Figure out whether SelectItr should have had a
35565// kill marker, and set it if it should. Returns the correct kill
35566// marker value.
35567static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
35568 MachineBasicBlock* BB,
35569 const TargetRegisterInfo* TRI) {
35570 if (isEFLAGSLiveAfter(SelectItr, BB))
35571 return false;
35572
35573 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
35574 // out. SelectMI should have a kill flag on EFLAGS.
35575 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35576 return true;
35577}
35578
35579// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35580// together with other CMOV pseudo-opcodes into a single basic-block with
35581// conditional jump around it.
35582static bool isCMOVPseudo(MachineInstr &MI) {
35583 switch (MI.getOpcode()) {
35584 case X86::CMOV_FR16:
35585 case X86::CMOV_FR16X:
35586 case X86::CMOV_FR32:
35587 case X86::CMOV_FR32X:
35588 case X86::CMOV_FR64:
35589 case X86::CMOV_FR64X:
35590 case X86::CMOV_GR8:
35591 case X86::CMOV_GR16:
35592 case X86::CMOV_GR32:
35593 case X86::CMOV_RFP32:
35594 case X86::CMOV_RFP64:
35595 case X86::CMOV_RFP80:
35596 case X86::CMOV_VR64:
35597 case X86::CMOV_VR128:
35598 case X86::CMOV_VR128X:
35599 case X86::CMOV_VR256:
35600 case X86::CMOV_VR256X:
35601 case X86::CMOV_VR512:
35602 case X86::CMOV_VK1:
35603 case X86::CMOV_VK2:
35604 case X86::CMOV_VK4:
35605 case X86::CMOV_VK8:
35606 case X86::CMOV_VK16:
35607 case X86::CMOV_VK32:
35608 case X86::CMOV_VK64:
35609 return true;
35610
35611 default:
35612 return false;
35613 }
35614}
35615
35616 // Helper function that inserts PHI nodes into SinkMBB:
35617 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
35618 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
35619 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
35620 // the last PHI node inserted.
35621static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
35622 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
35623 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
35624 MachineBasicBlock *SinkMBB) {
35625 MachineFunction *MF = TrueMBB->getParent();
35626 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35627 const DebugLoc &DL = MIItBegin->getDebugLoc();
35628
35629 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35630 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35631
35632 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35633
35634 // As we are creating the PHIs, we have to be careful if there is more than
35635 // one. Later CMOVs may reference the results of earlier CMOVs, but later
35636 // PHIs have to reference the individual true/false inputs from earlier PHIs.
35637 // That also means that PHI construction must work forward from earlier to
35638 // later, and that the code must maintain a mapping from earlier PHI's
35639 // destination registers, and the registers that went into the PHI.
35640 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
35641 MachineInstrBuilder MIB;
35642
35643 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
35644 Register DestReg = MIIt->getOperand(0).getReg();
35645 Register Op1Reg = MIIt->getOperand(1).getReg();
35646 Register Op2Reg = MIIt->getOperand(2).getReg();
35647
35648 // If this CMOV we are generating is the opposite condition from
35649 // the jump we generated, then we have to swap the operands for the
35650 // PHI that is going to be generated.
35651 if (MIIt->getOperand(3).getImm() == OppCC)
35652 std::swap(Op1Reg, Op2Reg);
35653
35654 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
35655 Op1Reg = RegRewriteTable[Op1Reg].first;
35656
35657 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
35658 Op2Reg = RegRewriteTable[Op2Reg].second;
35659
35660 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
35661 .addReg(Op1Reg)
35662 .addMBB(FalseMBB)
35663 .addReg(Op2Reg)
35664 .addMBB(TrueMBB);
35665
35666 // Add this PHI to the rewrite table.
35667 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
35668 }
35669
35670 return MIB;
35671}
35672
35673 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
35674MachineBasicBlock *
35675X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
35676 MachineInstr &SecondCascadedCMOV,
35677 MachineBasicBlock *ThisMBB) const {
35678 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35679 const DebugLoc &DL = FirstCMOV.getDebugLoc();
35680
35681 // We lower cascaded CMOVs such as
35682 //
35683 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
35684 //
35685 // to two successive branches.
35686 //
35687 // Without this, we would add a PHI between the two jumps, which ends up
35688 // creating a few copies all around. For instance, for
35689 //
35690 // (sitofp (zext (fcmp une)))
35691 //
35692 // we would generate:
35693 //
35694 // ucomiss %xmm1, %xmm0
35695 // movss <1.0f>, %xmm0
35696 // movaps %xmm0, %xmm1
35697 // jne .LBB5_2
35698 // xorps %xmm1, %xmm1
35699 // .LBB5_2:
35700 // jp .LBB5_4
35701 // movaps %xmm1, %xmm0
35702 // .LBB5_4:
35703 // retq
35704 //
35705 // because this custom-inserter would have generated:
35706 //
35707 // A
35708 // | \
35709 // | B
35710 // | /
35711 // C
35712 // | \
35713 // | D
35714 // | /
35715 // E
35716 //
35717 // A: X = ...; Y = ...
35718 // B: empty
35719 // C: Z = PHI [X, A], [Y, B]
35720 // D: empty
35721 // E: PHI [X, C], [Z, D]
35722 //
35723 // If we lower both CMOVs in a single step, we can instead generate:
35724 //
35725 // A
35726 // | \
35727 // | C
35728 // | /|
35729 // |/ |
35730 // | |
35731 // | D
35732 // | /
35733 // E
35734 //
35735 // A: X = ...; Y = ...
35736 // D: empty
35737 // E: PHI [X, A], [X, C], [Y, D]
35738 //
35739 // Which, in our sitofp/fcmp example, gives us something like:
35740 //
35741 // ucomiss %xmm1, %xmm0
35742 // movss <1.0f>, %xmm0
35743 // jne .LBB5_4
35744 // jp .LBB5_4
35745 // xorps %xmm0, %xmm0
35746 // .LBB5_4:
35747 // retq
35748 //
35749
35750 // We lower cascaded CMOV into two successive branches to the same block.
35751 // EFLAGS is used by both, so mark it as live in the second.
35752 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35753 MachineFunction *F = ThisMBB->getParent();
35754 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35755 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35756 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35757
35758 MachineFunction::iterator It = ++ThisMBB->getIterator();
35759 F->insert(It, FirstInsertedMBB);
35760 F->insert(It, SecondInsertedMBB);
35761 F->insert(It, SinkMBB);
35762
35763 // For a cascaded CMOV, we lower it to two successive branches to
35764 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35765 // the FirstInsertedMBB.
35766 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35767
35768 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35769 // live into the sink and copy blocks.
35770 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35771 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
35772 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35773 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35774 SinkMBB->addLiveIn(X86::EFLAGS);
35775 }
35776
35777 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35778 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35779 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35780 ThisMBB->end());
35781 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35782
35783 // Fallthrough block for ThisMBB.
35784 ThisMBB->addSuccessor(FirstInsertedMBB);
35785 // The true block target of the first branch is always SinkMBB.
35786 ThisMBB->addSuccessor(SinkMBB);
35787 // Fallthrough block for FirstInsertedMBB.
35788 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35789 // The true block for the branch of FirstInsertedMBB.
35790 FirstInsertedMBB->addSuccessor(SinkMBB);
35791 // This is fallthrough.
35792 SecondInsertedMBB->addSuccessor(SinkMBB);
35793
35794 // Create the conditional branch instructions.
35795 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35796 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35797
35798 X86::CondCode SecondCC =
35799 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35800 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
35801
35802 // SinkMBB:
35803 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35804 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35805 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35806 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35807 MachineInstrBuilder MIB =
35808 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
35809 .addReg(Op1Reg)
35810 .addMBB(SecondInsertedMBB)
35811 .addReg(Op2Reg)
35812 .addMBB(ThisMBB);
35813
35814 // The edge from FirstInsertedMBB provides the same incoming value as the
35815 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
35816 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35817
35818 // Now remove the CMOVs.
35819 FirstCMOV.eraseFromParent();
35820 SecondCascadedCMOV.eraseFromParent();
35821
35822 return SinkMBB;
35823}
35824
35825MachineBasicBlock *
35826X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35827 MachineBasicBlock *ThisMBB) const {
35828 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35829 const DebugLoc &DL = MI.getDebugLoc();
35830
35831 // To "insert" a SELECT_CC instruction, we actually have to insert the
35832 // diamond control-flow pattern. The incoming instruction knows the
35833 // destination vreg to set, the condition code register to branch on, the
35834 // true/false values to select between and a branch opcode to use.
35835
35836 // ThisMBB:
35837 // ...
35838 // TrueVal = ...
35839 // cmpTY ccX, r1, r2
35840 // bCC copy1MBB
35841 // fallthrough --> FalseMBB
35842
35843 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35844 // as described above, by inserting a BB, and then making a PHI at the join
35845 // point to select the true and false operands of the CMOV in the PHI.
35846 //
35847 // The code also handles two different cases of multiple CMOV opcodes
35848 // in a row.
35849 //
35850 // Case 1:
35851 // In this case, there are multiple CMOVs in a row, all of which are based
35852 // on the same condition setting (or the exact opposite condition setting).
35853 // We can then lower all of the CMOVs using a single inserted BB, and make a
35854 // number of PHIs at the join point to model the CMOVs. The only trickiness
35855 // here is that in a case like:
35856 //
35857 // t2 = CMOV cond1 t1, f1
35858 // t3 = CMOV cond1 t2, f2
35859 //
35860 // when rewriting this into PHIs, we have to perform some renaming on the
35861 // temps since you cannot have a PHI operand refer to a PHI result earlier
35862 // in the same block. The "simple" but wrong lowering would be:
35863 //
35864 // t2 = PHI t1(BB1), f1(BB2)
35865 // t3 = PHI t2(BB1), f2(BB2)
35866 //
35867 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
35868 // renaming is to note that on the path through BB1, t2 is really just a
35869 // copy of t1, and do that renaming, properly generating:
35870 //
35871 // t2 = PHI t1(BB1), f1(BB2)
35872 // t3 = PHI t1(BB1), f2(BB2)
35873 //
35874 // Case 2:
35875 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
35876 // function - EmitLoweredCascadedSelect.
35877
35878 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
35879 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35880 MachineInstr *LastCMOV = &MI;
35881 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
35882
35883 // Check for case 1, where there are multiple CMOVs with the same condition
35884 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
35885 // number of jumps the most.
35886
35887 if (isCMOVPseudo(MI)) {
35888 // See if we have a string of CMOVS with the same condition. Skip over
35889 // intervening debug insts.
35890 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
35891 (NextMIIt->getOperand(3).getImm() == CC ||
35892 NextMIIt->getOperand(3).getImm() == OppCC)) {
35893 LastCMOV = &*NextMIIt;
35894 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
35895 }
35896 }
35897
35898 // Check for case 2, but only if we didn't already find case 1, as
35899 // indicated by LastCMOV still pointing at MI.
35900 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
35901 NextMIIt->getOpcode() == MI.getOpcode() &&
35902 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
35903 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
35904 NextMIIt->getOperand(1).isKill()) {
35905 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
35906 }
35907
35908 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35909 MachineFunction *F = ThisMBB->getParent();
35910 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
35911 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35912
35913 MachineFunction::iterator It = ++ThisMBB->getIterator();
35914 F->insert(It, FalseMBB);
35915 F->insert(It, SinkMBB);
35916
35917 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35918 // live into the sink and copy blocks.
35919 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35920 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
35921 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
35922 FalseMBB->addLiveIn(X86::EFLAGS);
35923 SinkMBB->addLiveIn(X86::EFLAGS);
35924 }
35925
35926 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
35927 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35928 MachineBasicBlock::iterator(LastCMOV));
35929 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35930 if (MI.isDebugInstr())
35931 SinkMBB->push_back(MI.removeFromParent());
35932
35933 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35934 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35935 std::next(MachineBasicBlock::iterator(LastCMOV)),
35936 ThisMBB->end());
35937 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35938
35939 // Fallthrough block for ThisMBB.
35940 ThisMBB->addSuccessor(FalseMBB);
35941 // The true block target of the first (or only) branch is always a SinkMBB.
35942 ThisMBB->addSuccessor(SinkMBB);
35943 // Fallthrough block for FalseMBB.
35944 FalseMBB->addSuccessor(SinkMBB);
35945
35946 // Create the conditional branch instruction.
35947 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35948
35949 // SinkMBB:
35950 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35951 // ...
35952 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35953 MachineBasicBlock::iterator MIItEnd =
35954 std::next(MachineBasicBlock::iterator(LastCMOV));
35955 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35956
35957 // Now remove the CMOV(s).
35958 ThisMBB->erase(MIItBegin, MIItEnd);
35959
35960 return SinkMBB;
35961}
35962
35963static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
35964 if (IsLP64) {
35965 if (isInt<8>(Imm))
35966 return X86::SUB64ri8;
35967 return X86::SUB64ri32;
35968 } else {
35969 if (isInt<8>(Imm))
35970 return X86::SUB32ri8;
35971 return X86::SUB32ri;
35972 }
35973}
35974
35975MachineBasicBlock *
35976X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35977 MachineBasicBlock *MBB) const {
35978 MachineFunction *MF = MBB->getParent();
35979 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35980 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35981 const DebugLoc &DL = MI.getDebugLoc();
35982 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35983
35984 const unsigned ProbeSize = getStackProbeSize(*MF);
35985
35986 MachineRegisterInfo &MRI = MF->getRegInfo();
35987 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35988 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35989 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35990
35991 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35992 MF->insert(MBBIter, testMBB);
35993 MF->insert(MBBIter, blockMBB);
35994 MF->insert(MBBIter, tailMBB);
35995
35996 Register sizeVReg = MI.getOperand(1).getReg();
35997
35998 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35999
36000 Register TmpStackPtr = MRI.createVirtualRegister(
36001 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36002 Register FinalStackPtr = MRI.createVirtualRegister(
36003 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36004
36005 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36006 .addReg(physSPReg);
36007 {
36008 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36009 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36010 .addReg(TmpStackPtr)
36011 .addReg(sizeVReg);
36012 }
36013
36014 // test rsp size
36015
36016 BuildMI(testMBB, DL,
36017 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36018 .addReg(FinalStackPtr)
36019 .addReg(physSPReg);
36020
36021 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36022 .addMBB(tailMBB)
36023 .addImm(X86::COND_GE);
36024 testMBB->addSuccessor(blockMBB);
36025 testMBB->addSuccessor(tailMBB);
36026
36027 // Touch the block, then extend it. This is the opposite order from the
36028 // static probe, which allocates and then touches; doing it this way avoids
36029 // having to probe the tail of the static alloca. Possible scenarios are:
36030 //
36031 // + ---- <- ------------ <- ------------- <- ------------ +
36032 // | |
36033 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36034 // | |
36035 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36036 //
36037 // The property we want to enforce is to never have more than [page alloc] between two probes.
36038
36039 const unsigned XORMIOpc =
36040 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36041 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36042 .addImm(0);
36043
36044 BuildMI(blockMBB, DL,
36045 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36046 .addReg(physSPReg)
36047 .addImm(ProbeSize);
36048
36049
36050 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36051 blockMBB->addSuccessor(testMBB);
36052
36053 // Replace original instruction by the expected stack ptr
36054 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36055 .addReg(FinalStackPtr);
36056
36057 tailMBB->splice(tailMBB->end(), MBB,
36058 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36059 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36060 MBB->addSuccessor(testMBB);
36061
36062 // Delete the original pseudo instruction.
36063 MI.eraseFromParent();
36064
36065 // And we're done.
36066 return tailMBB;
36067}
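
// The three blocks built above form a "touch, then extend" loop. An equivalent
// sketch over plain integers (assuming <cstdint>; probedAllocaSketch and Touch
// are illustrative stand-ins, with Touch modeling the xor-to-[SP] page probe):
static uintptr_t probedAllocaSketch(uintptr_t SP, uint64_t Size,
                                    uint64_t ProbeSize,
                                    void (*Touch)(uintptr_t)) {
  uintptr_t Final = SP - Size;               // FinalStackPtr
  while ((intptr_t)SP > (intptr_t)Final) {   // testMBB: CMP + JGE to tailMBB.
    Touch(SP);                               // blockMBB: xor dword ptr [SP], 0.
    SP -= ProbeSize;                         // Never leave a whole page unprobed.
  }
  return Final;                              // tailMBB: the result is FinalStackPtr.
}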
36068
36069MachineBasicBlock *
36070X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36071 MachineBasicBlock *BB) const {
36072 MachineFunction *MF = BB->getParent();
36073 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36074 const DebugLoc &DL = MI.getDebugLoc();
36075 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36076
36077 assert(MF->shouldSplitStack());
36078
36079 const bool Is64Bit = Subtarget.is64Bit();
36080 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36081
36082 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36083 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36084
36085 // BB:
36086 // ... [Till the alloca]
36087 // If stacklet is not large enough, jump to mallocMBB
36088 //
36089 // bumpMBB:
36090 // Allocate by subtracting from RSP
36091 // Jump to continueMBB
36092 //
36093 // mallocMBB:
36094 // Allocate by call to runtime
36095 //
36096 // continueMBB:
36097 // ...
36098 // [rest of original BB]
36099 //
36100
36101 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36102 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36103 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36104
36105 MachineRegisterInfo &MRI = MF->getRegInfo();
36106 const TargetRegisterClass *AddrRegClass =
36107 getRegClassFor(getPointerTy(MF->getDataLayout()));
36108
36109 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36110 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36111 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36112 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36113 sizeVReg = MI.getOperand(1).getReg(),
36114 physSPReg =
36115 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36116
36117 MachineFunction::iterator MBBIter = ++BB->getIterator();
36118
36119 MF->insert(MBBIter, bumpMBB);
36120 MF->insert(MBBIter, mallocMBB);
36121 MF->insert(MBBIter, continueMBB);
36122
36123 continueMBB->splice(continueMBB->begin(), BB,
36124 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36125 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36126
36127 // Add code to the main basic block to check if the stack limit has been hit,
36128 // and if so, jump to mallocMBB otherwise to bumpMBB.
36129 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36130 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36131 .addReg(tmpSPVReg).addReg(sizeVReg);
36132 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36133 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36134 .addReg(SPLimitVReg);
36135 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36136
36137 // bumpMBB simply decreases the stack pointer, since we know the current
36138 // stacklet has enough space.
36139 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36140 .addReg(SPLimitVReg);
36141 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36142 .addReg(SPLimitVReg);
36143 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36144
36145 // Calls into a routine in libgcc to allocate more space from the heap.
36146 const uint32_t *RegMask =
36147 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36148 if (IsLP64) {
36149 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36150 .addReg(sizeVReg);
36151 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36152 .addExternalSymbol("__morestack_allocate_stack_space")
36153 .addRegMask(RegMask)
36154 .addReg(X86::RDI, RegState::Implicit)
36155 .addReg(X86::RAX, RegState::ImplicitDefine);
36156 } else if (Is64Bit) {
36157 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36158 .addReg(sizeVReg);
36159 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36160 .addExternalSymbol("__morestack_allocate_stack_space")
36161 .addRegMask(RegMask)
36162 .addReg(X86::EDI, RegState::Implicit)
36163 .addReg(X86::EAX, RegState::ImplicitDefine);
36164 } else {
36165 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36166 .addImm(12);
36167 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36168 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36169 .addExternalSymbol("__morestack_allocate_stack_space")
36170 .addRegMask(RegMask)
36171 .addReg(X86::EAX, RegState::ImplicitDefine);
36172 }
36173
36174 if (!Is64Bit)
36175 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36176 .addImm(16);
36177
36178 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36179 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36180 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36181
36182 // Set up the CFG correctly.
36183 BB->addSuccessor(bumpMBB);
36184 BB->addSuccessor(mallocMBB);
36185 mallocMBB->addSuccessor(continueMBB);
36186 bumpMBB->addSuccessor(continueMBB);
36187
36188 // Take care of the PHI nodes.
36189 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36190 MI.getOperand(0).getReg())
36191 .addReg(mallocPtrVReg)
36192 .addMBB(mallocMBB)
36193 .addReg(bumpSPPtrVReg)
36194 .addMBB(bumpMBB);
36195
36196 // Delete the original pseudo instruction.
36197 MI.eraseFromParent();
36198
36199 // And we're done.
36200 return continueMBB;
36201}
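As an aid to reading the three blocks built above, here is the same stacklet decision written as a standalone C++ sketch. It is an illustration of the lowering's intent, not code from this file: the declaration of __morestack_allocate_stack_space (the libgcc routine named in the call above) is an assumption for the sketch, and "TlsStackLimit" stands for the per-thread limit that the real code reads with CMP64mr/CMP32mr at %fs:0x70 (LP64), %fs:0x40 (x32) or %gs:0x30 (ia32), per the TlsOffset selection at line 36083.

extern "C" void *__morestack_allocate_stack_space(unsigned long Size); // assumed signature

static char *segAllocaSketch(char *SP, long Size, char *TlsStackLimit) {
  char *Candidate = SP - Size;                  // SUB64rr/SUB32rr
  if (TlsStackLimit > Candidate)                // JCC_1 with COND_G -> mallocMBB
    return static_cast<char *>(__morestack_allocate_stack_space(Size));
  return Candidate;                             // bumpMBB: SP = Candidate
}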
36202
36203MachineBasicBlock *
36204X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36205 MachineBasicBlock *BB) const {
36206 MachineFunction *MF = BB->getParent();
36207 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36208 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36209 const DebugLoc &DL = MI.getDebugLoc();
36210
36211 assert(!isAsynchronousEHPersonality(
36212 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36213 "SEH does not use catchret!");
36214
36215 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36216 if (!Subtarget.is32Bit())
36217 return BB;
36218
36219 // C++ EH creates a new target block to hold the restore code, and wires up
36220 // the new block to the return destination with a normal JMP_4.
36221 MachineBasicBlock *RestoreMBB =
36222 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36223 assert(BB->succ_size() == 1);
36224 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36225 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36226 BB->addSuccessor(RestoreMBB);
36227 MI.getOperand(0).setMBB(RestoreMBB);
36228
36229 // Marking this as an EH pad but not a funclet entry block causes PEI to
36230 // restore stack pointers in the block.
36231 RestoreMBB->setIsEHPad(true);
36232
36233 auto RestoreMBBI = RestoreMBB->begin();
36234 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36235 return BB;
36236}
36237
36238MachineBasicBlock *
36239X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36240 MachineBasicBlock *BB) const {
36241 // So, here we replace TLSADDR with the sequence:
36242 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36243 // We need this because TLSADDR is lowered into calls
36244 // inside MC, therefore without the two markers shrink-wrapping
36245 // may push the prologue/epilogue past them.
36246 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36247 const DebugLoc &DL = MI.getDebugLoc();
36248 MachineFunction &MF = *BB->getParent();
36249
36250 // Emit CALLSEQ_START right before the instruction.
36251 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36252 MachineInstrBuilder CallseqStart =
36253 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36254 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36255
36256 // Emit CALLSEQ_END right after the instruction.
36257 // We don't call erase from parent because we want to keep the
36258 // original instruction around.
36259 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36260 MachineInstrBuilder CallseqEnd =
36261 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36262 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36263
36264 return BB;
36265}
36266
36267MachineBasicBlock *
36268X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36269 MachineBasicBlock *BB) const {
36270 // This is pretty easy. We're taking the value that we received from
36271 // our load from the relocation, sticking it in either RDI (x86-64)
36272 // or EAX and doing an indirect call. The return value will then
36273 // be in the normal return register.
36274 MachineFunction *F = BB->getParent();
36275 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36276 const DebugLoc &DL = MI.getDebugLoc();
36277
36278 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36279 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36280
36281 // Get a register mask for the lowered call.
36282 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36283 // proper register mask.
36284 const uint32_t *RegMask =
36285 Subtarget.is64Bit() ?
36286 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36287 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36288 if (Subtarget.is64Bit()) {
36289 MachineInstrBuilder MIB =
36290 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36291 .addReg(X86::RIP)
36292 .addImm(0)
36293 .addReg(0)
36294 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36295 MI.getOperand(3).getTargetFlags())
36296 .addReg(0);
36297 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36298 addDirectMem(MIB, X86::RDI);
36299 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36300 } else if (!isPositionIndependent()) {
36301 MachineInstrBuilder MIB =
36302 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36303 .addReg(0)
36304 .addImm(0)
36305 .addReg(0)
36306 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36307 MI.getOperand(3).getTargetFlags())
36308 .addReg(0);
36309 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36310 addDirectMem(MIB, X86::EAX);
36311 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36312 } else {
36313 MachineInstrBuilder MIB =
36314 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36315 .addReg(TII->getGlobalBaseReg(F))
36316 .addImm(0)
36317 .addReg(0)
36318 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36319 MI.getOperand(3).getTargetFlags())
36320 .addReg(0);
36321 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36322 addDirectMem(MIB, X86::EAX);
36323 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36324 }
36325
36326 MI.eraseFromParent(); // The pseudo instruction is gone now.
36327 return BB;
36328}
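For context, the 64-bit branch above loads the address of a Darwin thread-local-variable descriptor into RDI and then calls through the descriptor's first slot (the addDirectMem call). A minimal model of that convention follows; the struct and field names are invented for illustration and this is not a definition from this file or from system headers.

struct TLVDescriptorSketch {
  void *(*Thunk)(TLVDescriptorSketch *Self); // CALL64m/CALL32m calls through this slot
  unsigned long Key;                         // opaque to the code generator
  unsigned long Offset;                      // offset within the thread's TLV block
};

// What the emitted MOV64rm + CALL64m pair amounts to: the variable's address
// comes back in the normal return register (RAX/EAX).
static void *tlvAccessSketch(TLVDescriptorSketch *Descriptor) {
  return Descriptor->Thunk(Descriptor);
}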
36329
36330static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36331 switch (RPOpc) {
36332 case X86::INDIRECT_THUNK_CALL32:
36333 return X86::CALLpcrel32;
36334 case X86::INDIRECT_THUNK_CALL64:
36335 return X86::CALL64pcrel32;
36336 case X86::INDIRECT_THUNK_TCRETURN32:
36337 return X86::TCRETURNdi;
36338 case X86::INDIRECT_THUNK_TCRETURN64:
36339 return X86::TCRETURNdi64;
36340 }
36341 llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36341)
;
36342}
36343
36344static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36345 unsigned Reg) {
36346 if (Subtarget.useRetpolineExternalThunk()) {
36347 // When using an external thunk for retpolines, we pick names that match the
36348 // names GCC happens to use as well. This helps simplify the implementation
36349 // of the thunks for kernels where they have no easy ability to create
36350 // aliases and are doing non-trivial configuration of the thunk's body. For
36351 // example, the Linux kernel will do boot-time hot patching of the thunk
36352 // bodies and cannot easily export aliases of these to loaded modules.
36353 //
36354 // Note that at any point in the future, we may need to change the semantics
36355 // of how we implement retpolines and at that time will likely change the
36356 // name of the called thunk. Essentially, there is no hard guarantee that
36357 // LLVM will generate calls to specific thunks, we merely make a best-effort
36358 // attempt to help out kernels and other systems where duplicating the
36359 // thunks is costly.
36360 switch (Reg) {
36361 case X86::EAX:
36362 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36363 return "__x86_indirect_thunk_eax";
36364 case X86::ECX:
36365 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36366 return "__x86_indirect_thunk_ecx";
36367 case X86::EDX:
36368 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36369 return "__x86_indirect_thunk_edx";
36370 case X86::EDI:
36371 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36372 return "__x86_indirect_thunk_edi";
36373 case X86::R11:
36374 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36375 return "__x86_indirect_thunk_r11";
36376 }
36377 llvm_unreachable("unexpected reg for external indirect thunk")::llvm::llvm_unreachable_internal("unexpected reg for external indirect thunk"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36377)
;
36378 }
36379
36380 if (Subtarget.useRetpolineIndirectCalls() ||
36381 Subtarget.useRetpolineIndirectBranches()) {
36382 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36383 switch (Reg) {
36384 case X86::EAX:
36385 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36386 return "__llvm_retpoline_eax";
36387 case X86::ECX:
36388 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36389 return "__llvm_retpoline_ecx";
36390 case X86::EDX:
36391 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36392 return "__llvm_retpoline_edx";
36393 case X86::EDI:
36394 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36395 return "__llvm_retpoline_edi";
36396 case X86::R11:
36397 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36398 return "__llvm_retpoline_r11";
36399 }
36400 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36400)
;
36401 }
36402
36403 if (Subtarget.useLVIControlFlowIntegrity()) {
36404 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36405 return "__llvm_lvi_thunk_r11";
36406 }
36407 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36407)
;
36408}
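The switches above only ever combine a fixed prefix with the scratch register's name. A trivial standalone sketch of that naming convention follows; the helper name is invented here and this is not the LLVM helper itself.

#include <string>

static std::string thunkNameSketch(const std::string &RegName, bool External) {
  // External (GCC-compatible) thunks: __x86_indirect_thunk_r11, ..._eax, ...
  // Internal COMDAT retpolines:       __llvm_retpoline_r11, ..._eax, ...
  return (External ? "__x86_indirect_thunk_" : "__llvm_retpoline_") + RegName;
}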
36409
36410MachineBasicBlock *
36411X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36412 MachineBasicBlock *BB) const {
36413 // Copy the virtual register into the R11 physical register and
36414 // call the retpoline thunk.
36415 const DebugLoc &DL = MI.getDebugLoc();
36416 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36417 Register CalleeVReg = MI.getOperand(0).getReg();
36418 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36419
36420 // Find an available scratch register to hold the callee. On 64-bit, we can
36421 // just use R11, but we scan for uses anyway to ensure we don't generate
36422 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36423 // already a register use operand to the call to hold the callee. If none
36424 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36425 // register and ESI is the base pointer to realigned stack frames with VLAs.
36426 SmallVector<unsigned, 3> AvailableRegs;
36427 if (Subtarget.is64Bit())
36428 AvailableRegs.push_back(X86::R11);
36429 else
36430 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36431
36432 // Zero out any registers that are already used.
36433 for (const auto &MO : MI.operands()) {
36434 if (MO.isReg() && MO.isUse())
36435 for (unsigned &Reg : AvailableRegs)
36436 if (Reg == MO.getReg())
36437 Reg = 0;
36438 }
36439
36440 // Choose the first remaining non-zero available register.
36441 unsigned AvailableReg = 0;
36442 for (unsigned MaybeReg : AvailableRegs) {
36443 if (MaybeReg) {
36444 AvailableReg = MaybeReg;
36445 break;
36446 }
36447 }
36448 if (!AvailableReg)
36449 report_fatal_error("calling convention incompatible with retpoline, no "
36450 "available registers");
36451
36452 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36453
36454 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
36455 .addReg(CalleeVReg);
36456 MI.getOperand(0).ChangeToES(Symbol);
36457 MI.setDesc(TII->get(Opc));
36458 MachineInstrBuilder(*BB->getParent(), &MI)
36459 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36460 return BB;
36461}
36462
36463/// SetJmp implies future control flow change upon calling the corresponding
36464/// LongJmp.
36465/// Instead of using the 'return' instruction, the long jump fixes the stack and
36466/// performs an indirect branch. To do so it uses the registers that were stored
36467/// in the jump buffer (when calling SetJmp).
36468/// In case the shadow stack is enabled we need to fix it as well, because some
36469/// return addresses will be skipped.
36470/// The function will save the SSP for future fixing in the function
36471/// emitLongJmpShadowStackFix.
36472/// \sa emitLongJmpShadowStackFix
36473/// \param [in] MI The temporary Machine Instruction for the builtin.
36474/// \param [in] MBB The Machine Basic Block that will be modified.
36475void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36476 MachineBasicBlock *MBB) const {
36477 const DebugLoc &DL = MI.getDebugLoc();
36478 MachineFunction *MF = MBB->getParent();
36479 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36480 MachineRegisterInfo &MRI = MF->getRegInfo();
36481 MachineInstrBuilder MIB;
36482
36483 // Memory Reference.
36484 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36485 MI.memoperands_end());
36486
36487 // Initialize a register with zero.
36488 MVT PVT = getPointerTy(MF->getDataLayout());
36489 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36490 Register ZReg = MRI.createVirtualRegister(PtrRC);
36491 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36492 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
36493 .addDef(ZReg)
36494 .addReg(ZReg, RegState::Undef)
36495 .addReg(ZReg, RegState::Undef);
36496
36497 // Read the current SSP Register value to the zeroed register.
36498 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36499 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36500 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36501
36502 // Write the SSP register value to offset 3 in input memory buffer.
36503 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36504 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
36505 const int64_t SSPOffset = 3 * PVT.getStoreSize();
36506 const unsigned MemOpndSlot = 1;
36507 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36508 if (i == X86::AddrDisp)
36509 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36510 else
36511 MIB.add(MI.getOperand(MemOpndSlot + i));
36512 }
36513 MIB.addReg(SSPCopyReg);
36514 MIB.setMemRefs(MMOs);
36515}
36516
36517MachineBasicBlock *
36518X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
36519 MachineBasicBlock *MBB) const {
36520 const DebugLoc &DL = MI.getDebugLoc();
36521 MachineFunction *MF = MBB->getParent();
36522 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36523 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36524 MachineRegisterInfo &MRI = MF->getRegInfo();
36525
36526 const BasicBlock *BB = MBB->getBasicBlock();
36527 MachineFunction::iterator I = ++MBB->getIterator();
36528
36529 // Memory Reference
36530 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36531 MI.memoperands_end());
36532
36533 unsigned DstReg;
36534 unsigned MemOpndSlot = 0;
36535
36536 unsigned CurOp = 0;
36537
36538 DstReg = MI.getOperand(CurOp++).getReg();
36539 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36540 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36541 (void)TRI;
36542 Register mainDstReg = MRI.createVirtualRegister(RC);
36543 Register restoreDstReg = MRI.createVirtualRegister(RC);
36544
36545 MemOpndSlot = CurOp;
36546
36547 MVT PVT = getPointerTy(MF->getDataLayout());
36548 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36549 "Invalid Pointer Size!");
36550
36551 // For v = setjmp(buf), we generate
36552 //
36553 // thisMBB:
36554 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
36555 // SjLjSetup restoreMBB
36556 //
36557 // mainMBB:
36558 // v_main = 0
36559 //
36560 // sinkMBB:
36561 // v = phi(main, restore)
36562 //
36563 // restoreMBB:
36564 // if base pointer being used, load it from frame
36565 // v_restore = 1
36566
36567 MachineBasicBlock *thisMBB = MBB;
36568 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36569 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36570 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36571 MF->insert(I, mainMBB);
36572 MF->insert(I, sinkMBB);
36573 MF->push_back(restoreMBB);
36574 restoreMBB->setMachineBlockAddressTaken();
36575
36576 MachineInstrBuilder MIB;
36577
36578 // Transfer the remainder of BB and its successor edges to sinkMBB.
36579 sinkMBB->splice(sinkMBB->begin(), MBB,
36580 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36581 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36582
36583 // thisMBB:
36584 unsigned PtrStoreOpc = 0;
36585 unsigned LabelReg = 0;
36586 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36587 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36588 !isPositionIndependent();
36589
36590 // Prepare IP either in reg or imm.
36591 if (!UseImmLabel) {
36592 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36593 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36594 LabelReg = MRI.createVirtualRegister(PtrRC);
36595 if (Subtarget.is64Bit()) {
36596 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
36597 .addReg(X86::RIP)
36598 .addImm(0)
36599 .addReg(0)
36600 .addMBB(restoreMBB)
36601 .addReg(0);
36602 } else {
36603 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
36604 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
36605 .addReg(XII->getGlobalBaseReg(MF))
36606 .addImm(0)
36607 .addReg(0)
36608 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
36609 .addReg(0);
36610 }
36611 } else
36612 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36613 // Store IP
36614 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
36615 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36616 if (i == X86::AddrDisp)
36617 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
36618 else
36619 MIB.add(MI.getOperand(MemOpndSlot + i));
36620 }
36621 if (!UseImmLabel)
36622 MIB.addReg(LabelReg);
36623 else
36624 MIB.addMBB(restoreMBB);
36625 MIB.setMemRefs(MMOs);
36626
36627 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36628 emitSetJmpShadowStackFix(MI, thisMBB);
36629 }
36630
36631 // Setup
36632 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
36633 .addMBB(restoreMBB);
36634
36635 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36636 MIB.addRegMask(RegInfo->getNoPreservedMask());
36637 thisMBB->addSuccessor(mainMBB);
36638 thisMBB->addSuccessor(restoreMBB);
36639
36640 // mainMBB:
36641 // EAX = 0
36642 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
36643 mainMBB->addSuccessor(sinkMBB);
36644
36645 // sinkMBB:
36646 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
36647 TII->get(X86::PHI), DstReg)
36648 .addReg(mainDstReg).addMBB(mainMBB)
36649 .addReg(restoreDstReg).addMBB(restoreMBB);
36650
36651 // restoreMBB:
36652 if (RegInfo->hasBasePointer(*MF)) {
36653 const bool Uses64BitFramePtr =
36654 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36655 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
36656 X86FI->setRestoreBasePointer(MF);
36657 Register FramePtr = RegInfo->getFrameRegister(*MF);
36658 Register BasePtr = RegInfo->getBaseRegister();
36659 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
36660 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
36661 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36662 .setMIFlag(MachineInstr::FrameSetup);
36663 }
36664 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36665 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36666 restoreMBB->addSuccessor(sinkMBB);
36667
36668 MI.eraseFromParent();
36669 return sinkMBB;
36670}
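The mainMBB/restoreMBB split above is what gives the builtin its familiar return values: 0 on the direct path, 1 when control re-enters through the restore block. A minimal usage sketch, assuming the GCC-style __builtin_setjmp/__builtin_longjmp pair that these EH_SjLj pseudos back (the buffer and function names are made up for the example):

static void *SketchBuf[5]; // the GCC builtin requires a five-word buffer

static int setJmpSketch() {
  if (__builtin_setjmp(SketchBuf) == 0) {
    // mainMBB path: v_main = 0, the direct fall-through from setjmp.
    return 0;
  }
  // restoreMBB path: v_restore = 1, reached via __builtin_longjmp(SketchBuf, 1).
  return 1;
}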
36671
36672/// Fix the shadow stack using the previously saved SSP pointer.
36673/// \sa emitSetJmpShadowStackFix
36674/// \param [in] MI The temporary Machine Instruction for the builtin.
36675/// \param [in] MBB The Machine Basic Block that will be modified.
36676/// \return The sink MBB that will perform the future indirect branch.
36677MachineBasicBlock *
36678X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
36679 MachineBasicBlock *MBB) const {
36680 const DebugLoc &DL = MI.getDebugLoc();
36681 MachineFunction *MF = MBB->getParent();
36682 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36683 MachineRegisterInfo &MRI = MF->getRegInfo();
36684
36685 // Memory Reference
36686 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36687 MI.memoperands_end());
36688
36689 MVT PVT = getPointerTy(MF->getDataLayout());
36690 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36691
36692 // checkSspMBB:
36693 // xor vreg1, vreg1
36694 // rdssp vreg1
36695 // test vreg1, vreg1
36696 // je sinkMBB # Jump if Shadow Stack is not supported
36697 // fallMBB:
36698 // mov buf+24/12(%rip), vreg2
36699 // sub vreg1, vreg2
36700 // jbe sinkMBB # No need to fix the Shadow Stack
36701 // fixShadowMBB:
36702 // shr 3/2, vreg2
36703 // incssp vreg2 # fix the SSP according to the lower 8 bits
36704 // shr 8, vreg2
36705 // je sinkMBB
36706 // fixShadowLoopPrepareMBB:
36707 // shl vreg2
36708 // mov 128, vreg3
36709 // fixShadowLoopMBB:
36710 // incssp vreg3
36711 // dec vreg2
36712 // jne fixShadowLoopMBB # Iterate until you finish fixing
36713 // # the Shadow Stack
36714 // sinkMBB:
36715
36716 MachineFunction::iterator I = ++MBB->getIterator();
36717 const BasicBlock *BB = MBB->getBasicBlock();
36718
36719 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36720 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36721 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36722 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36723 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36724 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36725 MF->insert(I, checkSspMBB);
36726 MF->insert(I, fallMBB);
36727 MF->insert(I, fixShadowMBB);
36728 MF->insert(I, fixShadowLoopPrepareMBB);
36729 MF->insert(I, fixShadowLoopMBB);
36730 MF->insert(I, sinkMBB);
36731
36732 // Transfer the remainder of BB and its successor edges to sinkMBB.
36733 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36734 MBB->end());
36735 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36736
36737 MBB->addSuccessor(checkSspMBB);
36738
36739 // Initialize a register with zero.
36740 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36741 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
36742
36743 if (PVT == MVT::i64) {
36744 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36745 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36746 .addImm(0)
36747 .addReg(ZReg)
36748 .addImm(X86::sub_32bit);
36749 ZReg = TmpZReg;
36750 }
36751
36752 // Read the current SSP Register value to the zeroed register.
36753 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36754 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36755 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36756
36757 // Check whether the result of the SSP register is zero and jump directly
36758 // to the sink.
36759 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36760 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
36761 .addReg(SSPCopyReg)
36762 .addReg(SSPCopyReg);
36763 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
36764 checkSspMBB->addSuccessor(sinkMBB);
36765 checkSspMBB->addSuccessor(fallMBB);
36766
36767 // Reload the previously saved SSP register value.
36768 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36769 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36770 const int64_t SPPOffset = 3 * PVT.getStoreSize();
36771 MachineInstrBuilder MIB =
36772 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
36773 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36774 const MachineOperand &MO = MI.getOperand(i);
36775 if (i == X86::AddrDisp)
36776 MIB.addDisp(MO, SPPOffset);
36777 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36778 // preserve kill flags.
36779 MIB.addReg(MO.getReg());
36780 else
36781 MIB.add(MO);
36782 }
36783 MIB.setMemRefs(MMOs);
36784
36785 // Subtract the current SSP from the previous SSP.
36786 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36787 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36788 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
36789 .addReg(PrevSSPReg)
36790 .addReg(SSPCopyReg);
36791
36792 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
36793 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
36794 fallMBB->addSuccessor(sinkMBB);
36795 fallMBB->addSuccessor(fixShadowMBB);
36796
36797 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
36798 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36799 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36800 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36801 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
36802 .addReg(SspSubReg)
36803 .addImm(Offset);
36804
36805 // Increase the SSP using only the lower 8 bits of the delta.
36806 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36807 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36808
36809 // Reset the lower 8 bits.
36810 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36811 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
36812 .addReg(SspFirstShrReg)
36813 .addImm(8);
36814
36815 // Jump if the result of the shift is zero.
36816 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
36817 fixShadowMBB->addSuccessor(sinkMBB);
36818 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36819
36820 // Do a single shift left.
36821 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
36822 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36823 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
36824 .addReg(SspSecondShrReg);
36825
36826 // Save the value 128 to a register (will be used next with incssp).
36827 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36828 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36829 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
36830 .addImm(128);
36831 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36832
36833 // Since incssp only looks at the lower 8 bits, we might need to do several
36834 // iterations of incssp until we finish fixing the shadow stack.
36835 Register DecReg = MRI.createVirtualRegister(PtrRC);
36836 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36837 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
36838 .addReg(SspAfterShlReg)
36839 .addMBB(fixShadowLoopPrepareMBB)
36840 .addReg(DecReg)
36841 .addMBB(fixShadowLoopMBB);
36842
36843 // Every iteration we increase the SSP by 128.
36844 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
36845
36846 // Every iteration we decrement the counter by 1.
36847 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36848 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
36849
36850 // Jump if the counter is not zero yet.
36851 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
36852 fixShadowLoopMBB->addSuccessor(sinkMBB);
36853 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36854
36855 return sinkMBB;
36856}
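To make the fixShadow* arithmetic above easier to follow, here is the same computation as plain C++ for the 64-bit case. This is an illustrative sketch only: incsspStub is a stand-in for INCSSPQ, which honours only the low 8 bits of its operand, and the shifts mirror the SHR/SHL and 128-per-iteration loop emitted above.

static void incsspStub(unsigned long long) {} // model of INCSSPQ: pops (N & 0xff) entries

static void fixShadowStackSketch(unsigned long long PrevSSP,
                                 unsigned long long CurSSP) {
  unsigned long long Delta = (PrevSSP - CurSSP) >> 3; // 8-byte entries to skip
  incsspStub(Delta);                     // consumes Delta & 0xff entries
  unsigned long long Rest = Delta >> 8;  // what the single incssp could not cover
  if (Rest) {
    unsigned long long Counter = Rest << 1; // each unit of 256 entries = 2 x incssp(128)
    do
      incsspStub(128);
    while (--Counter);
  }
}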
36857
36858MachineBasicBlock *
36859X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36860 MachineBasicBlock *MBB) const {
36861 const DebugLoc &DL = MI.getDebugLoc();
36862 MachineFunction *MF = MBB->getParent();
36863 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36864 MachineRegisterInfo &MRI = MF->getRegInfo();
36865
36866 // Memory Reference
36867 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36868 MI.memoperands_end());
36869
36870 MVT PVT = getPointerTy(MF->getDataLayout());
36871 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36872 "Invalid Pointer Size!");
36873
36874 const TargetRegisterClass *RC =
36875 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36876 Register Tmp = MRI.createVirtualRegister(RC);
36877 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36878 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36879 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36880 Register SP = RegInfo->getStackRegister();
36881
36882 MachineInstrBuilder MIB;
36883
36884 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36885 const int64_t SPOffset = 2 * PVT.getStoreSize();
36886
36887 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36888 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36889
36890 MachineBasicBlock *thisMBB = MBB;
36891
36892 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
36893 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36894 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36895 }
36896
36897 // Reload FP
36898 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
36899 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36900 const MachineOperand &MO = MI.getOperand(i);
36901 if (MO.isReg()) // Don't add the whole operand, we don't want to
36902 // preserve kill flags.
36903 MIB.addReg(MO.getReg());
36904 else
36905 MIB.add(MO);
36906 }
36907 MIB.setMemRefs(MMOs);
36908
36909 // Reload IP
36910 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
36911 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36912 const MachineOperand &MO = MI.getOperand(i);
36913 if (i == X86::AddrDisp)
36914 MIB.addDisp(MO, LabelOffset);
36915 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36916 // preserve kill flags.
36917 MIB.addReg(MO.getReg());
36918 else
36919 MIB.add(MO);
36920 }
36921 MIB.setMemRefs(MMOs);
36922
36923 // Reload SP
36924 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
36925 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36926 if (i == X86::AddrDisp)
36927 MIB.addDisp(MI.getOperand(i), SPOffset);
36928 else
36929 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36930 // the last instruction of the expansion.
36931 }
36932 MIB.setMemRefs(MMOs);
36933
36934 // Jump
36935 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
36936
36937 MI.eraseFromParent();
36938 return thisMBB;
36939}
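Taken together, the reloads above (FP at displacement 0, IP at LabelOffset = 1 * pointer size, SP at SPOffset = 2 * pointer size) and the SSP slot at 3 * pointer size used by the shadow-stack helpers describe the buffer layout this lowering assumes. As an illustration only (the struct and field names are invented here, 64-bit case):

struct SjLjBufferSketch {
  void *FramePtr;  // slot 0: reloaded into RBP/EBP by emitEHSjLjLongJmp
  void *Label;     // slot 1: restore address stored by emitEHSjLjSetJmp
  void *StackPtr;  // slot 2: reloaded into RSP/ESP before the indirect jump
  void *ShadowSP;  // slot 3: saved SSP when "cf-protection-return" is set
};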
36940
36941void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36942 MachineBasicBlock *MBB,
36943 MachineBasicBlock *DispatchBB,
36944 int FI) const {
36945 const DebugLoc &DL = MI.getDebugLoc();
36946 MachineFunction *MF = MBB->getParent();
36947 MachineRegisterInfo *MRI = &MF->getRegInfo();
36948 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36949
36950 MVT PVT = getPointerTy(MF->getDataLayout());
36951 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36952
36953 unsigned Op = 0;
36954 unsigned VR = 0;
36955
36956 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36957 !isPositionIndependent();
36958
36959 if (UseImmLabel) {
36960 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36961 } else {
36962 const TargetRegisterClass *TRC =
36963 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36964 VR = MRI->createVirtualRegister(TRC);
36965 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36966
36967 if (Subtarget.is64Bit())
36968 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
36969 .addReg(X86::RIP)
36970 .addImm(1)
36971 .addReg(0)
36972 .addMBB(DispatchBB)
36973 .addReg(0);
36974 else
36975 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
36976 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36977 .addImm(1)
36978 .addReg(0)
36979 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36980 .addReg(0);
36981 }
36982
36983 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
36984 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36985 if (UseImmLabel)
36986 MIB.addMBB(DispatchBB);
36987 else
36988 MIB.addReg(VR);
36989}
36990
36991MachineBasicBlock *
36992X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36993 MachineBasicBlock *BB) const {
36994 const DebugLoc &DL = MI.getDebugLoc();
36995 MachineFunction *MF = BB->getParent();
36996 MachineRegisterInfo *MRI = &MF->getRegInfo();
36997 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36998 int FI = MF->getFrameInfo().getFunctionContextIndex();
36999
37000 // Get a mapping of the call site numbers to all of the landing pads they're
37001 // associated with.
37002 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37003 unsigned MaxCSNum = 0;
37004 for (auto &MBB : *MF) {
37005 if (!MBB.isEHPad())
37006 continue;
37007
37008 MCSymbol *Sym = nullptr;
37009 for (const auto &MI : MBB) {
37010 if (MI.isDebugInstr())
37011 continue;
37012
37013 assert(MI.isEHLabel() && "expected EH_LABEL");
37014 Sym = MI.getOperand(0).getMCSymbol();
37015 break;
37016 }
37017
37018 if (!MF->hasCallSiteLandingPad(Sym))
37019 continue;
37020
37021 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37022 CallSiteNumToLPad[CSI].push_back(&MBB);
37023 MaxCSNum = std::max(MaxCSNum, CSI);
37024 }
37025 }
37026
37027 // Get an ordered list of the machine basic blocks for the jump table.
37028 std::vector<MachineBasicBlock *> LPadList;
37029 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37030 LPadList.reserve(CallSiteNumToLPad.size());
37031
37032 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37033 for (auto &LP : CallSiteNumToLPad[CSI]) {
37034 LPadList.push_back(LP);
37035 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37036 }
37037 }
37038
37039 assert(!LPadList.empty() &&
37040 "No landing pad destinations for the dispatch jump table!");
37041
37042 // Create the MBBs for the dispatch code.
37043
37044 // Shove the dispatch's address into the return slot in the function context.
37045 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37046 DispatchBB->setIsEHPad(true);
37047
37048 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37049 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37050 DispatchBB->addSuccessor(TrapBB);
37051
37052 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37053 DispatchBB->addSuccessor(DispContBB);
37054
37055 // Insert MBBs.
37056 MF->push_back(DispatchBB);
37057 MF->push_back(DispContBB);
37058 MF->push_back(TrapBB);
37059
37060 // Insert code into the entry block that creates and registers the function
37061 // context.
37062 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37063
37064 // Create the jump table and associated information
37065 unsigned JTE = getJumpTableEncoding();
37066 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37067 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37068
37069 const X86RegisterInfo &RI = TII->getRegisterInfo();
37070 // Add a register mask with no preserved registers. This results in all
37071 // registers being marked as clobbered.
37072 if (RI.hasBasePointer(*MF)) {
37073 const bool FPIs64Bit =
37074 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37075 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37076 MFI->setRestoreBasePointer(MF);
37077
37078 Register FP = RI.getFrameRegister(*MF);
37079 Register BP = RI.getBaseRegister();
37080 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37081 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37082 MFI->getRestoreBasePointerOffset())
37083 .addRegMask(RI.getNoPreservedMask());
37084 } else {
37085 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37086 .addRegMask(RI.getNoPreservedMask());
37087 }
37088
37089 // IReg is used as an index in a memory operand and therefore can't be SP
37090 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37091 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37092 Subtarget.is64Bit() ? 8 : 4);
37093 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37094 .addReg(IReg)
37095 .addImm(LPadList.size());
37096 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37097
37098 if (Subtarget.is64Bit()) {
37099 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37100 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37101
37102 // leaq .LJTI0_0(%rip), BReg
37103 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37104 .addReg(X86::RIP)
37105 .addImm(1)
37106 .addReg(0)
37107 .addJumpTableIndex(MJTI)
37108 .addReg(0);
37109 // movzx IReg64, IReg
37110 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37111 .addImm(0)
37112 .addReg(IReg)
37113 .addImm(X86::sub_32bit);
37114
37115 switch (JTE) {
37116 case MachineJumpTableInfo::EK_BlockAddress:
37117 // jmpq *(BReg,IReg64,8)
37118 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37119 .addReg(BReg)
37120 .addImm(8)
37121 .addReg(IReg64)
37122 .addImm(0)
37123 .addReg(0);
37124 break;
37125 case MachineJumpTableInfo::EK_LabelDifference32: {
37126 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37127 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37128 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37129
37130 // movl (BReg,IReg64,4), OReg
37131 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37132 .addReg(BReg)
37133 .addImm(4)
37134 .addReg(IReg64)
37135 .addImm(0)
37136 .addReg(0);
37137 // movsx OReg64, OReg
37138 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37139 // addq BReg, OReg64, TReg
37140 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37141 .addReg(OReg64)
37142 .addReg(BReg);
37143 // jmpq *TReg
37144 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37145 break;
37146 }
37147 default:
37148 llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37148)
;
37149 }
37150 } else {
37151 // jmpl *.LJTI0_0(,IReg,4)
37152 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37153 .addReg(0)
37154 .addImm(4)
37155 .addReg(IReg)
37156 .addJumpTableIndex(MJTI)
37157 .addReg(0);
37158 }
37159
37160 // Add the jump table entries as successors to the MBB.
37161 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37162 for (auto &LP : LPadList)
37163 if (SeenMBBs.insert(LP).second)
37164 DispContBB->addSuccessor(LP);
37165
37166 // N.B. the order the invoke BBs are processed in doesn't matter here.
37167 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37168 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37169 for (MachineBasicBlock *MBB : InvokeBBs) {
37170 // Remove the landing pad successor from the invoke block and replace it
37171 // with the new dispatch block.
37172 // Keep a copy of Successors since it's modified inside the loop.
37173 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37174 MBB->succ_rend());
37175 // FIXME: Avoid quadratic complexity.
37176 for (auto *MBBS : Successors) {
37177 if (MBBS->isEHPad()) {
37178 MBB->removeSuccessor(MBBS);
37179 MBBLPads.push_back(MBBS);
37180 }
37181 }
37182
37183 MBB->addSuccessor(DispatchBB);
37184
37185 // Find the invoke call and mark all of the callee-saved registers as
37186 // 'implicit defined' so that they're spilled. This prevents code from
37187 // moving instructions to before the EH block, where they will never be
37188 // executed.
37189 for (auto &II : reverse(*MBB)) {
37190 if (!II.isCall())
37191 continue;
37192
37193 DenseMap<unsigned, bool> DefRegs;
37194 for (auto &MOp : II.operands())
37195 if (MOp.isReg())
37196 DefRegs[MOp.getReg()] = true;
37197
37198 MachineInstrBuilder MIB(*MF, &II);
37199 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37200 unsigned Reg = SavedRegs[RegIdx];
37201 if (!DefRegs[Reg])
37202 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37203 }
37204
37205 break;
37206 }
37207 }
37208
37209 // Mark all former landing pads as non-landing pads. The dispatch is the only
37210 // landing pad now.
37211 for (auto &LP : MBBLPads)
37212 LP->setIsEHPad(false);
37213
37214 // The instruction is gone now.
37215 MI.eraseFromParent();
37216 return BB;
37217}
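The bounds check and jump-table branch built in DispatchBB/DispContBB amount to the following, modelled here with an ordinary function-pointer table; the names are invented for the sketch and this is not how the final machine code is expressed.

static void dispatchSketch(unsigned CallSiteIdx, void (*const LPads[])(),
                           unsigned NumLPads) {
  if (CallSiteIdx >= NumLPads) // CMP32ri + JCC_1 with COND_AE
    __builtin_trap();          // TrapBB (X86::TRAP)
  LPads[CallSiteIdx]();        // DispContBB: indirect jump through the table entry
}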
37218
37219MachineBasicBlock *
37220X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37221 MachineBasicBlock *BB) const {
37222 MachineFunction *MF = BB->getParent();
37223 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37224 const DebugLoc &DL = MI.getDebugLoc();
37225
37226 auto TMMImmToTMMReg = [](unsigned Imm) {
37227 assert (Imm < 8 && "Illegal tmm index");
37228 return X86::TMM0 + Imm;
37229 };
37230 switch (MI.getOpcode()) {
37231 default: llvm_unreachable("Unexpected instr type to insert")::llvm::llvm_unreachable_internal("Unexpected instr type to insert"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37231)
;
37232 case X86::TLS_addr32:
37233 case X86::TLS_addr64:
37234 case X86::TLS_addrX32:
37235 case X86::TLS_base_addr32:
37236 case X86::TLS_base_addr64:
37237 case X86::TLS_base_addrX32:
37238 return EmitLoweredTLSAddr(MI, BB);
37239 case X86::INDIRECT_THUNK_CALL32:
37240 case X86::INDIRECT_THUNK_CALL64:
37241 case X86::INDIRECT_THUNK_TCRETURN32:
37242 case X86::INDIRECT_THUNK_TCRETURN64:
37243 return EmitLoweredIndirectThunk(MI, BB);
37244 case X86::CATCHRET:
37245 return EmitLoweredCatchRet(MI, BB);
37246 case X86::SEG_ALLOCA_32:
37247 case X86::SEG_ALLOCA_64:
37248 return EmitLoweredSegAlloca(MI, BB);
37249 case X86::PROBED_ALLOCA_32:
37250 case X86::PROBED_ALLOCA_64:
37251 return EmitLoweredProbedAlloca(MI, BB);
37252 case X86::TLSCall_32:
37253 case X86::TLSCall_64:
37254 return EmitLoweredTLSCall(MI, BB);
37255 case X86::CMOV_FR16:
37256 case X86::CMOV_FR16X:
37257 case X86::CMOV_FR32:
37258 case X86::CMOV_FR32X:
37259 case X86::CMOV_FR64:
37260 case X86::CMOV_FR64X:
37261 case X86::CMOV_GR8:
37262 case X86::CMOV_GR16:
37263 case X86::CMOV_GR32:
37264 case X86::CMOV_RFP32:
37265 case X86::CMOV_RFP64:
37266 case X86::CMOV_RFP80:
37267 case X86::CMOV_VR64:
37268 case X86::CMOV_VR128:
37269 case X86::CMOV_VR128X:
37270 case X86::CMOV_VR256:
37271 case X86::CMOV_VR256X:
37272 case X86::CMOV_VR512:
37273 case X86::CMOV_VK1:
37274 case X86::CMOV_VK2:
37275 case X86::CMOV_VK4:
37276 case X86::CMOV_VK8:
37277 case X86::CMOV_VK16:
37278 case X86::CMOV_VK32:
37279 case X86::CMOV_VK64:
37280 return EmitLoweredSelect(MI, BB);
37281
37282 case X86::RDFLAGS32:
37283 case X86::RDFLAGS64: {
37284 unsigned PushF =
37285 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
37286 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
37287 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
37288 // Permit reads of the EFLAGS and DF registers without them being defined.
37289 // This intrinsic exists to read external processor state in flags, such as
37290 // the trap flag, interrupt flag, and direction flag, none of which are
37291 // modeled by the backend.
37292 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
37293 "Unexpected register in operand!");
37294 Push->getOperand(2).setIsUndef();
37295 assert(Push->getOperand(3).getReg() == X86::DF &&
37296 "Unexpected register in operand!");
37297 Push->getOperand(3).setIsUndef();
37298 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
37299
37300 MI.eraseFromParent(); // The pseudo is gone now.
37301 return BB;
37302 }
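For reference, the PUSHF/POP pair emitted for RDFLAGS corresponds to the usual read-the-flags idiom. A minimal sketch, assuming a GCC-compatible compiler on x86-64 (the function name is made up and this is not how the pseudo itself is defined):

static unsigned long readFlagsSketch() {
  unsigned long Flags;
  asm volatile("pushfq\n\tpopq %0" : "=r"(Flags)); // PUSHF64 then POP64r
  return Flags;
}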
37303
37304 case X86::WRFLAGS32:
37305 case X86::WRFLAGS64: {
37306 unsigned Push =
37307 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
37308 unsigned PopF =
37309 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
37310 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
37311 BuildMI(*BB, MI, DL, TII->get(PopF));
37312
37313 MI.eraseFromParent(); // The pseudo is gone now.
37314 return BB;
37315 }
37316
37317 case X86::FP32_TO_INT16_IN_MEM:
37318 case X86::FP32_TO_INT32_IN_MEM:
37319 case X86::FP32_TO_INT64_IN_MEM:
37320 case X86::FP64_TO_INT16_IN_MEM:
37321 case X86::FP64_TO_INT32_IN_MEM:
37322 case X86::FP64_TO_INT64_IN_MEM:
37323 case X86::FP80_TO_INT16_IN_MEM:
37324 case X86::FP80_TO_INT32_IN_MEM:
37325 case X86::FP80_TO_INT64_IN_MEM: {
37326 // Change the floating point control register to use "round towards zero"
37327 // mode when truncating to an integer value.
37328 int OrigCWFrameIdx =
37329 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37330 addFrameReference(BuildMI(*BB, MI, DL,
37331 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
37332
37333 // Load the old value of the control word...
37334 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37335 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37336 OrigCWFrameIdx);
37337
37338 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37339 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37340 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37341 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37342
37343 // Extract to 16 bits.
37344 Register NewCW16 =
37345 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37346 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37347 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37348
37349 // Prepare memory for FLDCW.
37350 int NewCWFrameIdx =
37351 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37352 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37353 NewCWFrameIdx)
37354 .addReg(NewCW16, RegState::Kill);
37355
37356 // Reload the modified control word now...
37357 addFrameReference(BuildMI(*BB, MI, DL,
37358 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37359
37360 // Get the X86 opcode to use.
37361 unsigned Opc;
37362 switch (MI.getOpcode()) {
37363 default: llvm_unreachable("illegal opcode!");
37364 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37365 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37366 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37367 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37368 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37369 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37370 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37371 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37372 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37373 }
37374
37375 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37376 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
37377 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37378
37379 // Reload the original control word now.
37380 addFrameReference(BuildMI(*BB, MI, DL,
37381 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
37382
37383 MI.eraseFromParent(); // The pseudo instruction is gone now.
37384 return BB;
37385 }
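As a hedged aside on the rounding-mode dance above (not part of the original file, and 0x037F is just the commonly seen post-FNINIT control word): the x87 rounding-control field lives in bits 11:10, so OR-ing in 0xC00 forces round-toward-zero while leaving every other field alone.
// Illustrative only, assuming the default control word 0x037F:
// RC (bits 11:10) becomes 0b11 == round toward zero; nothing else changes.
static_assert((0x037F | 0x0C00) == 0x0F7F, "RC field forced to truncate");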
37386
37387 // xbegin
37388 case X86::XBEGIN:
37389 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37390
37391 case X86::VAARG_64:
37392 case X86::VAARG_X32:
37393 return EmitVAARGWithCustomInserter(MI, BB);
37394
37395 case X86::EH_SjLj_SetJmp32:
37396 case X86::EH_SjLj_SetJmp64:
37397 return emitEHSjLjSetJmp(MI, BB);
37398
37399 case X86::EH_SjLj_LongJmp32:
37400 case X86::EH_SjLj_LongJmp64:
37401 return emitEHSjLjLongJmp(MI, BB);
37402
37403 case X86::Int_eh_sjlj_setup_dispatch:
37404 return EmitSjLjDispatchBlock(MI, BB);
37405
37406 case TargetOpcode::STATEPOINT:
37407 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37408 // this point in the process. We diverge later.
37409 return emitPatchPoint(MI, BB);
37410
37411 case TargetOpcode::STACKMAP:
37412 case TargetOpcode::PATCHPOINT:
37413 return emitPatchPoint(MI, BB);
37414
37415 case TargetOpcode::PATCHABLE_EVENT_CALL:
37416 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37417 return BB;
37418
37419 case X86::LCMPXCHG8B: {
37420 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37421 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
37422 // requires a memory operand. If the current architecture is i686 and the
37423 // current function needs a base pointer - which is ESI on i686 - the register
37424 // allocator cannot allocate registers for an address of the form
37425 // X(%reg, %reg, Y): there would never be enough unreserved registers during
37426 // regalloc (without the base pointer the only option would be
37427 // X(%edi, %esi, Y)).
37428 // We help the register allocator by precomputing the address in a new vreg
37429 // using LEA.
37430
37431 // If it is not i686 or there is no base pointer - nothing to do here.
37432 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37433 return BB;
37434
37435 // Even though this code does not necessarily need the base pointer to
37436 // be ESI, we check for that. The reason: if this assert fails, the
37437 // compiler's base pointer handling has changed in a way that most
37438 // probably has to be addressed here as well.
37439 assert(TRI->getBaseRegister() == X86::ESI &&
37440 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37441 "base pointer in mind");
37442
37443 MachineRegisterInfo &MRI = MF->getRegInfo();
37444 MVT SPTy = getPointerTy(MF->getDataLayout());
37445 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37446 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37447
37448 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37449 // Regalloc does not need any help when the memory operand of CMPXCHG8B
37450 // does not use an index register.
37451 if (AM.IndexReg == X86::NoRegister)
37452 return BB;
37453
37454 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37455 // four operand definitions that are E[ABCD] registers. We skip them and
37456 // then insert the LEA.
37457 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
37458 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
37459 RMBBI->definesRegister(X86::EBX) ||
37460 RMBBI->definesRegister(X86::ECX) ||
37461 RMBBI->definesRegister(X86::EDX))) {
37462 ++RMBBI;
37463 }
37464 MachineBasicBlock::iterator MBBI(RMBBI);
37465 addFullAddress(
37466 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
37467
37468 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
37469
37470 return BB;
37471 }
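A hedged sketch of why the LEA helps (the operand 8(%edi,%ebp,4) is invented for illustration, not taken from real output): CMPXCHG8B already ties up EAX, EBX, ECX and EDX, and with ESI reserved as the base pointer a two-register address can leave regalloc with nothing to spare, so the address is collapsed into one virtual register first.
// Illustrative only - hypothetical before/after, not actual codegen output:
//   lock cmpxchg8b 8(%edi,%ebp,4)   ; needs two free address registers
// becomes
//   leal 8(%edi,%ebp,4), %reg       ; address precomputed into one vreg
//   lock cmpxchg8b (%reg)           ; memory operand now needs only one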
37472 case X86::LCMPXCHG16B_NO_RBX: {
37473 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37474 Register BasePtr = TRI->getBaseRegister();
37475 if (TRI->hasBasePointer(*MF) &&
37476 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
37477 if (!BB->isLiveIn(BasePtr))
37478 BB->addLiveIn(BasePtr);
37479 // Save RBX into a virtual register.
37480 Register SaveRBX =
37481 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37482 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
37483 .addReg(X86::RBX);
37484 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37485 MachineInstrBuilder MIB =
37486 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37487 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37488 MIB.add(MI.getOperand(Idx));
37489 MIB.add(MI.getOperand(X86::AddrNumOperands));
37490 MIB.addReg(SaveRBX);
37491 } else {
37492 // Simple case, just copy the virtual register to RBX.
37493 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
37494 .add(MI.getOperand(X86::AddrNumOperands));
37495 MachineInstrBuilder MIB =
37496 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
37497 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37498 MIB.add(MI.getOperand(Idx));
37499 }
37500 MI.eraseFromParent();
37501 return BB;
37502 }
37503 case X86::MWAITX: {
37504 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37505 Register BasePtr = TRI->getBaseRegister();
37506 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
37507 // If there is no need to save the base pointer, we generate MWAITXrrr;
37508 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
37509 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37510 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
37511 .addReg(MI.getOperand(0).getReg());
37512 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
37513 .addReg(MI.getOperand(1).getReg());
37514 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
37515 .addReg(MI.getOperand(2).getReg());
37516 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
37517 MI.eraseFromParent();
37518 } else {
37519 if (!BB->isLiveIn(BasePtr)) {
37520 BB->addLiveIn(BasePtr);
37521 }
37522 // Parameters can be copied into ECX and EAX but not EBX yet.
37523 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
37524 .addReg(MI.getOperand(0).getReg());
37525 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
37526 .addReg(MI.getOperand(1).getReg());
37527 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37528 // Save RBX into a virtual register.
37529 Register SaveRBX =
37530 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37531 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
37532 .addReg(X86::RBX);
37533 // Generate mwaitx pseudo.
37534 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37535 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
37536 .addDef(Dst) // Destination tied in with SaveRBX.
37537 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
37538 .addUse(SaveRBX); // Save of base pointer.
37539 MI.eraseFromParent();
37540 }
37541 return BB;
37542 }
37543 case TargetOpcode::PREALLOCATED_SETUP: {
37544 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37545 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
37546 MFI->setHasPreallocatedCall(true);
37547 int64_t PreallocatedId = MI.getOperand(0).getImm();
37548 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37549 assert(StackAdjustment != 0 && "0 stack adjustment");
37550 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
37551 << StackAdjustment << "\n");
37552 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
37553 .addReg(X86::ESP)
37554 .addImm(StackAdjustment);
37555 MI.eraseFromParent();
37556 return BB;
37557 }
37558 case TargetOpcode::PREALLOCATED_ARG: {
37559 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37560 int64_t PreallocatedId = MI.getOperand(1).getImm();
37561 int64_t ArgIdx = MI.getOperand(2).getImm();
37562 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
37563 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37564 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
37565 << ", arg offset " << ArgOffset << "\n");
37566 // stack pointer + offset
37567 addRegOffset(
37568 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
37569 X86::ESP, false, ArgOffset);
37570 MI.eraseFromParent();
37571 return BB;
37572 }
37573 case X86::PTDPBSSD:
37574 case X86::PTDPBSUD:
37575 case X86::PTDPBUSD:
37576 case X86::PTDPBUUD:
37577 case X86::PTDPBF16PS:
37578 case X86::PTDPFP16PS: {
37579 unsigned Opc;
37580 switch (MI.getOpcode()) {
37581 default: llvm_unreachable("illegal opcode!");
37582 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
37583 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
37584 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
37585 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
37586 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
37587 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
37588 }
37589
37590 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37591 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37592 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37593 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37594 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37595
37596 MI.eraseFromParent(); // The pseudo is gone now.
37597 return BB;
37598 }
37599 case X86::PTILEZERO: {
37600 unsigned Imm = MI.getOperand(0).getImm();
37601 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37602 MI.eraseFromParent(); // The pseudo is gone now.
37603 return BB;
37604 }
37605 case X86::PTILELOADD:
37606 case X86::PTILELOADDT1:
37607 case X86::PTILESTORED: {
37608 unsigned Opc;
37609 switch (MI.getOpcode()) {
37610 default: llvm_unreachable("illegal opcode!");
37611 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
37612 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
37613 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
37614 }
37615
37616 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37617 unsigned CurOp = 0;
37618 if (Opc != X86::TILESTORED)
37619 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37620 RegState::Define);
37621
37622 MIB.add(MI.getOperand(CurOp++)); // base
37623 MIB.add(MI.getOperand(CurOp++)); // scale
37624 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37625 MIB.add(MI.getOperand(CurOp++)); // displacement
37626 MIB.add(MI.getOperand(CurOp++)); // segment
37627
37628 if (Opc == X86::TILESTORED)
37629 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37630 RegState::Undef);
37631
37632 MI.eraseFromParent(); // The pseudo is gone now.
37633 return BB;
37634 }
37635 }
37636}
37637
37638//===----------------------------------------------------------------------===//
37639// X86 Optimization Hooks
37640//===----------------------------------------------------------------------===//
37641
37642bool
37643X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
37644 const APInt &DemandedBits,
37645 const APInt &DemandedElts,
37646 TargetLoweringOpt &TLO) const {
37647 EVT VT = Op.getValueType();
37648 unsigned Opcode = Op.getOpcode();
37649 unsigned EltSize = VT.getScalarSizeInBits();
37650
37651 if (VT.isVector()) {
37652 // If the constant is only all signbits in the active bits, then we should
37653 // extend it to the entire constant to allow it to act as a boolean constant
37654 // vector.
37655 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
37656 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
37657 return false;
37658 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
37659 if (!DemandedElts[i] || V.getOperand(i).isUndef())
37660 continue;
37661 const APInt &Val = V.getConstantOperandAPInt(i);
37662 if (Val.getBitWidth() > Val.getNumSignBits() &&
37663 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
37664 return true;
37665 }
37666 return false;
37667 };
37668 // For vectors - if we have a constant, then try to sign extend.
37669 // TODO: Handle AND/ANDN cases.
37670 unsigned ActiveBits = DemandedBits.getActiveBits();
37671 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
37672 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
37673 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
37674 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
37675 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
37676 VT.getVectorNumElements());
37677 SDValue NewC =
37678 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
37679 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
37680 SDValue NewOp =
37681 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
37682 return TLO.CombineTo(Op, NewOp);
37683 }
37684 return false;
37685 }
37686
37687 // Only optimize Ands to prevent shrinking a constant that could be
37688 // matched by movzx.
37689 if (Opcode != ISD::AND)
37690 return false;
37691
37692 // Make sure the RHS really is a constant.
37693 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
37694 if (!C)
37695 return false;
37696
37697 const APInt &Mask = C->getAPIntValue();
37698
37699 // Clear all non-demanded bits initially.
37700 APInt ShrunkMask = Mask & DemandedBits;
37701
37702 // Find the width of the shrunk mask.
37703 unsigned Width = ShrunkMask.getActiveBits();
37704
37705 // If the mask is all 0s there's nothing to do here.
37706 if (Width == 0)
37707 return false;
37708
37709 // Find the next power of 2 width, rounding up to a byte.
37710 Width = PowerOf2Ceil(std::max(Width, 8U));
37711 // Truncate the width to size to handle illegal types.
37712 Width = std::min(Width, EltSize);
37713
37714 // Calculate a possible zero extend mask for this constant.
37715 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
37716
37717 // If we aren't changing the mask, just return true to keep it and prevent
37718 // the caller from optimizing.
37719 if (ZeroExtendMask == Mask)
37720 return true;
37721
37722 // Make sure the new mask can be represented by a combination of mask bits
37723 // and non-demanded bits.
37724 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
37725 return false;
37726
37727 // Replace the constant with the zero extend mask.
37728 SDLoc DL(Op);
37729 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
37730 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
37731 return TLO.CombineTo(Op, NewOp);
37732}
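A worked trace of the scalar AND path may make the mask-widening clearer; the numbers are invented, not taken from a real compile. With Mask = 0x1FF and DemandedBits = 0xFF, the shrunk mask has 8 active bits, which rounds up to the byte-sized zero-extend mask 0xFF; that differs from the original mask yet is covered by Mask | ~DemandedBits, so the AND constant is rewritten to the movzx-friendly 0xFF.
// Illustrative trace, assuming a 32-bit AND (values are made up):
//   Mask = 0x1FF, DemandedBits = 0xFF
//   ShrunkMask = 0xFF -> Width = 8 -> PowerOf2Ceil(max(8, 8)) = 8
//   ZeroExtendMask = 0xFF != Mask and is a subset of Mask | ~DemandedBits,
//   so the AND constant becomes 0xFF.
static_assert((0xFFu & ~(0x1FFu | ~0xFFu)) == 0,
              "0xFF is a subset of Mask | ~DemandedBits");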
37733
37734void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
37735 KnownBits &Known,
37736 const APInt &DemandedElts,
37737 const SelectionDAG &DAG,
37738 unsigned Depth) const {
37739 unsigned BitWidth = Known.getBitWidth();
37740 unsigned NumElts = DemandedElts.getBitWidth();
37741 unsigned Opc = Op.getOpcode();
37742 EVT VT = Op.getValueType();
37743 assert((Opc >= ISD::BUILTIN_OP_END ||
37744 Opc == ISD::INTRINSIC_WO_CHAIN ||
37745 Opc == ISD::INTRINSIC_W_CHAIN ||
37746 Opc == ISD::INTRINSIC_VOID) &&
37747 "Should use MaskedValueIsZero if you don't know whether Op"
37748 " is a target node!");
37749
37750 Known.resetAll();
37751 switch (Opc) {
37752 default: break;
37753 case X86ISD::SETCC:
37754 Known.Zero.setBitsFrom(1);
37755 break;
37756 case X86ISD::MOVMSK: {
37757 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
37758 Known.Zero.setBitsFrom(NumLoBits);
37759 break;
37760 }
37761 case X86ISD::PEXTRB:
37762 case X86ISD::PEXTRW: {
37763 SDValue Src = Op.getOperand(0);
37764 EVT SrcVT = Src.getValueType();
37765 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
37766 Op.getConstantOperandVal(1));
37767 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
37768 Known = Known.anyextOrTrunc(BitWidth);
37769 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
37770 break;
37771 }
37772 case X86ISD::VSRAI:
37773 case X86ISD::VSHLI:
37774 case X86ISD::VSRLI: {
37775 unsigned ShAmt = Op.getConstantOperandVal(1);
37776 if (ShAmt >= VT.getScalarSizeInBits()) {
37777 // Out of range logical bit shifts are guaranteed to be zero.
37778 // Out of range arithmetic bit shifts splat the sign bit.
37779 if (Opc != X86ISD::VSRAI) {
37780 Known.setAllZero();
37781 break;
37782 }
37783
37784 ShAmt = VT.getScalarSizeInBits() - 1;
37785 }
37786
37787 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37788 if (Opc == X86ISD::VSHLI) {
37789 Known.Zero <<= ShAmt;
37790 Known.One <<= ShAmt;
37791 // Low bits are known zero.
37792 Known.Zero.setLowBits(ShAmt);
37793 } else if (Opc == X86ISD::VSRLI) {
37794 Known.Zero.lshrInPlace(ShAmt);
37795 Known.One.lshrInPlace(ShAmt);
37796 // High bits are known zero.
37797 Known.Zero.setHighBits(ShAmt);
37798 } else {
37799 Known.Zero.ashrInPlace(ShAmt);
37800 Known.One.ashrInPlace(ShAmt);
37801 }
37802 break;
37803 }
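A short illustrative note on the shift cases above (the element width and shift amount are arbitrary): for a v4i32 VSHLI by 8 the low 8 bits of each element become known zero, for VSRLI by 8 the high 8 bits do, and an out-of-range VSRAI is clamped to 31 so each element is a splat of its sign bit.
// Illustrative only: left-shifting always clears the low ShAmt bits,
// mirroring Known.Zero.setLowBits(ShAmt) above.
static_assert(((0xFFFFFFFFu << 8) & 0xFFu) == 0,
              "VSHLI leaves the low ShAmt bits known zero");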
37804 case X86ISD::PACKUS: {
37805 // PACKUS is just a truncation if the upper half is zero.
37806 APInt DemandedLHS, DemandedRHS;
37807 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37808
37809 Known.One = APInt::getAllOnes(BitWidth * 2);
37810 Known.Zero = APInt::getAllOnes(BitWidth * 2);
37811
37812 KnownBits Known2;
37813 if (!!DemandedLHS) {
37814 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
37815 Known = KnownBits::commonBits(Known, Known2);
37816 }
37817 if (!!DemandedRHS) {
37818 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
37819 Known = KnownBits::commonBits(Known, Known2);
37820 }
37821
37822 if (Known.countMinLeadingZeros() < BitWidth)
37823 Known.resetAll();
37824 Known = Known.trunc(BitWidth);
37825 break;
37826 }
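For context on the PACKUS check, a hedged example using the v8i16 to v16i8 pairing (just one possible instance): PACKUSWB saturates each i16 to [0, 255], so the pack is only a plain truncation when the upper 8 bits of every demanded source element are known zero, which is what the countMinLeadingZeros() requirement enforces at the doubled bit width.
// Illustrative only: 0x00AB packs to 0xAB unchanged, but a value such as
// 0x01AB would saturate to 0xFF rather than truncate to 0xAB.
static_assert((0x01ABu & 0xFFu) == 0xABu && 0x01ABu > 0xFFu,
              "truncation and saturation disagree once the high byte is set");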
37827 case X86ISD::VBROADCAST: {
37828 SDValue Src = Op.getOperand(0);
37829 if (!Src.getSimpleValueType().isVector()) {
37830 Known = DAG.computeKnownBits(Src, Depth + 1);
37831 return;
37832 }
37833 break;
37834 }
37835 case X86ISD::AND: {
37836 if (Op.getResNo() == 0) {
37837 KnownBits Known2;
37838 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37839 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37840 Known &= Known2;
37841 }
37842 break;
37843 }
37844 case X86ISD::ANDNP: {
37845 KnownBits Known2;
37846 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37847 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37848
37849 // ANDNP = (~X & Y);
37850 Known.One &= Known2.Zero;
37851 Known.Zero |= Known2.One;
37852 break;
37853 }
37854 case X86ISD::FOR: {
37855 KnownBits Known2;
37856 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37857 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37858
37859 Known |= Known2;
37860 break;
37861 }
37862 case X86ISD::PSADBW: {
37863 assert(VT.getScalarType() == MVT::i64 &&
37864 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
37865 "Unexpected PSADBW types");
37866
37867 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
37868 Known.Zero.setBitsFrom(16);
37869 break;
37870 }
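A brief justification for the constant 16 above, with illustrative arithmetic: each 64-bit PSADBW lane is a sum of eight absolute byte differences, so it can never exceed 8 * 255 = 2040 and always fits in the low 16 bits.
// Illustrative only: the largest possible lane value still fits in 16 bits.
static_assert(8 * 255 < (1 << 16), "PSADBW lane result fits in 16 bits");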
37871 case X86ISD::PMULUDQ: {
37872 KnownBits Known2;
37873 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37874 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37875
37876 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37877 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37878 Known = KnownBits::mul(Known, Known2);
37879 break;
37880 }
37881 case X86ISD::CMOV: {
37882 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37883 // If we don't know any bits, early out.
37884 if (Known.isUnknown())
37885 break;
37886 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37887
37888 // Only known if known in both the LHS and RHS.
37889 Known = KnownBits::commonBits(Known, Known2);
37890 break;
37891 }
37892 case X86ISD::BEXTR:
37893 case X86ISD::BEXTRI: {
37894 SDValue Op0 = Op.getOperand(0);
37895 SDValue Op1 = Op.getOperand(1);
37896
37897 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37898 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37899 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37900
37901 // If the length is 0, the result is 0.
37902 if (Length == 0) {
37903 Known.setAllZero();
37904 break;
37905 }
37906
37907 if ((Shift + Length) <= BitWidth) {
37908 Known = DAG.computeKnownBits(Op0, Depth + 1);
37909 Known = Known.extractBits(Length, Shift);
37910 Known = Known.zextOrTrunc(BitWidth);
37911 }
37912 }
37913 break;
37914 }
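To illustrate the control decoding above (0x0804 is an arbitrary example value): bits 7:0 of the BEXTR control hold the start bit and bits 15:8 the length, so 0x0804 extracts 8 bits starting at bit 4, i.e. bits [11:4] of the source, zero-extended into the result.
// Illustrative only: decoding the hypothetical control value 0x0804.
static_assert(((0x0804 >> 0) & 0xFF) == 4 && ((0x0804 >> 8) & 0xFF) == 8,
              "start = 4, length = 8 -> extract bits [11:4]");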
37915 case X86ISD::PDEP: {
37916 KnownBits Known2;
37917 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37918 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37919 // Zeros are retained from the mask operand. But not ones.
37920 Known.One.clearAllBits();
37921 // The result will have at least as many trailing zeros as the non-mask
37922 // operand since bits can only map to the same or higher bit position.
37923 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37924 break;
37925 }
37926 case X86ISD::PEXT: {
37927 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37928 // The result has as many leading zeros as the number of zeroes in the mask.
37929 unsigned Count = Known.Zero.countPopulation();
37930 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37931 Known.One.clearAllBits();
37932 break;
37933 }
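A hedged example for the PDEP/PEXT cases, using the arbitrary 32-bit mask 0x0000FF00: PDEP scatters the low 8 source bits into bit positions 8..15, so every position where the mask is zero stays zero, and PEXT gathers bits 8..15 into the low byte, leaving at least 32 - popcount(mask) = 24 known leading zeros.
// Illustrative only, mask = 0x0000FF00 (a made-up example):
//   PDEP(src, mask): bits outside 8..15 are zero (the mask's zeros survive).
//   PEXT(src, mask): the result fits in 8 bits, so bits 8..31 are known zero.
static_assert(32 - 8 == 24, "leading zeros implied by an 8-bit mask");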
37934 case X86ISD::VTRUNC:
37935 case X86ISD::VTRUNCS:
37936 case X86ISD::VTRUNCUS:
37937 case X86ISD::CVTSI2P:
37938 case X86ISD::CVTUI2P:
37939 case X86ISD::CVTP2SI:
37940 case X86ISD::CVTP2UI:
37941 case X86ISD::MCVTP2SI:
37942 case X86ISD::MCVTP2UI:
37943 case X86ISD::CVTTP2SI:
37944 case X86ISD::CVTTP2UI:
37945 case X86ISD::MCVTTP2SI:
37946 case X86ISD::MCVTTP2UI:
37947 case X86ISD::MCVTSI2P:
37948 case X86ISD::MCVTUI2P:
37949 case X86ISD::VFPROUND:
37950 case X86ISD::VMFPROUND:
37951 case X86ISD::CVTPS2PH:
37952 case X86ISD::MCVTPS2PH: {
37953 // Truncations/Conversions - upper elements are known zero.
37954 EVT SrcVT = Op.getOperand(0).getValueType();
37955 if (SrcVT.isVector()) {
37956 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37957 if (NumElts > NumSrcElts &&
37958 DemandedElts.countTrailingZeros() >= NumSrcElts)
37959 Known.setAllZero();
37960 }
37961 break;
37962 }
37963 case X86ISD::STRICT_CVTTP2SI:
37964 case X86ISD::STRICT_CVTTP2UI:
37965 case X86ISD::STRICT_CVTSI2P:
37966 case X86ISD::STRICT_CVTUI2P:
37967 case X86ISD::STRICT_VFPROUND:
37968 case X86ISD::STRICT_CVTPS2PH: {
37969 // Strict Conversions - upper elements are known zero.
37970 EVT SrcVT = Op.getOperand(1).getValueType();
37971 if (SrcVT.isVector()) {
37972 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37973 if (NumElts > NumSrcElts &&
37974 DemandedElts.countTrailingZeros() >= NumSrcElts)
37975 Known.setAllZero();
37976 }
37977 break;
37978 }
37979 case X86ISD::MOVQ2DQ: {
37980 // Move from MMX to XMM. Upper half of XMM should be 0.
37981 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
37982 Known.setAllZero();
37983 break;
37984 }
37985 case X86ISD::VBROADCAST_LOAD: {
37986 APInt UndefElts;
37987 SmallVector<APInt, 16> EltBits;
37988 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37989 /*AllowWholeUndefs*/ false,
37990 /*AllowPartialUndefs*/ false)) {
37991 Known.Zero.setAllBits();
37992 Known.One.setAllBits();
37993 for (unsigned I = 0; I != NumElts; ++I) {
37994 if (!DemandedElts[I])
37995 continue;
37996 if (UndefElts[I]) {
37997 Known.resetAll();
37998 break;
37999 }
38000 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38001 Known = KnownBits::commonBits(Known, Known2);
38002 }
38003 return;
38004 }
38005 break;
38006 }
38007 }
38008
38009 // Handle target shuffles.
38010 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38011 if (isTargetShuffle(Opc)) {
38012 SmallVector<int, 64> Mask;
38013 SmallVector<SDValue, 2> Ops;
38014 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38015 unsigned NumOps = Ops.size();
38016 unsigned NumElts = VT.getVectorNumElements();
38017 if (Mask.size() == NumElts) {
38018 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38019 Known.Zero.setAllBits(); Known.One.setAllBits();
38020 for (unsigned i = 0; i != NumElts; ++i) {
38021 if (!DemandedElts[i])
38022 continue;
38023 int M = Mask[i];
38024 if (M == SM_SentinelUndef) {
38025 // For UNDEF elements, we don't know anything about the common state
38026 // of the shuffle result.
38027 Known.resetAll();
38028 break;
38029 }
38030 if (M == SM_SentinelZero) {
38031 Known.One.clearAllBits();
38032 continue;
38033 }
38034 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38035 "Shuffle index out of range");
38036
38037 unsigned OpIdx = (unsigned)M / NumElts;
38038 unsigned EltIdx = (unsigned)M % NumElts;
38039 if (Ops[OpIdx].getValueType() != VT) {
38040 // TODO - handle target shuffle ops with different value types.
38041 Known.resetAll();
38042 break;
38043 }
38044 DemandedOps[OpIdx].setBit(EltIdx);
38045 }
38046 // Known bits are the values that are shared by every demanded element.
38047 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38048 if (!DemandedOps[i])
38049 continue;
38050 KnownBits Known2 =
38051 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38052 Known = KnownBits::commonBits(Known, Known2);
38053 }
38054 }
38055 }
38056 }
38057}
38058
38059unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38060 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38061 unsigned Depth) const {
38062 EVT VT = Op.getValueType();
38063 unsigned VTBits = VT.getScalarSizeInBits();
38064 unsigned Opcode = Op.getOpcode();
38065 switch (Opcode) {
38066 case X86ISD::SETCC_CARRY:
38067 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38068 return VTBits;
38069
38070 case X86ISD::VTRUNC: {
38071 SDValue Src = Op.getOperand(0);
38072 MVT SrcVT = Src.getSimpleValueType();
38073 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38074 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38075 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38076 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38077 if (Tmp > (NumSrcBits - VTBits))
38078 return Tmp - (NumSrcBits - VTBits);
38079 return 1;
38080 }
38081
38082 case X86ISD::PACKSS: {
38083 // PACKSS is just a truncation if the sign bits extend to the packed size.
38084 APInt DemandedLHS, DemandedRHS;
38085 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38086 DemandedRHS);
38087
38088 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38089 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38090 if (!!DemandedLHS)
38091 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38092 if (!!DemandedRHS)
38093 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38094 unsigned Tmp = std::min(Tmp0, Tmp1);
38095 if (Tmp > (SrcBits - VTBits))
38096 return Tmp - (SrcBits - VTBits);
38097 return 1;
38098 }
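A worked example of the PACKSS rule, assuming a v8i16 to v16i8 pack with invented sign-bit counts: a source with 12 known sign bits keeps 12 - (16 - 8) = 4 sign bits in each packed byte, while a source with only 8 sign bits falls through to the conservative answer of 1.
// Illustrative only: SrcBits = 16, VTBits = 8.
static_assert(12 - (16 - 8) == 4, "sign bits surviving a PACKSSWB-style pack");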
38099
38100 case X86ISD::VBROADCAST: {
38101 SDValue Src = Op.getOperand(0);
38102 if (!Src.getSimpleValueType().isVector())
38103 return DAG.ComputeNumSignBits(Src, Depth + 1);
38104 break;
38105 }
38106
38107 case X86ISD::VSHLI: {
38108 SDValue Src = Op.getOperand(0);
38109 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38110 if (ShiftVal.uge(VTBits))
38111 return VTBits; // Shifted all bits out --> zero.
38112 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38113 if (ShiftVal.uge(Tmp))
38114 return 1; // Shifted all sign bits out --> unknown.
38115 return Tmp - ShiftVal.getZExtValue();
38116 }
38117
38118 case X86ISD::VSRAI: {
38119 SDValue Src = Op.getOperand(0);
38120 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38121 if (ShiftVal.uge(VTBits - 1))
38122 return VTBits; // Sign splat.
38123 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38124 ShiftVal += Tmp;
38125 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38126 }
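An illustrative instance of the VSRAI rule above (the counts are arbitrary): if a v4i32 source is already known to have 20 sign bits, an arithmetic shift right by 5 yields min(32, 20 + 5) = 25 sign bits, and any shift of 31 or more turns the element into a pure sign splat.
// Illustrative only: replaying the arithmetic of the VSRAI case.
static_assert(20 + 5 == 25 && 25 <= 32, "sign bits grow by the shift amount");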
38127
38128 case X86ISD::FSETCC:
38129 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38130 if (VT == MVT::f32 || VT == MVT::f64 ||
38131 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38132 return VTBits;
38133 break;
38134
38135 case X86ISD::PCMPGT:
38136 case X86ISD::PCMPEQ:
38137 case X86ISD::CMPP:
38138 case X86ISD::VPCOM:
38139 case X86ISD::VPCOMU:
38140 // Vector compares return zero/all-bits result values.
38141 return VTBits;
38142
38143 case X86ISD::ANDNP: {
38144 unsigned Tmp0 =
38145 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38146 if (Tmp0 == 1) return 1; // Early out.
38147 unsigned Tmp1 =
38148 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38149 return std::min(Tmp0, Tmp1);
38150 }
38151
38152 case X86ISD::CMOV: {
38153 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38154 if (Tmp0 == 1) return 1; // Early out.
38155 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38156 return std::min(Tmp0, Tmp1);
38157 }
38158 }
38159
38160 // Handle target shuffles.
38161 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38162 if (isTargetShuffle(Opcode)) {
38163 SmallVector<int, 64> Mask;
38164 SmallVector<SDValue, 2> Ops;
38165 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38166 unsigned NumOps = Ops.size();
38167 unsigned NumElts = VT.getVectorNumElements();
38168 if (Mask.size() == NumElts) {
38169 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38170 for (unsigned i = 0; i != NumElts; ++i) {
38171 if (!DemandedElts[i])
38172 continue;
38173 int M = Mask[i];
38174 if (M == SM_SentinelUndef) {
38175 // For UNDEF elements, we don't know anything about the common state
38176 // of the shuffle result.
38177 return 1;
38178 } else if (M == SM_SentinelZero) {
38179 // Zero = all sign bits.
38180 continue;
38181 }
38182 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38183 "Shuffle index out of range");
38184
38185 unsigned OpIdx = (unsigned)M / NumElts;
38186 unsigned EltIdx = (unsigned)M % NumElts;
38187 if (Ops[OpIdx].getValueType() != VT) {
38188 // TODO - handle target shuffle ops with different value types.
38189 return 1;
38190 }
38191 DemandedOps[OpIdx].setBit(EltIdx);
38192 }
38193 unsigned Tmp0 = VTBits;
38194 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38195 if (!DemandedOps[i])
38196 continue;
38197 unsigned Tmp1 =
38198 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38199 Tmp0 = std::min(Tmp0, Tmp1);
38200 }
38201 return Tmp0;
38202 }
38203 }
38204 }
38205
38206 // Fallback case.
38207 return 1;
38208}
38209
38210SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38211 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38212 return N->getOperand(0);
38213 return N;
38214}
38215
38216// Helper to look for a normal load that can be narrowed into a vzload with the
38217// specified VT and memory VT. Returns SDValue() on failure.
38218static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38219 SelectionDAG &DAG) {
38220 // Can't if the load is volatile or atomic.
38221 if (!LN->isSimple())
38222 return SDValue();
38223
38224 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38225 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38226 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38227 LN->getPointerInfo(), LN->getOriginalAlign(),
38228 LN->getMemOperand()->getFlags());
38229}
38230
38231// Attempt to match a combined shuffle mask against supported unary shuffle
38232// instructions.
38233// TODO: Investigate sharing more of this with shuffle lowering.
38234static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38235 bool AllowFloatDomain, bool AllowIntDomain,
38236 SDValue V1, const SelectionDAG &DAG,
38237 const X86Subtarget &Subtarget, unsigned &Shuffle,
38238 MVT &SrcVT, MVT &DstVT) {
38239 unsigned NumMaskElts = Mask.size();
38240 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38241
38242 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38243 if (Mask[0] == 0 &&
38244 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38245 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38246 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38247 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38248 Shuffle = X86ISD::VZEXT_MOVL;
38249 if (MaskEltSize == 16)
38250 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38251 else
38252 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38253 return true;
38254 }
38255 }
38256
38257 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38258 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38259 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38260 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38261 unsigned MaxScale = 64 / MaskEltSize;
38262 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38263 bool MatchAny = true;
38264 bool MatchZero = true;
38265 unsigned NumDstElts = NumMaskElts / Scale;
38266 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38267 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38268 MatchAny = MatchZero = false;
38269 break;
38270 }
38271 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38272 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38273 }
38274 if (MatchAny || MatchZero) {
38275 assert(MatchZero && "Failed to match zext but matched aext?");
38276 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38277 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38278 MVT::getIntegerVT(MaskEltSize);
38279 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38280
38281 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38282 if (SrcVT.getVectorNumElements() != NumDstElts)
38283 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38284
38285 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38286 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38287 return true;
38288 }
38289 }
38290 }
38291
38292 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
38293 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38294 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38295 isUndefOrEqual(Mask[0], 0) &&
38296 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38297 Shuffle = X86ISD::VZEXT_MOVL;
38298 if (MaskEltSize == 16)
38299 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38300 else
38301 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38302 return true;
38303 }
38304
38305 // Check if we have SSE3, which lets us use MOVDDUP etc. These
38306 // instructions are no slower than UNPCKLPD but have the option to
38307 // fold the input operand, even from an unaligned memory load.
38308 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38309 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38310 Shuffle = X86ISD::MOVDDUP;
38311 SrcVT = DstVT = MVT::v2f64;
38312 return true;
38313 }
38314 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38315 Shuffle = X86ISD::MOVSLDUP;
38316 SrcVT = DstVT = MVT::v4f32;
38317 return true;
38318 }
38319 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38320 Shuffle = X86ISD::MOVSHDUP;
38321 SrcVT = DstVT = MVT::v4f32;
38322 return true;
38323 }
38324 }
38325
38326 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38327 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38328 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38329 Shuffle = X86ISD::MOVDDUP;
38330 SrcVT = DstVT = MVT::v4f64;
38331 return true;
38332 }
38333 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38334 V1)) {
38335 Shuffle = X86ISD::MOVSLDUP;
38336 SrcVT = DstVT = MVT::v8f32;
38337 return true;
38338 }
38339 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
38340 V1)) {
38341 Shuffle = X86ISD::MOVSHDUP;
38342 SrcVT = DstVT = MVT::v8f32;
38343 return true;
38344 }
38345 }
38346
38347 if (MaskVT.is512BitVector() && AllowFloatDomain) {
38348 assert(Subtarget.hasAVX512() &&
38349 "AVX512 required for 512-bit vector shuffles");
38350 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38351 V1)) {
38352 Shuffle = X86ISD::MOVDDUP;
38353 SrcVT = DstVT = MVT::v8f64;
38354 return true;
38355 }
38356 if (isTargetShuffleEquivalent(
38357 MaskVT, Mask,
38358 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
38359 Shuffle = X86ISD::MOVSLDUP;
38360 SrcVT = DstVT = MVT::v16f32;
38361 return true;
38362 }
38363 if (isTargetShuffleEquivalent(
38364 MaskVT, Mask,
38365 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
38366 Shuffle = X86ISD::MOVSHDUP;
38367 SrcVT = DstVT = MVT::v16f32;
38368 return true;
38369 }
38370 }
38371
38372 return false;
38373}
38374
38375// Attempt to match a combined shuffle mask against supported unary immediate
38376// permute instructions.
38377// TODO: Investigate sharing more of this with shuffle lowering.
38378static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
38379 const APInt &Zeroable,
38380 bool AllowFloatDomain, bool AllowIntDomain,
38381 const SelectionDAG &DAG,
38382 const X86Subtarget &Subtarget,
38383 unsigned &Shuffle, MVT &ShuffleVT,
38384 unsigned &PermuteImm) {
38385 unsigned NumMaskElts = Mask.size();
38386 unsigned InputSizeInBits = MaskVT.getSizeInBits();
38387 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
38388 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
38389 bool ContainsZeros = isAnyZero(Mask);
38390
38391 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
38392 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
38393 // Check for lane crossing permutes.
38394 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
38395 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
38396 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
38397 Shuffle = X86ISD::VPERMI;
38398 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
38399 PermuteImm = getV4X86ShuffleImm(Mask);
38400 return true;
38401 }
38402 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
38403 SmallVector<int, 4> RepeatedMask;
38404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
38405 Shuffle = X86ISD::VPERMI;
38406 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
38407 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
38408 return true;
38409 }
38410 }
38411 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
38412 // VPERMILPD can permute with a non-repeating shuffle.
38413 Shuffle = X86ISD::VPERMILPI;
38414 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
38415 PermuteImm = 0;
38416 for (int i = 0, e = Mask.size(); i != e; ++i) {
38417 int M = Mask[i];
38418 if (M == SM_SentinelUndef)
38419 continue;
38420 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
38421 PermuteImm |= (M & 1) << i;
38422 }
38423 return true;
38424 }
38425 }
38426
38427 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
38428 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
38429 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
38430 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
38431 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
38432 SmallVector<int, 4> RepeatedMask;
38433 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38434 // Narrow the repeated mask to create 32-bit element permutes.
38435 SmallVector<int, 4> WordMask = RepeatedMask;
38436 if (MaskScalarSizeInBits == 64)
38437 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
38438
38439 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
38440 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
38441 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
38442 PermuteImm = getV4X86ShuffleImm(WordMask);
38443 return true;
38444 }
38445 }
38446
38447 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
38448 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
38449 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38450 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38451 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38452 SmallVector<int, 4> RepeatedMask;
38453 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38454 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
38455 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
38456
38457 // PSHUFLW: permute lower 4 elements only.
38458 if (isUndefOrInRange(LoMask, 0, 4) &&
38459 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
38460 Shuffle = X86ISD::PSHUFLW;
38461 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38462 PermuteImm = getV4X86ShuffleImm(LoMask);
38463 return true;
38464 }
38465
38466 // PSHUFHW: permute upper 4 elements only.
38467 if (isUndefOrInRange(HiMask, 4, 8) &&
38468 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
38469 // Offset the HiMask so that we can create the shuffle immediate.
38470 int OffsetHiMask[4];
38471 for (int i = 0; i != 4; ++i)
38472 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
38473
38474 Shuffle = X86ISD::PSHUFHW;
38475 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38476 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
38477 return true;
38478 }
38479 }
38480 }
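To make the PSHUFHW rebasing concrete (the mask is invented for illustration): a repeated v8i16 mask of {0,1,2,3, 5,4,7,6} keeps the low half in place, so the high half is rebased by subtracting 4 to {1,0,3,2}, which the usual 2-bits-per-lane shuffle immediate encodes as 0xB1.
// Illustrative only: index i is stored at bit position 2*i of the immediate.
static_assert((1 << 0 | 0 << 2 | 3 << 4 | 2 << 6) == 0xB1,
              "{1,0,3,2} encodes to the PSHUFHW immediate 0xB1");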
38481
38482 // Attempt to match against byte/bit shifts.
38483 if (AllowIntDomain &&
38484 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38485 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38486 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38487 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
38488 Mask, 0, Zeroable, Subtarget);
38489 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
38490 32 <= ShuffleVT.getScalarSizeInBits())) {
38491 PermuteImm = (unsigned)ShiftAmt;
38492 return true;
38493 }
38494 }
38495
38496 // Attempt to match against bit rotates.
38497 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
38498 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
38499 Subtarget.hasAVX512())) {
38500 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
38501 Subtarget, Mask);
38502 if (0 < RotateAmt) {
38503 Shuffle = X86ISD::VROTLI;
38504 PermuteImm = (unsigned)RotateAmt;
38505 return true;
38506 }
38507 }
38508
38509 return false;
38510}
38511
38512// Attempt to match a combined unary shuffle mask against supported binary
38513// shuffle instructions.
38514// TODO: Investigate sharing more of this with shuffle lowering.
38515static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38516 bool AllowFloatDomain, bool AllowIntDomain,
38517 SDValue &V1, SDValue &V2, const SDLoc &DL,
38518 SelectionDAG &DAG, const X86Subtarget &Subtarget,
38519 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
38520 bool IsUnary) {
38521 unsigned NumMaskElts = Mask.size();
38522 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38523 unsigned SizeInBits = MaskVT.getSizeInBits();
38524
38525 if (MaskVT.is128BitVector()) {
38526 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
38527 AllowFloatDomain) {
38528 V2 = V1;
38529 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
38530 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
38531 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38532 return true;
38533 }
38534 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
38535 AllowFloatDomain) {
38536 V2 = V1;
38537 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
38538 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38539 return true;
38540 }
38541 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
38542 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
38543 std::swap(V1, V2);
38544 Shuffle = X86ISD::MOVSD;
38545 SrcVT = DstVT = MVT::v2f64;
38546 return true;
38547 }
38548 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
38549 (AllowFloatDomain || !Subtarget.hasSSE41())) {
38550 Shuffle = X86ISD::MOVSS;
38551 SrcVT = DstVT = MVT::v4f32;
38552 return true;
38553 }
38554 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
38555 DAG) &&
38556 Subtarget.hasFP16()) {
38557 Shuffle = X86ISD::MOVSH;
38558 SrcVT = DstVT = MVT::v8f16;
38559 return true;
38560 }
38561 }
38562
38563 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
38564 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
38565 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
38566 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
38567 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
38568 Subtarget)) {
38569 DstVT = MaskVT;
38570 return true;
38571 }
38572 }
38573
38574 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
38575 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
38576 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38577 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
38578 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38579 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
38580 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
38581 Subtarget)) {
38582 SrcVT = DstVT = MaskVT;
38583 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
38584 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
38585 return true;
38586 }
38587 }
38588
38589 // Attempt to match against an OR if we're performing a blend shuffle and the
38590 // non-blended source element is zero in each case.
38591 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
38592 if (SizeInBits == V1.getValueSizeInBits() &&
38593 SizeInBits == V2.getValueSizeInBits() &&
38594 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38595 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
38596 bool IsBlend = true;
38597 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
38598 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
38599 unsigned Scale1 = NumV1Elts / NumMaskElts;
38600 unsigned Scale2 = NumV2Elts / NumMaskElts;
38601 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
38602 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
38603 for (unsigned i = 0; i != NumMaskElts; ++i) {
38604 int M = Mask[i];
38605 if (M == SM_SentinelUndef)
38606 continue;
38607 if (M == SM_SentinelZero) {
38608 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38609 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38610 continue;
38611 }
38612 if (M == (int)i) {
38613 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38614 continue;
38615 }
38616 if (M == (int)(i + NumMaskElts)) {
38617 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38618 continue;
38619 }
38620 IsBlend = false;
38621 break;
38622 }
38623 if (IsBlend) {
38624 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
38625 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
38626 Shuffle = ISD::OR;
38627 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38628 return true;
38629 }
38630 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
38631 // FIXME: handle mismatched sizes?
38632 // TODO: investigate if `ISD::OR` handling in
38633 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
38634 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
38635 unsigned NumElts = V.getValueType().getVectorNumElements();
38636 KnownBits Known(NumElts);
38637 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
38638 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
38639 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
38640 if (PeepholeKnown.isZero())
38641 Known.Zero.setBit(EltIdx);
38642 if (PeepholeKnown.isAllOnes())
38643 Known.One.setBit(EltIdx);
38644 }
38645 return Known;
38646 };
38647
38648 KnownBits V1Known = computeKnownBitsElementWise(V1);
38649 KnownBits V2Known = computeKnownBitsElementWise(V2);
38650
38651 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
38652 int M = Mask[i];
38653 if (M == SM_SentinelUndef)
38654 continue;
38655 if (M == SM_SentinelZero) {
38656 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
38657 continue;
38658 }
38659 if (M == (int)i) {
38660 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
38661 continue;
38662 }
38663 if (M == (int)(i + NumMaskElts)) {
38664 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
38665 continue;
38666 }
38667 llvm_unreachable("will not get here.");
38668 }
38669 if (IsBlend) {
38670 Shuffle = ISD::OR;
38671 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38672 return true;
38673 }
38674 }
38675 }
38676 }
38677
38678 return false;
38679}
38680
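As a standalone aside (not part of the listing), the blend-as-OR case above relies on a simple scalar fact: if every lane the blend takes from V1 is known zero in V2 and vice versa, the per-lane select equals a plain bitwise OR, which is why ISD::OR can be emitted. A minimal sketch with hypothetical 4-lane values:

#include <cassert>
#include <cstdint>

int main() {
  // Blend selector per lane: true = take V1, false = take V2.
  bool TakeV1[4] = {true, false, true, false};
  // Each lane that is *not* selected is zero in the other operand.
  uint32_t V1[4] = {0xAAAA0000u, 0u, 0x12345678u, 0u};
  uint32_t V2[4] = {0u, 0x0000BBBBu, 0u, 0xDEADBEEFu};
  for (int i = 0; i != 4; ++i) {
    uint32_t Blend = TakeV1[i] ? V1[i] : V2[i];
    assert(Blend == (V1[i] | V2[i])); // select == OR when the other lane is 0
  }
  return 0;
}
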
38681static bool matchBinaryPermuteShuffle(
38682 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
38683 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
38684 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
38685 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
38686 unsigned NumMaskElts = Mask.size();
38687 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38688
38689 // Attempt to match against VALIGND/VALIGNQ rotate.
38690 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
38691 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
38692 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
38693 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38694 if (!isAnyZero(Mask)) {
38695 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
38696 if (0 < Rotation) {
38697 Shuffle = X86ISD::VALIGN;
38698 if (EltSizeInBits == 64)
38699 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
38700 else
38701 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
38702 PermuteImm = Rotation;
38703 return true;
38704 }
38705 }
38706 }
38707
38708 // Attempt to match against PALIGNR byte rotate.
38709 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38710 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38711 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38712 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38713 if (0 < ByteRotation) {
38714 Shuffle = X86ISD::PALIGNR;
38715 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38716 PermuteImm = ByteRotation;
38717 return true;
38718 }
38719 }
38720
38721 // Attempt to combine to X86ISD::BLENDI.
38722 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38723 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38724 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38725 uint64_t BlendMask = 0;
38726 bool ForceV1Zero = false, ForceV2Zero = false;
38727 SmallVector<int, 8> TargetMask(Mask);
38728 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
38729 ForceV2Zero, BlendMask)) {
38730 if (MaskVT == MVT::v16i16) {
38731 // We can only use v16i16 PBLENDW if the lanes are repeated.
38732 SmallVector<int, 8> RepeatedMask;
38733 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38734 RepeatedMask)) {
38735 assert(RepeatedMask.size() == 8 &&
38736 "Repeated mask size doesn't match!");
38737 PermuteImm = 0;
38738 for (int i = 0; i < 8; ++i)
38739 if (RepeatedMask[i] >= 8)
38740 PermuteImm |= 1 << i;
38741 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38742 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38743 Shuffle = X86ISD::BLENDI;
38744 ShuffleVT = MaskVT;
38745 return true;
38746 }
38747 } else {
38748 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38749 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38750 PermuteImm = (unsigned)BlendMask;
38751 Shuffle = X86ISD::BLENDI;
38752 ShuffleVT = MaskVT;
38753 return true;
38754 }
38755 }
38756 }
38757
38758 // Attempt to combine to INSERTPS, but only if it has elements that need to
38759 // be set to zero.
38760 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38761 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38762 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38763 Shuffle = X86ISD::INSERTPS;
38764 ShuffleVT = MVT::v4f32;
38765 return true;
38766 }
38767
38768 // Attempt to combine to SHUFPD.
38769 if (AllowFloatDomain && EltSizeInBits == 64 &&
38770 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38771 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38772 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38773 bool ForceV1Zero = false, ForceV2Zero = false;
38774 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38775 PermuteImm, Mask, Zeroable)) {
38776 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38777 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38778 Shuffle = X86ISD::SHUFP;
38779 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38780 return true;
38781 }
38782 }
38783
38784 // Attempt to combine to SHUFPS.
38785 if (AllowFloatDomain && EltSizeInBits == 32 &&
38786 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38787 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38788 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38789 SmallVector<int, 4> RepeatedMask;
38790 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38791 // Match each half of the repeated mask, to determine if it's just
38792 // referencing one of the vectors, is zeroable or entirely undef.
38793 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38794 int M0 = RepeatedMask[Offset];
38795 int M1 = RepeatedMask[Offset + 1];
38796
38797 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38798 return DAG.getUNDEF(MaskVT);
38799 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38800 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38801 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38802 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38803 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38804 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38805 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38806 return V1;
38807 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38808 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38809 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38810 return V2;
38811 }
38812
38813 return SDValue();
38814 };
38815
38816 int ShufMask[4] = {-1, -1, -1, -1};
38817 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38818 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38819
38820 if (Lo && Hi) {
38821 V1 = Lo;
38822 V2 = Hi;
38823 Shuffle = X86ISD::SHUFP;
38824 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38825 PermuteImm = getV4X86ShuffleImm(ShufMask);
38826 return true;
38827 }
38828 }
38829 }
38830
38831 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38832 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38833 MaskVT.is128BitVector() &&
38834 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38835 Shuffle = X86ISD::INSERTPS;
38836 ShuffleVT = MVT::v4f32;
38837 return true;
38838 }
38839
38840 return false;
38841}
38842
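As an illustrative aside (not part of the listing), several of the PermuteImm values above (the SHUFPS path and getV4X86ShuffleImm in particular) pack four 2-bit lane selectors into a single immediate byte. A standalone sketch of that packing, with a locally named helper and an example mask:

#include <cassert>

// Pack a 4-element shuffle mask (values 0..3, -1 for undef) into the
// 2-bits-per-lane immediate format used by PSHUFD/SHUFPS-style opcodes.
static unsigned packV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef lanes as 0
    Imm |= (unsigned)(M & 3) << (i * 2);
  }
  return Imm;
}

int main() {
  // The identity mask {0, 1, 2, 3} packs to 0b11'10'01'00 == 0xE4.
  int Identity[4] = {0, 1, 2, 3};
  assert(packV4ShuffleImm(Identity) == 0xE4);
  return 0;
}
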
38843static SDValue combineX86ShuffleChainWithExtract(
38844 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38845 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38846 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38847 const X86Subtarget &Subtarget);
38848
38849/// Combine an arbitrary chain of shuffles into a single instruction if
38850/// possible.
38851///
38852/// This is the leaf of the recursive combine below. When we have found some
38853/// chain of single-use x86 shuffle instructions and accumulated the combined
38854/// shuffle mask represented by them, this will try to pattern match that mask
38855/// into either a single instruction if there is a special purpose instruction
38856/// for this operation, or into a PSHUFB instruction which is a fully general
38857/// instruction but should only be used to replace chains over a certain depth.
38858static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38859 ArrayRef<int> BaseMask, int Depth,
38860 bool HasVariableMask,
38861 bool AllowVariableCrossLaneMask,
38862 bool AllowVariablePerLaneMask,
38863 SelectionDAG &DAG,
38864 const X86Subtarget &Subtarget) {
38865 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38866 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38867 "Unexpected number of shuffle inputs!");
38868
38869 SDLoc DL(Root);
38870 MVT RootVT = Root.getSimpleValueType();
38871 unsigned RootSizeInBits = RootVT.getSizeInBits();
38872 unsigned NumRootElts = RootVT.getVectorNumElements();
38873
38874 // Canonicalize shuffle input op to the requested type.
38875 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38876 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38877 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38878 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38879 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38880 return DAG.getBitcast(VT, Op);
38881 };
38882
38883 // Find the inputs that enter the chain. Note that multiple uses are OK
38884 // here; we're not going to remove the operands we find.
38885 bool UnaryShuffle = (Inputs.size() == 1);
38886 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38887 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38888 : peekThroughBitcasts(Inputs[1]));
38889
38890 MVT VT1 = V1.getSimpleValueType();
38891 MVT VT2 = V2.getSimpleValueType();
38892 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38893 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38894
38895 SDValue Res;
38896
38897 unsigned NumBaseMaskElts = BaseMask.size();
38898 if (NumBaseMaskElts == 1) {
38899 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38900 return CanonicalizeShuffleInput(RootVT, V1);
38901 }
38902
38903 bool OptForSize = DAG.shouldOptForSize();
38904 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38905 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38906 (RootVT.isFloatingPoint() && Depth >= 1) ||
38907 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38908
38909 // Don't combine if we are an AVX512/EVEX target and the mask element size
38910 // is different from the root element size - this would prevent writemasks
38911 // from being reused.
38912 bool IsMaskedShuffle = false;
38913 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38914 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38915 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38916 IsMaskedShuffle = true;
38917 }
38918 }
38919
38920 // If we are shuffling a splat (and not introducing zeros) then we can just
38921 // use it directly. This works for smaller elements as well, as they already
38922 // repeat across each mask element.
38923 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38924 V1.getValueSizeInBits() >= RootSizeInBits &&
38925 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38926 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38927 return CanonicalizeShuffleInput(RootVT, V1);
38928 }
38929
38930 SmallVector<int, 64> Mask(BaseMask);
38931
38932 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38933 // etc. can be simplified.
38934 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38935 SmallVector<int> ScaledMask, IdentityMask;
38936 unsigned NumElts = VT1.getVectorNumElements();
38937 if (Mask.size() <= NumElts &&
38938 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38939 for (unsigned i = 0; i != NumElts; ++i)
38940 IdentityMask.push_back(i);
38941 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38942 V2))
38943 return CanonicalizeShuffleInput(RootVT, V1);
38944 }
38945 }
38946
38947 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38948 if (RootVT.is512BitVector() &&
38949 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38950 // If the upper subvectors are zeroable, then an extract+insert is better
38951 // than using X86ISD::SHUF128. The insertion is free, even if it has
38952 // to zero the upper subvectors.
38953 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38954 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38955 return SDValue(); // Nothing to do!
38956 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38957 "Unexpected lane shuffle");
38958 Res = CanonicalizeShuffleInput(RootVT, V1);
38959 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38960 bool UseZero = isAnyZero(Mask);
38961 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38962 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38963 }
38964
38965 // Narrow shuffle mask to v4x128.
38966 SmallVector<int, 4> ScaledMask;
38967 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38968 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38969
38970 // Try to lower to vshuf64x2/vshuf32x4.
38971 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38972 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38973 SelectionDAG &DAG) {
38974 unsigned PermMask = 0;
38975 // Ensure elements came from the same Op.
38976 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38977 for (int i = 0; i < 4; ++i) {
38978 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38979 if (ScaledMask[i] < 0)
38980 continue;
38981
38982 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38983 unsigned OpIndex = i / 2;
38984 if (Ops[OpIndex].isUndef())
38985 Ops[OpIndex] = Op;
38986 else if (Ops[OpIndex] != Op)
38987 return SDValue();
38988
38989 // Convert the 128-bit shuffle mask selection values into 128-bit
38990 // selection bits defined by a vshuf64x2 instruction's immediate control
38991 // byte.
38992 PermMask |= (ScaledMask[i] % 4) << (i * 2);
38993 }
38994
38995 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38996 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38997 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38998 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38999 };
39000
39001 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39002 // doesn't work because our mask is for 128 bits and we don't have an MVT
39003 // to match that.
39004 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39005 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39006 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39007 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39008 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39009 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39010 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39011 ScaledMask[1] == (ScaledMask[3] % 2));
39012
39013 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39014 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39015 return SDValue(); // Nothing to do!
39016 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39017 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39018 return DAG.getBitcast(RootVT, V);
39019 }
39020 }
39021
39022 // Handle 128-bit lane shuffles of 256-bit vectors.
39023 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39024 // If the upper half is zeroable, then an extract+insert is better
39025 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39026 // zero the upper half.
39027 if (isUndefOrZero(Mask[1])) {
39028 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39029 return SDValue(); // Nothing to do!
39030 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39031 Res = CanonicalizeShuffleInput(RootVT, V1);
39032 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39033 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39034 256);
39035 }
39036
39037 // If we're inserting the low subvector, an insert-subvector 'concat'
39038 // pattern is quicker than VPERM2X128.
39039 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39040 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39041 !Subtarget.hasAVX2()) {
39042 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39043 return SDValue(); // Nothing to do!
39044 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39045 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39046 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39047 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39048 }
39049
39050 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39051 return SDValue(); // Nothing to do!
39052
39053 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39054 // we need to use the zeroing feature.
39055 // Prefer blends for sequential shuffles unless we are optimizing for size.
39056 if (UnaryShuffle &&
39057 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39058 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39059 unsigned PermMask = 0;
39060 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39061 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39062 return DAG.getNode(
39063 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39064 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39065 }
39066
39067 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39068 return SDValue(); // Nothing to do!
39069
39070 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39071 if (!UnaryShuffle && !IsMaskedShuffle) {
39072 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39073 "Unexpected shuffle sentinel value");
39074 // Prefer blends to X86ISD::VPERM2X128.
39075 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39076 unsigned PermMask = 0;
39077 PermMask |= ((Mask[0] & 3) << 0);
39078 PermMask |= ((Mask[1] & 3) << 4);
39079 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39080 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39081 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39082 CanonicalizeShuffleInput(RootVT, LHS),
39083 CanonicalizeShuffleInput(RootVT, RHS),
39084 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39085 }
39086 }
39087 }
39088
39089 // For masks that have been widened to 128-bit elements or more,
39090 // narrow back down to 64-bit elements.
39091 if (BaseMaskEltSizeInBits > 64) {
39092 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39093 int MaskScale = BaseMaskEltSizeInBits / 64;
39094 SmallVector<int, 64> ScaledMask;
39095 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39096 Mask = std::move(ScaledMask);
39097 }
39098
39099 // For masked shuffles, we're trying to match the root width for better
39100 // writemask folding; attempt to scale the mask.
39101 // TODO - variable shuffles might need this to be widened again.
39102 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39103 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39104 int MaskScale = NumRootElts / Mask.size();
39105 SmallVector<int, 64> ScaledMask;
39106 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39107 Mask = std::move(ScaledMask);
39108 }
39109
39110 unsigned NumMaskElts = Mask.size();
39111 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39112
39113 // Determine the effective mask value type.
39114 FloatDomain &= (32 <= MaskEltSizeInBits);
39115 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39116 : MVT::getIntegerVT(MaskEltSizeInBits);
39117 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39118
39119 // Only allow legal mask types.
39120 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39121 return SDValue();
39122
39123 // Attempt to match the mask against known shuffle patterns.
39124 MVT ShuffleSrcVT, ShuffleVT;
39125 unsigned Shuffle, PermuteImm;
39126
39127 // Which shuffle domains are permitted?
39128 // Permit domain crossing at higher combine depths.
39129 // TODO: Should we indicate which domain is preferred if both are allowed?
39130 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39131 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39132 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39133
39134 // Determine zeroable mask elements.
39135 APInt KnownUndef, KnownZero;
39136 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39137 APInt Zeroable = KnownUndef | KnownZero;
39138
39139 if (UnaryShuffle) {
39140 // Attempt to match against broadcast-from-vector.
39141 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39142 if ((Subtarget.hasAVX2() ||
39143 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39144 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39145 if (isUndefOrEqual(Mask, 0)) {
39146 if (V1.getValueType() == MaskVT &&
39147 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39148 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39149 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39150 return SDValue(); // Nothing to do!
39151 Res = V1.getOperand(0);
39152 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39153 return DAG.getBitcast(RootVT, Res);
39154 }
39155 if (Subtarget.hasAVX2()) {
39156 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39157 return SDValue(); // Nothing to do!
39158 Res = CanonicalizeShuffleInput(MaskVT, V1);
39159 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39160 return DAG.getBitcast(RootVT, Res);
39161 }
39162 }
39163 }
39164
39165 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39166 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39167 (!IsMaskedShuffle ||
39168 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39169 if (Depth == 0 && Root.getOpcode() == Shuffle)
39170 return SDValue(); // Nothing to do!
39171 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39172 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39173 return DAG.getBitcast(RootVT, Res);
39174 }
39175
39176 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39177 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39178 PermuteImm) &&
39179 (!IsMaskedShuffle ||
39180 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39181 if (Depth == 0 && Root.getOpcode() == Shuffle)
39182 return SDValue(); // Nothing to do!
39183 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39184 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39185 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39186 return DAG.getBitcast(RootVT, Res);
39187 }
39188 }
39189
39190 // Attempt to combine to INSERTPS, but only if the inserted element has come
39191 // from a scalar.
39192 // TODO: Handle other insertions here as well?
39193 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39194 Subtarget.hasSSE41() &&
39195 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39196 if (MaskEltSizeInBits == 32) {
39197 SDValue SrcV1 = V1, SrcV2 = V2;
39198 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39199 DAG) &&
39200 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39201 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39202 return SDValue(); // Nothing to do!
39203 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39204 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39205 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39206 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39207 return DAG.getBitcast(RootVT, Res);
39208 }
39209 }
39210 if (MaskEltSizeInBits == 64 &&
39211 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39212 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39213 V2.getScalarValueSizeInBits() <= 32) {
39214 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39215 return SDValue(); // Nothing to do!
39216 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39217 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39218 CanonicalizeShuffleInput(MVT::v4f32, V1),
39219 CanonicalizeShuffleInput(MVT::v4f32, V2),
39220 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39221 return DAG.getBitcast(RootVT, Res);
39222 }
39223 }
39224
39225 SDValue NewV1 = V1; // Save operands in case early exit happens.
39226 SDValue NewV2 = V2;
39227 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39228 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39229 ShuffleVT, UnaryShuffle) &&
39230 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39231 if (Depth == 0 && Root.getOpcode() == Shuffle)
39232 return SDValue(); // Nothing to do!
39233 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39234 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39235 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39236 return DAG.getBitcast(RootVT, Res);
39237 }
39238
39239 NewV1 = V1; // Save operands in case early exit happens.
39240 NewV2 = V2;
39241 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39242 AllowIntDomain, NewV1, NewV2, DL, DAG,
39243 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39244 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39245 if (Depth == 0 && Root.getOpcode() == Shuffle)
39246 return SDValue(); // Nothing to do!
39247 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39248 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39249 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39250 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39251 return DAG.getBitcast(RootVT, Res);
39252 }
39253
39254 // Typically from here on, we need an integer version of MaskVT.
39255 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39256 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39257
39258 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39259 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39260 uint64_t BitLen, BitIdx;
39261 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39262 Zeroable)) {
39263 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39264 return SDValue(); // Nothing to do!
39265 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39266 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39267 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39268 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39269 return DAG.getBitcast(RootVT, Res);
39270 }
39271
39272 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39273 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39274 return SDValue(); // Nothing to do!
39275 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39276 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39277 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39278 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39279 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39280 return DAG.getBitcast(RootVT, Res);
39281 }
39282 }
39283
39284 // Match shuffle against TRUNCATE patterns.
39285 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39286 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39287 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39288 Subtarget)) {
39289 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39290 ShuffleSrcVT.getVectorNumElements();
39291 unsigned Opc =
39292 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39293 if (Depth == 0 && Root.getOpcode() == Opc)
39294 return SDValue(); // Nothing to do!
39295 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39296 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39297 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39298 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39299 return DAG.getBitcast(RootVT, Res);
39300 }
39301
39302 // Do we need a more general binary truncation pattern?
39303 if (RootSizeInBits < 512 &&
39304 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39305 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39306 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39307 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39308 // Bail if this was already a truncation or PACK node.
39309 // We sometimes fail to match PACK if we demand known undef elements.
39310 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39311 Root.getOpcode() == X86ISD::PACKSS ||
39312 Root.getOpcode() == X86ISD::PACKUS))
39313 return SDValue(); // Nothing to do!
39314 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39315 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39316 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39317 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39318 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39319 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39320 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39321 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39322 return DAG.getBitcast(RootVT, Res);
39323 }
39324 }
39325
39326 // Don't try to re-form single instruction chains under any circumstances now
39327 // that we've done encoding canonicalization for them.
39328 if (Depth < 1)
39329 return SDValue();
39330
39331 // Depth threshold above which we can efficiently use variable mask shuffles.
39332 int VariableCrossLaneShuffleDepth =
39333 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
39334 int VariablePerLaneShuffleDepth =
39335 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
39336 AllowVariableCrossLaneMask &=
39337 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39338 AllowVariablePerLaneMask &=
39339 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
39340 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
39341 // higher depth before combining them.
39342 bool AllowBWIVPERMV3 =
39343 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
39344
39345 bool MaskContainsZeros = isAnyZero(Mask);
39346
39347 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
39348 // If we have a single input lane-crossing shuffle then lower to VPERMV.
39349 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
39350 if (Subtarget.hasAVX2() &&
39351 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
39352 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
39353 Res = CanonicalizeShuffleInput(MaskVT, V1);
39354 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
39355 return DAG.getBitcast(RootVT, Res);
39356 }
39357 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
39358 if ((Subtarget.hasAVX512() &&
39359 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39360 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39361 (Subtarget.hasBWI() &&
39362 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39363 (Subtarget.hasVBMI() &&
39364 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
39365 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39366 V2 = DAG.getUNDEF(MaskVT);
39367 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39368 return DAG.getBitcast(RootVT, Res);
39369 }
39370 }
39371
39372 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
39373 // vector as the second source (non-VLX will pad to 512-bit shuffles).
39374 if (UnaryShuffle && AllowVariableCrossLaneMask &&
39375 ((Subtarget.hasAVX512() &&
39376 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39377 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39378 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
39379 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39380 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39381 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39382 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39383 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39384 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
39385 for (unsigned i = 0; i != NumMaskElts; ++i)
39386 if (Mask[i] == SM_SentinelZero)
39387 Mask[i] = NumMaskElts + i;
39388 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39389 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
39390 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39391 return DAG.getBitcast(RootVT, Res);
39392 }
39393
39394 // If that failed and either input is extracted then try to combine as a
39395 // shuffle with the larger type.
39396 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39397 Inputs, Root, BaseMask, Depth, HasVariableMask,
39398 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
39399 Subtarget))
39400 return WideShuffle;
39401
39402 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
39403 // (non-VLX will pad to 512-bit shuffles).
39404 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
39405 ((Subtarget.hasAVX512() &&
39406 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39407 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39408 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
39409 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
39410 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39411 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39412 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39413 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39414 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39415 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39416 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39417 return DAG.getBitcast(RootVT, Res);
39418 }
39419 return SDValue();
39420 }
39421
39422 // See if we can combine a single input shuffle with zeros to a bit-mask,
39423 // which is much simpler than any shuffle.
39424 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
39425 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
39426 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
39427 APInt Zero = APInt::getZero(MaskEltSizeInBits);
39428 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
39429 APInt UndefElts(NumMaskElts, 0);
39430 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
39431 for (unsigned i = 0; i != NumMaskElts; ++i) {
39432 int M = Mask[i];
39433 if (M == SM_SentinelUndef) {
39434 UndefElts.setBit(i);
39435 continue;
39436 }
39437 if (M == SM_SentinelZero)
39438 continue;
39439 EltBits[i] = AllOnes;
39440 }
39441 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
39442 Res = CanonicalizeShuffleInput(MaskVT, V1);
39443 unsigned AndOpcode =
39444 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
39445 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
39446 return DAG.getBitcast(RootVT, Res);
39447 }
39448
39449 // If we have a single input shuffle with different shuffle patterns in the
39450 // 128-bit lanes, use the variable mask form of VPERMILPS.
39451 // TODO: Combine other mask types at higher depths.
39452 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39453 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
39454 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
39455 SmallVector<SDValue, 16> VPermIdx;
39456 for (int M : Mask) {
39457 SDValue Idx =
39458 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
39459 VPermIdx.push_back(Idx);
39460 }
39461 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
39462 Res = CanonicalizeShuffleInput(MaskVT, V1);
39463 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
39464 return DAG.getBitcast(RootVT, Res);
39465 }
39466
39467 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
39468 // to VPERMIL2PD/VPERMIL2PS.
39469 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
39470 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
39471 MaskVT == MVT::v8f32)) {
39472 // VPERMIL2 Operation.
39473 // Bits[3] - Match Bit.
39474 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
39475 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
39476 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
39477 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
39478 SmallVector<int, 8> VPerm2Idx;
39479 unsigned M2ZImm = 0;
39480 for (int M : Mask) {
39481 if (M == SM_SentinelUndef) {
39482 VPerm2Idx.push_back(-1);
39483 continue;
39484 }
39485 if (M == SM_SentinelZero) {
39486 M2ZImm = 2;
39487 VPerm2Idx.push_back(8);
39488 continue;
39489 }
39490 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
39491 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
39492 VPerm2Idx.push_back(Index);
39493 }
39494 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39495 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39496 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
39497 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
39498 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
39499 return DAG.getBitcast(RootVT, Res);
39500 }
39501
39502 // If we have 3 or more shuffle instructions or a chain involving a variable
39503 // mask, we can replace them with a single PSHUFB instruction profitably.
39504 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
39505 // instructions, but in practice PSHUFB tends to be *very* fast so we're
39506 // more aggressive.
39507 if (UnaryShuffle && AllowVariablePerLaneMask &&
39508 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39509 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
39510 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
39511 SmallVector<SDValue, 16> PSHUFBMask;
39512 int NumBytes = RootVT.getSizeInBits() / 8;
39513 int Ratio = NumBytes / NumMaskElts;
39514 for (int i = 0; i < NumBytes; ++i) {
39515 int M = Mask[i / Ratio];
39516 if (M == SM_SentinelUndef) {
39517 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
39518 continue;
39519 }
39520 if (M == SM_SentinelZero) {
39521 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39522 continue;
39523 }
39524 M = Ratio * M + i % Ratio;
39525 assert((M / 16) == (i / 16) && "Lane crossing detected");
39526 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39527 }
39528 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
39529 Res = CanonicalizeShuffleInput(ByteVT, V1);
39530 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
39531 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
39532 return DAG.getBitcast(RootVT, Res);
39533 }
39534
39535 // With XOP, if we have a 128-bit binary input shuffle we can always combine
39536 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
39537 // slower than PSHUFB on targets that support both.
39538 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
39539 Subtarget.hasXOP()) {
39540 // VPPERM Mask Operation
39541 // Bits[4:0] - Byte Index (0 - 31)
39542 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
39543 SmallVector<SDValue, 16> VPPERMMask;
39544 int NumBytes = 16;
39545 int Ratio = NumBytes / NumMaskElts;
39546 for (int i = 0; i < NumBytes; ++i) {
39547 int M = Mask[i / Ratio];
39548 if (M == SM_SentinelUndef) {
39549 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
39550 continue;
39551 }
39552 if (M == SM_SentinelZero) {
39553 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39554 continue;
39555 }
39556 M = Ratio * M + i % Ratio;
39557 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39558 }
39559 MVT ByteVT = MVT::v16i8;
39560 V1 = CanonicalizeShuffleInput(ByteVT, V1);
39561 V2 = CanonicalizeShuffleInput(ByteVT, V2);
39562 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
39563 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
39564 return DAG.getBitcast(RootVT, Res);
39565 }
39566
39567 // If that failed and either input is extracted then try to combine as a
39568 // shuffle with the larger type.
39569 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39570 Inputs, Root, BaseMask, Depth, HasVariableMask,
39571 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
39572 return WideShuffle;
39573
39574 // If we have a dual input shuffle then lower to VPERMV3,
39575 // (non-VLX will pad to 512-bit shuffles)
39576 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39577 ((Subtarget.hasAVX512() &&
39578 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
39579 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
39580 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
39581 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
39582 MaskVT == MVT::v16i32)) ||
39583 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39584 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
39585 MaskVT == MVT::v32i16)) ||
39586 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39587 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
39588 MaskVT == MVT::v64i8)))) {
39589 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39590 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39591 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39592 return DAG.getBitcast(RootVT, Res);
39593 }
39594
39595 // Failed to find any combines.
39596 return SDValue();
39597}
39598
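As a standalone aside (not part of the listing), the PSHUFB/VPPERM lowering above expands each element of a wider shuffle mask into Ratio byte selectors, using 0x80 to request a zeroed byte. A minimal sketch of that expansion, with local sentinel stand-ins and a hypothetical 4 x 32-bit mask:

#include <cassert>
#include <vector>

int main() {
  const int Undef = -1, Zero = -2;   // local stand-ins for the SM_ sentinels
  int Mask[4] = {2, Zero, 0, Undef}; // hypothetical 4 x 32-bit shuffle mask
  const int NumBytes = 16, Ratio = NumBytes / 4;
  std::vector<int> ByteMask;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == Undef) { ByteMask.push_back(Undef); continue; }
    if (M == Zero)  { ByteMask.push_back(0x80);  continue; } // zero the byte
    ByteMask.push_back(Ratio * M + i % Ratio); // byte index into the source
  }
  // Element 0 selects dword 2, i.e. source bytes 8..11.
  assert(ByteMask[0] == 8 && ByteMask[3] == 11);
  // Element 1 is zeroed, so all four of its byte selectors are 0x80.
  assert(ByteMask[4] == 0x80 && ByteMask[7] == 0x80);
  return 0;
}
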
39599// Combine an arbitrary chain of shuffles + extract_subvectors into a single
39600// instruction if possible.
39601//
39602// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
39603// type size to attempt to combine:
39604// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
39605// -->
39606// extract_subvector(shuffle(x,y,m2),0)
39607static SDValue combineX86ShuffleChainWithExtract(
39608 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39609 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39610 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39611 const X86Subtarget &Subtarget) {
39612 unsigned NumMaskElts = BaseMask.size();
39613 unsigned NumInputs = Inputs.size();
39614 if (NumInputs == 0)
39615 return SDValue();
39616
39617 EVT RootVT = Root.getValueType();
39618 unsigned RootSizeInBits = RootVT.getSizeInBits();
39619 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
39620
39621 // Bail if we have any smaller inputs.
39622 if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
39623 return Input.getValueSizeInBits() < RootSizeInBits;
39624 }))
39625 return SDValue();
39626
39627 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
39628 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
39629
39630 // Peek through subvectors.
39631 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
39632 unsigned WideSizeInBits = RootSizeInBits;
39633 for (unsigned i = 0; i != NumInputs; ++i) {
39634 SDValue &Src = WideInputs[i];
39635 unsigned &Offset = Offsets[i];
39636 Src = peekThroughBitcasts(Src);
39637 EVT BaseVT = Src.getValueType();
39638 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
39639 Offset += Src.getConstantOperandVal(1);
39640 Src = Src.getOperand(0);
39641 }
39642 WideSizeInBits = std::max(WideSizeInBits,
39643 (unsigned)Src.getValueSizeInBits());
39644 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
39645 "Unexpected subvector extraction");
39646 Offset /= BaseVT.getVectorNumElements();
39647 Offset *= NumMaskElts;
39648 }
39649
39650 // Bail if we're always extracting from the lowest subvectors;
39651 // combineX86ShuffleChain should match this for the current width.
39652 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
39653 return SDValue();
39654
39655 unsigned Scale = WideSizeInBits / RootSizeInBits;
39656 assert((WideSizeInBits % RootSizeInBits) == 0 &&
39657 "Unexpected subvector extraction");
39658
39659 // If the src vector types aren't the same, see if we can extend
39660 // them to match each other.
39661 // TODO: Support different scalar types?
39662 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
39663 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
39664 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
39665 Op.getValueType().getScalarType() != WideSVT;
39666 }))
39667 return SDValue();
39668
39669 // Create new mask for larger type.
39670 for (unsigned i = 1; i != NumInputs; ++i)
39671 Offsets[i] += i * Scale * NumMaskElts;
39672
39673 SmallVector<int, 64> WideMask(BaseMask);
39674 for (int &M : WideMask) {
39675 if (M < 0)
39676 continue;
39677 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
39678 }
39679 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
39680
39681 // Remove unused/repeated shuffle source ops.
39682 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
39683 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
39684
39685 if (WideInputs.size() > 2)
39686 return SDValue();
39687
39688 // Increase depth for every upper subvector we've peeked through.
39689 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
39690
39691 // Attempt to combine wider chain.
39692 // TODO: Can we use a better Root?
39693 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
39694 WideInputs.back().getValueSizeInBits()
39695 ? WideInputs.front()
39696 : WideInputs.back();
39697 if (SDValue WideShuffle =
39698 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39699 HasVariableMask, AllowVariableCrossLaneMask,
39700 AllowVariablePerLaneMask, DAG, Subtarget)) {
39701 WideShuffle =
39702 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39703 return DAG.getBitcast(RootVT, WideShuffle);
39704 }
39705 return SDValue();
39706}
39707
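As an illustrative aside (not part of the listing), the wrapper above rebases each narrow mask element onto the widened inputs with M = (M % NumMaskElts) + Offsets[M / NumMaskElts]. A small numeric sketch, with hypothetical values chosen only to show the arithmetic:

#include <cassert>

int main() {
  const int NumMaskElts = 4; // width of the narrow shuffle mask
  // Hypothetical per-input offsets after peeking through extract_subvector:
  // both inputs came from the upper half of 8-element vectors, and the
  // second input is additionally shifted by Scale * NumMaskElts.
  int Offsets[2] = {4, 12};
  int M = 6; // narrow mask: element 2 taken from input 1
  int Wide = (M % NumMaskElts) + Offsets[M / NumMaskElts];
  assert(Wide == 14); // element 2 of input 1, rebased onto the wide inputs
  return 0;
}
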
39708// Canonicalize the combined shuffle mask chain with horizontal ops.
39709// NOTE: This may update the Ops and Mask.
39710static SDValue canonicalizeShuffleMaskWithHorizOp(
39711 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39712 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39713 const X86Subtarget &Subtarget) {
39714 if (Mask.empty() || Ops.empty())
39715 return SDValue();
39716
39717 SmallVector<SDValue> BC;
39718 for (SDValue Op : Ops)
39719 BC.push_back(peekThroughBitcasts(Op));
39720
39721 // All ops must be the same horizop + type.
39722 SDValue BC0 = BC[0];
39723 EVT VT0 = BC0.getValueType();
39724 unsigned Opcode0 = BC0.getOpcode();
39725 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39726 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39727 }))
39728 return SDValue();
39729
39730 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39731 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39732 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39733 if (!isHoriz && !isPack)
39734 return SDValue();
39735
39736 // Do all ops have a single use?
39737 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39738 return Op.hasOneUse() &&
39739 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
39740 });
39741
39742 int NumElts = VT0.getVectorNumElements();
39743 int NumLanes = VT0.getSizeInBits() / 128;
39744 int NumEltsPerLane = NumElts / NumLanes;
39745 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39746 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39747 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39748
39749 if (NumEltsPerLane >= 4 &&
39750 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39751 SmallVector<int> LaneMask, ScaledMask;
39752 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39753 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39754 // See if we can remove the shuffle by resorting the HOP chain so that
39755 // the HOP args are pre-shuffled.
39756 // TODO: Generalize to any sized/depth chain.
39757 // TODO: Add support for PACKSS/PACKUS.
39758 if (isHoriz) {
39759 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39760 auto GetHOpSrc = [&](int M) {
39761 if (M == SM_SentinelUndef)
39762 return DAG.getUNDEF(VT0);
39763 if (M == SM_SentinelZero)
39764 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39765 SDValue Src0 = BC[M / 4];
39766 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39767 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39768 return Src1.getOperand(M % 2);
39769 return SDValue();
39770 };
39771 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39772 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39773 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39774 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39775 if (M0 && M1 && M2 && M3) {
39776 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39777 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39778 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39779 }
39780 }
39781 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39782 if (Ops.size() >= 2) {
39783 SDValue LHS, RHS;
39784 auto GetHOpSrc = [&](int M, int &OutM) {
39785 // TODO: Support SM_SentinelZero
39786 if (M < 0)
39787 return M == SM_SentinelUndef;
39788 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39789 if (!LHS || LHS == Src) {
39790 LHS = Src;
39791 OutM = (M % 2);
39792 return true;
39793 }
39794 if (!RHS || RHS == Src) {
39795 RHS = Src;
39796 OutM = (M % 2) + 2;
39797 return true;
39798 }
39799 return false;
39800 };
39801 int PostMask[4] = {-1, -1, -1, -1};
39802 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39803 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39804 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39805 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39806 LHS = DAG.getBitcast(SrcVT, LHS);
39807 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39808 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39809 // Use SHUFPS for the permute so this will work on SSE3 targets,
39810 // shuffle combining and domain handling will simplify this later on.
39811 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39812 Res = DAG.getBitcast(ShuffleVT, Res);
39813 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39814 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39815 }
39816 }
39817 }
39818 }
39819
39820 if (2 < Ops.size())
39821 return SDValue();
39822
39823 SDValue BC1 = BC[BC.size() - 1];
39824 if (Mask.size() == VT0.getVectorNumElements()) {
39825 // Canonicalize binary shuffles of horizontal ops that use the
39826 // same sources to an unary shuffle.
39827 // TODO: Try to perform this fold even if the shuffle remains.
39828 if (Ops.size() == 2) {
39829 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39830 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39831 };
39832 // Commute if all BC0's ops are contained in BC1.
39833 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39834 ContainsOps(BC1, BC0.getOperand(1))) {
39835 ShuffleVectorSDNode::commuteMask(Mask);
39836 std::swap(Ops[0], Ops[1]);
39837 std::swap(BC0, BC1);
39838 }
39839
39840 // If BC1 can be represented by BC0, then convert to unary shuffle.
39841 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39842 ContainsOps(BC0, BC1.getOperand(1))) {
39843 for (int &M : Mask) {
39844 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39845 continue;
39846 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39847 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39848 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39849 M += NumHalfEltsPerLane;
39850 }
39851 }
39852 }
39853
39854 // Canonicalize unary horizontal ops to only refer to lower halves.
39855 for (int i = 0; i != NumElts; ++i) {
39856 int &M = Mask[i];
39857 if (isUndefOrZero(M))
39858 continue;
39859 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39860 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39861 M -= NumHalfEltsPerLane;
39862 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39863 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39864 M -= NumHalfEltsPerLane;
39865 }
39866 }
39867
39868 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39869 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39870 // represents the LHS/RHS inputs for the lower/upper halves.
39871 SmallVector<int, 16> TargetMask128, WideMask128;
39872 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39873 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39874 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39875 bool SingleOp = (Ops.size() == 1);
39876 if (isPack || OneUseOps ||
39877 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39878 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39879 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39880 Lo = Lo.getOperand(WideMask128[0] & 1);
39881 Hi = Hi.getOperand(WideMask128[1] & 1);
39882 if (SingleOp) {
39883 SDValue Undef = DAG.getUNDEF(SrcVT);
39884 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39885 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39886 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39887 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39888 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39889 }
39890 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39891 }
39892 }
39893
39894 return SDValue();
39895}
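// Illustrative example (editorial sketch, not in the original file), assuming
// the profitability checks above pass: for shuffle(HADD(a,b), HADD(c,d)) whose
// repeated 128-bit mask scales to WideMask128 = {0, 3}, the lower half takes
// operand (0 & 1) = 0 of BC0 and the upper half takes operand (3 & 1) = 1 of
// BC1, so the whole pattern collapses to a single HADD(a, d).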
39896
39897// Attempt to constant fold all of the constant source ops.
39898// Returns true if the entire shuffle is folded to a constant.
39899// TODO: Extend this to merge multiple constant Ops and update the mask.
39900static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39901 ArrayRef<int> Mask, SDValue Root,
39902 bool HasVariableMask,
39903 SelectionDAG &DAG,
39904 const X86Subtarget &Subtarget) {
39905 MVT VT = Root.getSimpleValueType();
39906
39907 unsigned SizeInBits = VT.getSizeInBits();
39908 unsigned NumMaskElts = Mask.size();
39909 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39910 unsigned NumOps = Ops.size();
39911
39912 // Extract constant bits from each source op.
39913 bool OneUseConstantOp = false;
39914 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39915 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39916 for (unsigned i = 0; i != NumOps; ++i) {
39917 SDValue SrcOp = Ops[i];
39918 OneUseConstantOp |= SrcOp.hasOneUse();
39919 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
39920 RawBitsOps[i]))
39921 return SDValue();
39922 }
39923
39924 // If we're optimizing for size, only fold if at least one of the constants is
39925 // only used once or the combined shuffle has included a variable mask
39926 // shuffle, this is to avoid constant pool bloat.
39927 bool IsOptimizingSize = DAG.shouldOptForSize();
39928 if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
39929 return SDValue();
39930
39931 // Shuffle the constant bits according to the mask.
39932 SDLoc DL(Root);
39933 APInt UndefElts(NumMaskElts, 0);
39934 APInt ZeroElts(NumMaskElts, 0);
39935 APInt ConstantElts(NumMaskElts, 0);
39936 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39937 APInt::getZero(MaskSizeInBits));
39938 for (unsigned i = 0; i != NumMaskElts; ++i) {
39939 int M = Mask[i];
39940 if (M == SM_SentinelUndef) {
39941 UndefElts.setBit(i);
39942 continue;
39943 } else if (M == SM_SentinelZero) {
39944 ZeroElts.setBit(i);
39945 continue;
39946 }
39947 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39948
39949 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39950 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39951
39952 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39953 if (SrcUndefElts[SrcMaskIdx]) {
39954 UndefElts.setBit(i);
39955 continue;
39956 }
39957
39958 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39959 APInt &Bits = SrcEltBits[SrcMaskIdx];
39960 if (!Bits) {
39961 ZeroElts.setBit(i);
39962 continue;
39963 }
39964
39965 ConstantElts.setBit(i);
39966 ConstantBitData[i] = Bits;
39967 }
39968 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39969
39970 // Attempt to create a zero vector.
39971 if ((UndefElts | ZeroElts).isAllOnes())
39972 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39973
39974 // Create the constant data.
39975 MVT MaskSVT;
39976 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39977 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39978 else
39979 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39980
39981 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39982 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39983 return SDValue();
39984
39985 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39986 return DAG.getBitcast(VT, CstOp);
39987}
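// Illustrative example (editorial sketch, not in the original file): with a
// single constant source op <1, 2, 3, 4> and Mask = {3, SM_SentinelZero, 0,
// SM_SentinelUndef}, the loop above classifies the elements as
//   i = 0: M = 3 -> SrcOpIdx = 0, SrcMaskIdx = 3 -> constant 4
//   i = 1: zero sentinel                         -> ZeroElts bit set
//   i = 2: M = 0 -> SrcOpIdx = 0, SrcMaskIdx = 0 -> constant 1
//   i = 3: undef sentinel                        -> UndefElts bit set
// and the fold returns the constant vector <4, 0, 1, undef> bitcast to VT.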
39988
39989namespace llvm {
39990 namespace X86 {
39991 enum {
39992 MaxShuffleCombineDepth = 8
39993 };
39994 }
39995} // namespace llvm
39996
39997/// Fully generic combining of x86 shuffle instructions.
39998///
39999/// This should be the last combine run over the x86 shuffle instructions. Once
40000/// they have been fully optimized, this will recursively consider all chains
40001/// of single-use shuffle instructions, build a generic model of the cumulative
40002/// shuffle operation, and check for simpler instructions which implement this
40003/// operation. We use this primarily for two purposes:
40004///
40005/// 1) Collapse generic shuffles to specialized single instructions when
40006/// equivalent. In most cases, this is just an encoding size win, but
40007/// sometimes we will collapse multiple generic shuffles into a single
40008/// special-purpose shuffle.
40009/// 2) Look for sequences of shuffle instructions with 3 or more total
40010/// instructions, and replace them with the slightly more expensive SSSE3
40011/// PSHUFB instruction if available. We do this as the last combining step
40012/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40013/// a suitable short sequence of other instructions. The PSHUFB will either
40014/// use a register or have to read from memory and so is slightly (but only
40015/// slightly) more expensive than the other shuffle instructions.
40016///
40017/// Because this is inherently a quadratic operation (for each shuffle in
40018/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40019/// This should never be an issue in practice as the shuffle lowering doesn't
40020/// produce sequences of more than 8 instructions.
40021///
40022/// FIXME: We will currently miss some cases where the redundant shuffling
40023/// would simplify under the threshold for PSHUFB formation because of
40024/// combine-ordering. To fix this, we should do the redundant instruction
40025/// combining in this recursive walk.
40026static SDValue combineX86ShufflesRecursively(
40027 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40028 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40029 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40030 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40031 const X86Subtarget &Subtarget) {
40032 assert(RootMask.size() > 0 &&
40033 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40034 "Illegal shuffle root mask");
40035 MVT RootVT = Root.getSimpleValueType();
40036 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40037 unsigned RootSizeInBits = RootVT.getSizeInBits();
40038
40039 // Bound the depth of our recursive combine because this is ultimately
40040 // quadratic in nature.
40041 if (Depth >= MaxDepth)
40042 return SDValue();
40043
40044 // Directly rip through bitcasts to find the underlying operand.
40045 SDValue Op = SrcOps[SrcOpIndex];
40046 Op = peekThroughOneUseBitcasts(Op);
40047
40048 EVT VT = Op.getValueType();
40049 if (!VT.isVector() || !VT.isSimple())
40050 return SDValue(); // Bail if we hit a non-simple non-vector.
40051
40052 // FIXME: Just bail on f16 for now.
40053 if (VT.getVectorElementType() == MVT::f16)
40054 return SDValue();
40055
40056 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40057 "Can only combine shuffles upto size of the root op.");
40058
40059 // Create a demanded elts mask from the referenced elements of Op.
40060 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40061 for (int M : RootMask) {
40062 int BaseIdx = RootMask.size() * SrcOpIndex;
40063 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40064 OpDemandedElts.setBit(M - BaseIdx);
40065 }
40066 if (RootSizeInBits != VT.getSizeInBits()) {
40067 // Op is smaller than Root - extract the demanded elts for the subvector.
40068 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40069 unsigned NumOpMaskElts = RootMask.size() / Scale;
40070 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40071 assert(OpDemandedElts
40072 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40073 .isZero() &&
40074 "Out of range elements referenced in root mask");
40075 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40076 }
40077 OpDemandedElts =
40078 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40079
40080 // Extract target shuffle mask and resolve sentinels and inputs.
40081 SmallVector<int, 64> OpMask;
40082 SmallVector<SDValue, 2> OpInputs;
40083 APInt OpUndef, OpZero;
40084 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40085 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40086 OpZero, DAG, Depth, false)) {
40087 // Shuffle inputs must not be larger than the shuffle result.
40088 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40089 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40090 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40091 }))
40092 return SDValue();
40093 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40094 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40095 !isNullConstant(Op.getOperand(1))) {
40096 SDValue SrcVec = Op.getOperand(0);
40097 int ExtractIdx = Op.getConstantOperandVal(1);
40098 unsigned NumElts = VT.getVectorNumElements();
40099 OpInputs.assign({SrcVec});
40100 OpMask.assign(NumElts, SM_SentinelUndef);
40101 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40102 OpZero = OpUndef = APInt::getNullValue(NumElts);
40103 } else {
40104 return SDValue();
40105 }
40106
40107 // If the shuffle result was smaller than the root, we need to adjust the
40108 // mask indices and pad the mask with undefs.
40109 if (RootSizeInBits > VT.getSizeInBits()) {
40110 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40111 unsigned OpMaskSize = OpMask.size();
40112 if (OpInputs.size() > 1) {
40113 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40114 for (int &M : OpMask) {
40115 if (M < 0)
40116 continue;
40117 int EltIdx = M % OpMaskSize;
40118 int OpIdx = M / OpMaskSize;
40119 M = (PaddedMaskSize * OpIdx) + EltIdx;
40120 }
40121 }
40122 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40123 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40124 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40125 }
40126
40127 SmallVector<int, 64> Mask;
40128 SmallVector<SDValue, 16> Ops;
40129
40130 // We don't need to merge masks if the root is empty.
40131 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40132 if (EmptyRoot) {
40133 // Only resolve zeros if it will remove an input, otherwise we might end
40134 // up in an infinite loop.
40135 bool ResolveKnownZeros = true;
40136 if (!OpZero.isZero()) {
40137 APInt UsedInputs = APInt::getZero(OpInputs.size());
40138 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40139 int M = OpMask[i];
40140 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40141 continue;
40142 UsedInputs.setBit(M / OpMask.size());
40143 if (UsedInputs.isAllOnes()) {
40144 ResolveKnownZeros = false;
40145 break;
40146 }
40147 }
40148 }
40149 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40150 ResolveKnownZeros);
40151
40152 Mask = OpMask;
40153 Ops.append(OpInputs.begin(), OpInputs.end());
40154 } else {
40155 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40156
40157 // Add the inputs to the Ops list, avoiding duplicates.
40158 Ops.append(SrcOps.begin(), SrcOps.end());
40159
40160 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40161 // Attempt to find an existing match.
40162 SDValue InputBC = peekThroughBitcasts(Input);
40163 for (int i = 0, e = Ops.size(); i < e; ++i)
40164 if (InputBC == peekThroughBitcasts(Ops[i]))
40165 return i;
40166 // Match failed - should we replace an existing Op?
40167 if (InsertionPoint >= 0) {
40168 Ops[InsertionPoint] = Input;
40169 return InsertionPoint;
40170 }
40171 // Add to the end of the Ops list.
40172 Ops.push_back(Input);
40173 return Ops.size() - 1;
40174 };
40175
40176 SmallVector<int, 2> OpInputIdx;
40177 for (SDValue OpInput : OpInputs)
40178 OpInputIdx.push_back(
40179 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40180
40181 assert(((RootMask.size() > OpMask.size() &&
40182 RootMask.size() % OpMask.size() == 0) ||
40183 (OpMask.size() > RootMask.size() &&
40184 OpMask.size() % RootMask.size() == 0) ||
40185 OpMask.size() == RootMask.size()) &&
40186 "The smaller number of elements must divide the larger.");
40187
40188 // This function can be performance-critical, so we rely on the power-of-2
40189 // knowledge that we have about the mask sizes to replace div/rem ops with
40190 // bit-masks and shifts.
40191 assert(isPowerOf2_32(RootMask.size()) &&
40192 "Non-power-of-2 shuffle mask sizes");
40193 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
40194 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
40195 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
40196
40197 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40198 unsigned RootRatio =
40199 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40200 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40201 assert((RootRatio == 1 || OpRatio == 1) &&
40202 "Must not have a ratio for both incoming and op masks!");
40203
40204 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40205 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40206 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40207 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
40208 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
40209
40210 Mask.resize(MaskWidth, SM_SentinelUndef);
40211
40212 // Merge this shuffle operation's mask into our accumulated mask. Note that
40213 // this shuffle's mask will be the first applied to the input, followed by
40214 // the root mask to get us all the way to the root value arrangement. The
40215 // reason for this order is that we are recursing up the operation chain.
40216 for (unsigned i = 0; i < MaskWidth; ++i) {
40217 unsigned RootIdx = i >> RootRatioLog2;
40218 if (RootMask[RootIdx] < 0) {
40219 // This is a zero or undef lane, we're done.
40220 Mask[i] = RootMask[RootIdx];
40221 continue;
40222 }
40223
40224 unsigned RootMaskedIdx =
40225 RootRatio == 1
40226 ? RootMask[RootIdx]
40227 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40228
40229 // Just insert the scaled root mask value if it references an input other
40230 // than the SrcOp we're currently inserting.
40231 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40232 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40233 Mask[i] = RootMaskedIdx;
40234 continue;
40235 }
40236
40237 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40238 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40239 if (OpMask[OpIdx] < 0) {
40240 // The incoming lanes are zero or undef, it doesn't matter which ones we
40241 // are using.
40242 Mask[i] = OpMask[OpIdx];
40243 continue;
40244 }
40245
40246 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40247 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40248 : (OpMask[OpIdx] << OpRatioLog2) +
40249 (RootMaskedIdx & (OpRatio - 1));
40250
40251 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40252 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40253 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40254 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40255
40256 Mask[i] = OpMaskedIdx;
40257 }
40258 }
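// Illustrative example (editorial sketch, not in the original file), assuming
// a single source op at SrcOpIndex 0 whose one input resolves to slot 0: with
// RootMask = {2, 0, 1, 3} and OpMask = {1, 0, 3, 2, 5, 4, 7, 6} we get
// MaskWidth = 8, RootRatio = 2, OpRatio = 1, and the merge loop computes e.g.
//   i = 0: RootMaskedIdx = (2 << 1) + 0 = 4 -> OpMask[4] = 5
//   i = 3: RootMaskedIdx = (0 << 1) + 1 = 1 -> OpMask[1] = 0
// giving Mask = {5, 4, 1, 0, 3, 2, 7, 6}, i.e. OpMask applied to the input
// first and RootMask applied to its result, as the comment above describes.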
40259
40260 // Remove unused/repeated shuffle source ops.
40261 resolveTargetShuffleInputsAndMask(Ops, Mask);
40262
40263 // Handle the all undef/zero/ones cases early.
40264 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
40265 return DAG.getUNDEF(RootVT);
40266 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
40267 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
40268 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
40269 !llvm::is_contained(Mask, SM_SentinelZero))
40270 return getOnesVector(RootVT, DAG, SDLoc(Root));
40271
40272 assert(!Ops.empty() && "Shuffle with no inputs detected");
40273 HasVariableMask |= IsOpVariableMask;
40274
40275 // Update the list of shuffle nodes that have been combined so far.
40276 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
40277 SrcNodes.end());
40278 CombinedNodes.push_back(Op.getNode());
40279
40280 // See if we can recurse into each shuffle source op (if it's a target
40281 // shuffle). The source op should only be generally combined if it either has
40282 // a single use (i.e. current Op) or all its users have already been combined,
40283 // if not then we can still combine but should prevent generation of variable
40284 // shuffles to avoid constant pool bloat.
40285 // Don't recurse if we already have more source ops than we can combine in
40286 // the remaining recursion depth.
40287 if (Ops.size() < (MaxDepth - Depth)) {
40288 for (int i = 0, e = Ops.size(); i < e; ++i) {
40289 // For empty roots, we need to resolve zeroable elements before combining
40290 // them with other shuffles.
40291 SmallVector<int, 64> ResolvedMask = Mask;
40292 if (EmptyRoot)
40293 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
40294 bool AllowCrossLaneVar = false;
40295 bool AllowPerLaneVar = false;
40296 if (Ops[i].getNode()->hasOneUse() ||
40297 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
40298 AllowCrossLaneVar = AllowVariableCrossLaneMask;
40299 AllowPerLaneVar = AllowVariablePerLaneMask;
40300 }
40301 if (SDValue Res = combineX86ShufflesRecursively(
40302 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
40303 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
40304 Subtarget))
40305 return Res;
40306 }
40307 }
40308
40309 // Attempt to constant fold all of the constant source ops.
40310 if (SDValue Cst = combineX86ShufflesConstants(
40311 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
40312 return Cst;
40313
40314 // If constant fold failed and we only have constants - then we have
40315 // multiple uses by a single non-variable shuffle - just bail.
40316 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
40317 APInt UndefElts;
40318 SmallVector<APInt> RawBits;
40319 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40320 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40321 RawBits);
40322 })) {
40323 return SDValue();
40324 }
40325
40326 // Canonicalize the combined shuffle mask chain with horizontal ops.
40327 // NOTE: This will update the Ops and Mask.
40328 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
40329 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
40330 return DAG.getBitcast(RootVT, HOp);
40331
40332 // Try to refine our inputs given our knowledge of target shuffle mask.
40333 for (auto I : enumerate(Ops)) {
40334 int OpIdx = I.index();
40335 SDValue &Op = I.value();
40336
40337 // What range of shuffle mask element values results in picking from Op?
40338 int Lo = OpIdx * Mask.size();
40339 int Hi = Lo + Mask.size();
40340
40341 // Which elements of Op do we demand, given the mask's granularity?
40342 APInt OpDemandedElts(Mask.size(), 0);
40343 for (int MaskElt : Mask) {
40344 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
40345 int OpEltIdx = MaskElt - Lo;
40346 OpDemandedElts.setBit(OpEltIdx);
40347 }
40348 }
40349
40350 // Is the shuffle result smaller than the root?
40351 if (Op.getValueSizeInBits() < RootSizeInBits) {
40352 // We padded the mask with undefs. But we now need to undo that.
40353 unsigned NumExpectedVectorElts = Mask.size();
40354 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
40355 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
40356 assert(!OpDemandedElts.extractBits(
40357 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
40358 "Demanding the virtual undef widening padding?");
40359 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
40360 }
40361
40362 // The Op itself may be of different VT, so we need to scale the mask.
40363 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
40364 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
40365
40366 // Can this operand be simplified any further, given its demanded elements?
40367 if (SDValue NewOp =
40368 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
40369 Op, OpScaledDemandedElts, DAG))
40370 Op = NewOp;
40371 }
40372 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
40373
40374 // Widen any subvector shuffle inputs we've collected.
40375 // TODO: Remove this to avoid generating temporary nodes, we should only
40376 // widen once combineX86ShuffleChain has found a match.
40377 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
40378 return Op.getValueSizeInBits() < RootSizeInBits;
40379 })) {
40380 for (SDValue &Op : Ops)
40381 if (Op.getValueSizeInBits() < RootSizeInBits)
40382 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
40383 RootSizeInBits);
40384 // Reresolve - we might have repeated subvector sources.
40385 resolveTargetShuffleInputsAndMask(Ops, Mask);
40386 }
40387
40388 // We can only combine unary and binary shuffle mask cases.
40389 if (Ops.size() <= 2) {
40390 // Minor canonicalization of the accumulated shuffle mask to make it easier
40391 // to match below. All this does is detect masks with sequential pairs of
40392 // elements, and shrink them to the half-width mask. It does this in a loop
40393 // so it will reduce the size of the mask to the minimal width mask which
40394 // performs an equivalent shuffle.
40395 while (Mask.size() > 1) {
40396 SmallVector<int, 64> WidenedMask;
40397 if (!canWidenShuffleElements(Mask, WidenedMask))
40398 break;
40399 Mask = std::move(WidenedMask);
40400 }
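// Illustrative example (editorial sketch, not in the original file): the mask
// {0, 1, 6, 7, 4, 5, 2, 3} widens to {0, 3, 2, 1} on the first pass because
// every even/odd pair is sequential and starts on an even element, and the
// loop then stops since {0, 3, 2, 1} has no such pairs left.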
40401
40402 // Canonicalization of binary shuffle masks to improve pattern matching by
40403 // commuting the inputs.
40404 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
40405 ShuffleVectorSDNode::commuteMask(Mask);
40406 std::swap(Ops[0], Ops[1]);
40407 }
40408
40409 // Try to combine into a single shuffle instruction.
40410 if (SDValue Shuffle = combineX86ShuffleChain(
40411 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40412 AllowVariablePerLaneMask, DAG, Subtarget))
40413 return Shuffle;
40414
40415 // If all the operands come from the same larger vector, fallthrough and try
40416 // to use combineX86ShuffleChainWithExtract.
40417 SDValue LHS = peekThroughBitcasts(Ops.front());
40418 SDValue RHS = peekThroughBitcasts(Ops.back());
40419 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
40420 (RootSizeInBits / Mask.size()) != 64 ||
40421 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40422 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40423 LHS.getOperand(0) != RHS.getOperand(0))
40424 return SDValue();
40425 }
40426
40427 // If that failed and any input is extracted then try to combine as a
40428 // shuffle with the larger type.
40429 return combineX86ShuffleChainWithExtract(
40430 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40431 AllowVariablePerLaneMask, DAG, Subtarget);
40432}
40433
40434/// Helper entry wrapper to combineX86ShufflesRecursively.
40435static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
40436 const X86Subtarget &Subtarget) {
40437 return combineX86ShufflesRecursively(
40438 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
40439 /*HasVarMask*/ false,
40440 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
40441 Subtarget);
40442}
40443
40444/// Get the PSHUF-style mask from PSHUF node.
40445///
40446/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
40447/// PSHUF-style masks that can be reused with such instructions.
40448static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
40449 MVT VT = N.getSimpleValueType();
40450 SmallVector<int, 4> Mask;
40451 SmallVector<SDValue, 2> Ops;
40452 bool HaveMask =
40453 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
40454 (void)HaveMask;
40455 assert(HaveMask);
40456
40457 // If we have more than 128-bits, only the low 128-bits of shuffle mask
40458 // matter. Check that the upper masks are repeats and remove them.
40459 if (VT.getSizeInBits() > 128) {
40460 int LaneElts = 128 / VT.getScalarSizeInBits();
40461#ifndef NDEBUG
40462 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
40463 for (int j = 0; j < LaneElts; ++j)
40464 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
40465 "Mask doesn't repeat in high 128-bit lanes!");
40466#endif
40467 Mask.resize(LaneElts);
40468 }
40469
40470 switch (N.getOpcode()) {
40471 case X86ISD::PSHUFD:
40472 return Mask;
40473 case X86ISD::PSHUFLW:
40474 Mask.resize(4);
40475 return Mask;
40476 case X86ISD::PSHUFHW:
40477 Mask.erase(Mask.begin(), Mask.begin() + 4);
40478 for (int &M : Mask)
40479 M -= 4;
40480 return Mask;
40481 default:
40482 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 40482)
;
40483 }
40484}
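// Illustrative example (editorial sketch, not in the original file): for a
// v8i16 PSHUFHW whose full mask is {0, 1, 2, 3, 7, 6, 5, 4}, the switch above
// drops the identity low half and rebases the high half, returning the
// 4-element mask {3, 2, 1, 0} that such instructions encode as an immediate.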
40485
40486/// Search for a combinable shuffle across a chain ending in pshufd.
40487///
40488/// We walk up the chain and look for a combinable shuffle, skipping over
40489/// shuffles that we could hoist this shuffle's transformation past without
40490/// altering anything.
40491static SDValue
40492combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
40493 SelectionDAG &DAG) {
40494 assert(N.getOpcode() == X86ISD::PSHUFD &&
40495 "Called with something other than an x86 128-bit half shuffle!");
40496 SDLoc DL(N);
40497
40498 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
40499 // of the shuffles in the chain so that we can form a fresh chain to replace
40500 // this one.
40501 SmallVector<SDValue, 8> Chain;
40502 SDValue V = N.getOperand(0);
40503 for (; V.hasOneUse(); V = V.getOperand(0)) {
40504 switch (V.getOpcode()) {
40505 default:
40506 return SDValue(); // Nothing combined!
40507
40508 case ISD::BITCAST:
40509 // Skip bitcasts as we always know the type for the target specific
40510 // instructions.
40511 continue;
40512
40513 case X86ISD::PSHUFD:
40514 // Found another dword shuffle.
40515 break;
40516
40517 case X86ISD::PSHUFLW:
40518 // Check that the low words (being shuffled) are the identity in the
40519 // dword shuffle, and the high words are self-contained.
40520 if (Mask[0] != 0 || Mask[1] != 1 ||
40521 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
40522 return SDValue();
40523
40524 Chain.push_back(V);
40525 continue;
40526
40527 case X86ISD::PSHUFHW:
40528 // Check that the high words (being shuffled) are the identity in the
40529 // dword shuffle, and the low words are self-contained.
40530 if (Mask[2] != 2 || Mask[3] != 3 ||
40531 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
40532 return SDValue();
40533
40534 Chain.push_back(V);
40535 continue;
40536
40537 case X86ISD::UNPCKL:
40538 case X86ISD::UNPCKH:
40539 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
40540 // shuffle into a preceding word shuffle.
40541 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
40542 V.getSimpleValueType().getVectorElementType() != MVT::i16)
40543 return SDValue();
40544
40545 // Search for a half-shuffle which we can combine with.
40546 unsigned CombineOp =
40547 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
40548 if (V.getOperand(0) != V.getOperand(1) ||
40549 !V->isOnlyUserOf(V.getOperand(0).getNode()))
40550 return SDValue();
40551 Chain.push_back(V);
40552 V = V.getOperand(0);
40553 do {
40554 switch (V.getOpcode()) {
40555 default:
40556 return SDValue(); // Nothing to combine.
40557
40558 case X86ISD::PSHUFLW:
40559 case X86ISD::PSHUFHW:
40560 if (V.getOpcode() == CombineOp)
40561 break;
40562
40563 Chain.push_back(V);
40564
40565 [[fallthrough]];
40566 case ISD::BITCAST:
40567 V = V.getOperand(0);
40568 continue;
40569 }
40570 break;
40571 } while (V.hasOneUse());
40572 break;
40573 }
40574 // Break out of the loop if we break out of the switch.
40575 break;
40576 }
40577
40578 if (!V.hasOneUse())
40579 // We fell out of the loop without finding a viable combining instruction.
40580 return SDValue();
40581
40582 // Merge this node's mask and our incoming mask.
40583 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40584 for (int &M : Mask)
40585 M = VMask[M];
40586 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
40587 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40588
40589 // Rebuild the chain around this new shuffle.
40590 while (!Chain.empty()) {
40591 SDValue W = Chain.pop_back_val();
40592
40593 if (V.getValueType() != W.getOperand(0).getValueType())
40594 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
40595
40596 switch (W.getOpcode()) {
40597 default:
40598 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 40598)
;
40599
40600 case X86ISD::UNPCKL:
40601 case X86ISD::UNPCKH:
40602 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
40603 break;
40604
40605 case X86ISD::PSHUFD:
40606 case X86ISD::PSHUFLW:
40607 case X86ISD::PSHUFHW:
40608 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
40609 break;
40610 }
40611 }
40612 if (V.getValueType() != N.getValueType())
40613 V = DAG.getBitcast(N.getValueType(), V);
40614
40615 // Return the new chain to replace N.
40616 return V;
40617}
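// Illustrative example (editorial sketch, not in the original file): if N is a
// PSHUFD with mask {1, 0, 3, 2} and its single-use operand V is another PSHUFD
// with mask {2, 3, 0, 1}, the merge above maps each M to VMask[M], producing
// {3, 2, 1, 0}, and a single PSHUFD of V's input with that combined mask
// replaces the two-shuffle chain.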
40618
40619// Attempt to commute shufps LHS loads:
40620// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
40621static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
40622 SelectionDAG &DAG) {
40623 // TODO: Add vXf64 support.
40624 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
40625 return SDValue();
40626
40627 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40628 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
40629 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40630 return SDValue();
40631 SDValue N0 = V.getOperand(0);
40632 SDValue N1 = V.getOperand(1);
40633 unsigned Imm = V.getConstantOperandVal(2);
40634 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
40635 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
40636 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
40637 return SDValue();
40638 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
40639 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
40640 DAG.getTargetConstant(Imm, DL, MVT::i8));
40641 };
40642
40643 switch (N.getOpcode()) {
40644 case X86ISD::VPERMILPI:
40645 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
40646 unsigned Imm = N.getConstantOperandVal(1);
40647 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
40648 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40649 }
40650 break;
40651 case X86ISD::SHUFP: {
40652 SDValue N0 = N.getOperand(0);
40653 SDValue N1 = N.getOperand(1);
40654 unsigned Imm = N.getConstantOperandVal(2);
40655 if (N0 == N1) {
40656 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40657 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40658 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40659 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40660 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40661 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40662 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40663 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40664 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40665 }
40666 break;
40667 }
40668 }
40669
40670 return SDValue();
40671}
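// Illustrative note (editorial, not in the original file): SHUFP's 8-bit
// immediate selects the low result elements from the first operand (low
// nibble) and the high result elements from the second (high nibble), so
// commuting the operands swaps the nibbles, e.g. Imm = 0xE4 becomes
// ((0xE4 & 0x0F) << 4) | ((0xE4 & 0xF0) >> 4) = 0x4E.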
40672
40673// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40674static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
40675 const SDLoc &DL) {
40676 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40677 EVT ShuffleVT = N.getValueType();
40678
40679 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
40680 // AllZeros/AllOnes constants are freely shuffled and will peek through
40681 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40682 // merge with target shuffles if it has one use so shuffle combining is
40683 // likely to kick in. Shuffles of splats are expected to be removed.
40684 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40685 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40686 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40687 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40688 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40689 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40690 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40691 };
40692 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40693 // Ensure we only shuffle whole vector src elements, unless its a logical
40694 // binops where we can more aggressively move shuffles from dst to src.
40695 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
40696 BinOp == X86ISD::ANDNP ||
40697 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40698 };
40699
40700 unsigned Opc = N.getOpcode();
40701 switch (Opc) {
40702 // Unary and Unary+Permute Shuffles.
40703 case X86ISD::PSHUFB: {
40704 // Don't merge PSHUFB if it contains zero'd elements.
40705 SmallVector<int> Mask;
40706 SmallVector<SDValue> Ops;
40707 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
40708 Mask))
40709 break;
40710 [[fallthrough]];
40711 }
40712 case X86ISD::VBROADCAST:
40713 case X86ISD::MOVDDUP:
40714 case X86ISD::PSHUFD:
40715 case X86ISD::PSHUFHW:
40716 case X86ISD::PSHUFLW:
40717 case X86ISD::VPERMI:
40718 case X86ISD::VPERMILPI: {
40719 if (N.getOperand(0).getValueType() == ShuffleVT &&
40720 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40721 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40722 unsigned SrcOpcode = N0.getOpcode();
40723 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40724 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40725 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40726 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
40727 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
40728 SDValue LHS, RHS;
40729 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40730 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40731 if (N.getNumOperands() == 2) {
40732 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40733 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40734 } else {
40735 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40736 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40737 }
40738 EVT OpVT = N0.getValueType();
40739 return DAG.getBitcast(ShuffleVT,
40740 DAG.getNode(SrcOpcode, DL, OpVT,
40741 DAG.getBitcast(OpVT, LHS),
40742 DAG.getBitcast(OpVT, RHS)));
40743 }
40744 }
40745 }
40746 break;
40747 }
40748 // Binary and Binary+Permute Shuffles.
40749 case X86ISD::INSERTPS: {
40750 // Don't merge INSERTPS if it contains zero'd elements.
40751 unsigned InsertPSMask = N.getConstantOperandVal(2);
40752 unsigned ZeroMask = InsertPSMask & 0xF;
40753 if (ZeroMask != 0)
40754 break;
40755 [[fallthrough]];
40756 }
40757 case X86ISD::MOVSD:
40758 case X86ISD::MOVSS:
40759 case X86ISD::BLENDI:
40760 case X86ISD::SHUFP:
40761 case X86ISD::UNPCKH:
40762 case X86ISD::UNPCKL: {
40763 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40764 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40765 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40766 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40767 unsigned SrcOpcode = N0.getOpcode();
40768 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40769 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40770 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40771 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40772 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40773 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40774 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40775 // Ensure the total number of shuffles doesn't increase by folding this
40776 // shuffle through to the source ops.
40777 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40778 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40779 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40780 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40781 SDValue LHS, RHS;
40782 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40783 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40784 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40785 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40786 if (N.getNumOperands() == 3) {
40787 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40788 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40789 } else {
40790 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40791 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40792 }
40793 EVT OpVT = N0.getValueType();
40794 return DAG.getBitcast(ShuffleVT,
40795 DAG.getNode(SrcOpcode, DL, OpVT,
40796 DAG.getBitcast(OpVT, LHS),
40797 DAG.getBitcast(OpVT, RHS)));
40798 }
40799 }
40800 }
40801 break;
40802 }
40803 }
40804 return SDValue();
40805}
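// Illustrative standalone sketch (editorial, not part of X86ISelLowering.cpp):
// the canonicalization above relies on the fact that an elementwise binary op
// commutes with a per-element permutation, i.e.
//   shuffle(binop(X, Y)) == binop(shuffle(X), shuffle(Y)).
// The real combine additionally checks one-use and mergeability so the total
// shuffle count does not grow; the helper names below are hypothetical.
#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<int, N> shuffle(const std::array<int, N> &V,
                           const std::array<int, N> &Mask) {
  std::array<int, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = V[Mask[I]]; // pick the source element selected by the mask
  return R;
}

template <std::size_t N>
std::array<int, N> add(const std::array<int, N> &A,
                       const std::array<int, N> &B) {
  std::array<int, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = A[I] + B[I]; // any elementwise binop behaves the same way
  return R;
}

int main() {
  std::array<int, 4> X{1, 2, 3, 4}, Y{10, 20, 30, 40}, M{3, 1, 2, 0};
  assert(shuffle(add(X, Y), M) == add(shuffle(X, M), shuffle(Y, M)));
  return 0;
}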
40806
40807/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
40808static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40809 SelectionDAG &DAG,
40810 const SDLoc &DL) {
40811 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40812
40813 MVT VT = V.getSimpleValueType();
40814 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40815 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40816 unsigned SrcOpc0 = Src0.getOpcode();
40817 unsigned SrcOpc1 = Src1.getOpcode();
40818 EVT SrcVT0 = Src0.getValueType();
40819 EVT SrcVT1 = Src1.getValueType();
40820
40821 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40822 return SDValue();
40823
40824 switch (SrcOpc0) {
40825 case X86ISD::MOVDDUP: {
40826 SDValue LHS = Src0.getOperand(0);
40827 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40828 SDValue Res =
40829 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40830 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40831 return DAG.getBitcast(VT, Res);
40832 }
40833 case X86ISD::VPERMILPI:
40834 // TODO: Handle v4f64 permutes with different low/high lane masks.
40835 if (SrcVT0 == MVT::v4f64) {
40836 uint64_t Mask = Src0.getConstantOperandVal(1);
40837 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40838 break;
40839 }
40840 [[fallthrough]];
40841 case X86ISD::VSHLI:
40842 case X86ISD::VSRLI:
40843 case X86ISD::VSRAI:
40844 case X86ISD::PSHUFD:
40845 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40846 SDValue LHS = Src0.getOperand(0);
40847 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40848 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40849 V.getOperand(2));
40850 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40851 return DAG.getBitcast(VT, Res);
40852 }
40853 break;
40854 }
40855
40856 return SDValue();
40857}
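// Illustrative sketch (editorial, not part of the analyzed source): the v4f64
// guard above. For VPERMILPD-style permutes of v4f64, immediate bits [1:0]
// control the low 128-bit lane and bits [3:2] the high lane, so the fold only
// fires when both lanes apply the same in-lane permute and the permute can be
// hoisted across the lane shuffle.
#include <cstdio>

static bool isLaneUniformV4F64Imm(unsigned Mask) {
  return (Mask & 0x3) == ((Mask >> 2) & 0x3);
}

int main() {
  std::printf("imm 0x5 lane-uniform: %d\n", isLaneUniformV4F64Imm(0x5)); // 1
  std::printf("imm 0x6 lane-uniform: %d\n", isLaneUniformV4F64Imm(0x6)); // 0
  return 0;
}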
40858
40859/// Try to combine x86 target specific shuffles.
40860static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
40861 TargetLowering::DAGCombinerInfo &DCI,
40862 const X86Subtarget &Subtarget) {
40863 SDLoc DL(N);
40864 MVT VT = N.getSimpleValueType();
40865 SmallVector<int, 4> Mask;
40866 unsigned Opcode = N.getOpcode();
40867
40868 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40869 return R;
40870
40871 // Handle specific target shuffles.
40872 switch (Opcode) {
40873 case X86ISD::MOVDDUP: {
40874 SDValue Src = N.getOperand(0);
40875 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40876 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40877 ISD::isNormalLoad(Src.getNode())) {
40878 LoadSDNode *LN = cast<LoadSDNode>(Src);
40879 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40880 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40881 DCI.CombineTo(N.getNode(), Movddup);
40882 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40883 DCI.recursivelyDeleteUnusedNodes(LN);
40884 return N; // Return N so it doesn't get rechecked!
40885 }
40886 }
40887
40888 return SDValue();
40889 }
40890 case X86ISD::VBROADCAST: {
40891 SDValue Src = N.getOperand(0);
40892 SDValue BC = peekThroughBitcasts(Src);
40893 EVT SrcVT = Src.getValueType();
40894 EVT BCVT = BC.getValueType();
40895
40896 // If broadcasting from another shuffle, attempt to simplify it.
40897 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40898 if (isTargetShuffle(BC.getOpcode()) &&
40899 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40900 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40901 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40902 SM_SentinelUndef);
40903 for (unsigned i = 0; i != Scale; ++i)
40904 DemandedMask[i] = i;
40905 if (SDValue Res = combineX86ShufflesRecursively(
40906 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40907 X86::MaxShuffleCombineDepth,
40908 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40909 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40910 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40911 DAG.getBitcast(SrcVT, Res));
40912 }
40913
40914 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40915 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40916 if (Src.getOpcode() == ISD::BITCAST &&
40917 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40918 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
40919 FixedVectorType::isValidElementType(
40920 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40921 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40922 VT.getVectorNumElements());
40923 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40924 }
40925
40926 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40927 // If we're re-broadcasting a smaller type then broadcast with that type and
40928 // bitcast.
40929 // TODO: Do this for any splat?
40930 if (Src.getOpcode() == ISD::BITCAST &&
40931 (BC.getOpcode() == X86ISD::VBROADCAST ||
40932 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40933 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40934 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40935 MVT NewVT =
40936 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
40937 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40938 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40939 }
40940
40941 // Reduce broadcast source vector to lowest 128-bits.
40942 if (SrcVT.getSizeInBits() > 128)
40943 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40944 extract128BitVector(Src, 0, DAG, DL));
40945
40946 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40947 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
40948 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40949
40950 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40951 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40952 isNullConstant(Src.getOperand(1)) &&
40953 DAG.getTargetLoweringInfo().isTypeLegal(
40954 Src.getOperand(0).getValueType()))
40955 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40956
40957 // Share broadcast with the longest vector and extract low subvector (free).
40958 // Ensure the same SDValue from the SDNode use is being used.
40959 for (SDNode *User : Src->uses())
40960 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40961 Src == User->getOperand(0) &&
40962 User->getValueSizeInBits(0).getFixedValue() >
40963 VT.getFixedSizeInBits()) {
40964 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40965 VT.getSizeInBits());
40966 }
40967
40968 // vbroadcast(scalarload X) -> vbroadcast_load X
40969 // For float loads, extract other uses of the scalar from the broadcast.
40970 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40971 ISD::isNormalLoad(Src.getNode())) {
40972 LoadSDNode *LN = cast<LoadSDNode>(Src);
40973 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40974 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40975 SDValue BcastLd =
40976 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40977 LN->getMemoryVT(), LN->getMemOperand());
40978 // If the load value is used only by N, replace it via CombineTo N.
40979 bool NoReplaceExtract = Src.hasOneUse();
40980 DCI.CombineTo(N.getNode(), BcastLd);
40981 if (NoReplaceExtract) {
40982 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40983 DCI.recursivelyDeleteUnusedNodes(LN);
40984 } else {
40985 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40986 DAG.getIntPtrConstant(0, DL));
40987 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40988 }
40989 return N; // Return N so it doesn't get rechecked!
40990 }
40991
40992 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40993 // i16. So shrink it ourselves if we can make a broadcast_load.
40994 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40995 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40996 assert(Subtarget.hasAVX2() && "Expected AVX2");
40997 SDValue TruncIn = Src.getOperand(0);
40998
40999 // If this is a truncate of a non-extending load we can just narrow it to
41000 // use a broadcast_load.
41001 if (ISD::isNormalLoad(TruncIn.getNode())) {
41002 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41004 // Unless it's volatile or atomic.
41004 if (LN->isSimple()) {
41005 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41006 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41007 SDValue BcastLd = DAG.getMemIntrinsicNode(
41008 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41009 LN->getPointerInfo(), LN->getOriginalAlign(),
41010 LN->getMemOperand()->getFlags());
41011 DCI.CombineTo(N.getNode(), BcastLd);
41012 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41013 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41014 return N; // Return N so it doesn't get rechecked!
41015 }
41016 }
41017
41018 // If this is a truncate of an i16 extload, we can directly replace it.
41019 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41020 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41021 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41022 if (LN->getMemoryVT().getSizeInBits() == 16) {
41023 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41024 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41025 SDValue BcastLd =
41026 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41027 LN->getMemoryVT(), LN->getMemOperand());
41028 DCI.CombineTo(N.getNode(), BcastLd);
41029 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41030 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41031 return N; // Return N so it doesn't get rechecked!
41032 }
41033 }
41034
41035 // If this is a truncate of a load that has been shifted right, we can
41036 // offset the pointer and use a narrower load.
41037 if (TruncIn.getOpcode() == ISD::SRL &&
41038 TruncIn.getOperand(0).hasOneUse() &&
41039 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41040 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41041 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41042 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41043 // Make sure the shift amount and the load size are divisible by 16.
41044 // Don't do this if the load is volatile or atomic.
41045 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41046 LN->isSimple()) {
41047 unsigned Offset = ShiftAmt / 8;
41048 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41049 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41050 TypeSize::Fixed(Offset), DL);
41051 SDValue Ops[] = { LN->getChain(), Ptr };
41052 SDValue BcastLd = DAG.getMemIntrinsicNode(
41053 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41054 LN->getPointerInfo().getWithOffset(Offset),
41055 LN->getOriginalAlign(),
41056 LN->getMemOperand()->getFlags());
41057 DCI.CombineTo(N.getNode(), BcastLd);
41058 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41059 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41060 return N; // Return N so it doesn't get rechecked!
41061 }
41062 }
41063 }
41064
41065 // vbroadcast(vzload X) -> vbroadcast_load X
41066 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41067 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41068 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41069 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41070 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41071 SDValue BcastLd =
41072 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41073 LN->getMemoryVT(), LN->getMemOperand());
41074 DCI.CombineTo(N.getNode(), BcastLd);
41075 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41076 DCI.recursivelyDeleteUnusedNodes(LN);
41077 return N; // Return N so it doesn't get rechecked!
41078 }
41079 }
41080
41081 // vbroadcast(vector load X) -> vbroadcast_load
41082 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41083 SrcVT == MVT::v4i32) &&
41084 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41085 LoadSDNode *LN = cast<LoadSDNode>(Src);
41086 // Unless the load is volatile or atomic.
41087 if (LN->isSimple()) {
41088 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41089 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41090 SDValue BcastLd = DAG.getMemIntrinsicNode(
41091 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41092 LN->getPointerInfo(), LN->getOriginalAlign(),
41093 LN->getMemOperand()->getFlags());
41094 DCI.CombineTo(N.getNode(), BcastLd);
41095 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41096 DCI.recursivelyDeleteUnusedNodes(LN);
41097 return N; // Return N so it doesn't get rechecked!
41098 }
41099 }
41100
41101 return SDValue();
41102 }
41103 case X86ISD::VZEXT_MOVL: {
41104 SDValue N0 = N.getOperand(0);
41105
41106 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41107 // the load is volatile.
41108 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41109 auto *LN = cast<LoadSDNode>(N0);
41110 if (SDValue VZLoad =
41111 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41112 DCI.CombineTo(N.getNode(), VZLoad);
41113 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41114 DCI.recursivelyDeleteUnusedNodes(LN);
41115 return N;
41116 }
41117 }
41118
41119 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41120 // and can just use a VZEXT_LOAD.
41121 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41122 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41123 auto *LN = cast<MemSDNode>(N0);
41124 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41125 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41126 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41127 SDValue VZLoad =
41128 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41129 LN->getMemoryVT(), LN->getMemOperand());
41130 DCI.CombineTo(N.getNode(), VZLoad);
41131 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41132 DCI.recursivelyDeleteUnusedNodes(LN);
41133 return N;
41134 }
41135 }
41136
41137 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41138 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41139 // if the upper bits of the i64 are zero.
41140 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41141 N0.getOperand(0).hasOneUse() &&
41142 N0.getOperand(0).getValueType() == MVT::i64) {
41143 SDValue In = N0.getOperand(0);
41144 APInt Mask = APInt::getHighBitsSet(64, 32);
41145 if (DAG.MaskedValueIsZero(In, Mask)) {
41146 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41147 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41148 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41149 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41150 return DAG.getBitcast(VT, Movl);
41151 }
41152 }
41153
41154 // Load a scalar integer constant directly to XMM instead of transferring an
41155 // immediate value from GPR.
41156 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41157 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41158 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41159 // Create a vector constant - scalar constant followed by zeros.
41160 EVT ScalarVT = N0.getOperand(0).getValueType();
41161 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41162 unsigned NumElts = VT.getVectorNumElements();
41163 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41164 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41165 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41166
41167 // Load the vector constant from constant pool.
41168 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41169 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41170 MachinePointerInfo MPI =
41171 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41172 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41173 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41174 MachineMemOperand::MOLoad);
41175 }
41176 }
41177
41178 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41179 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41180 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41181 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41182 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41183 SDValue V = peekThroughOneUseBitcasts(N0);
41184
41185 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41186 isNullConstant(V.getOperand(2))) {
41187 SDValue In = V.getOperand(1);
41188 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41189 In.getValueSizeInBits() /
41190 VT.getScalarSizeInBits());
41191 In = DAG.getBitcast(SubVT, In);
41192 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41193 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41194 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41195 V.getOperand(2));
41196 }
41197 }
41198
41199 return SDValue();
41200 }
41201 case X86ISD::BLENDI: {
41202 SDValue N0 = N.getOperand(0);
41203 SDValue N1 = N.getOperand(1);
41204
41205 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41206 // TODO: Handle MVT::v16i16 repeated blend mask.
41207 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41208 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41209 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41210 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41211 SrcVT.getScalarSizeInBits() >= 32) {
41212 unsigned BlendMask = N.getConstantOperandVal(2);
41213 unsigned Size = VT.getVectorNumElements();
41214 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41215 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41216 return DAG.getBitcast(
41217 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41218 N1.getOperand(0),
41219 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41220 }
41221 }
41222 return SDValue();
41223 }
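// Illustrative sketch (editorial, not part of the analyzed source): the mask
// scaling the BLENDI case above relies on, assuming scaleVectorShuffleBlendMask
// replicates each selection bit Scale times when the blend is rewritten over
// narrower elements. For example, a v4i64 blend mask 0b0101 becomes the v8i32
// mask 0b00110011.
#include <cassert>
#include <cstdint>

static std::uint64_t scaleBlendMaskSketch(std::uint64_t Mask, int Size,
                                          int Scale) {
  std::uint64_t Scaled = 0;
  for (int I = 0; I != Size; ++I)
    if (Mask & (1ull << I))
      Scaled |= ((1ull << Scale) - 1) << (I * Scale); // widen selection bit I
  return Scaled;
}

int main() {
  assert(scaleBlendMaskSketch(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011);
  return 0;
}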
41224 case X86ISD::SHUFP: {
41225 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41226 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41227 // TODO: Support types other than v4f32.
41228 if (VT == MVT::v4f32) {
41229 bool Updated = false;
41230 SmallVector<int> Mask;
41231 SmallVector<SDValue> Ops;
41232 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41233 Ops.size() == 2) {
41234 for (int i = 0; i != 2; ++i) {
41235 SmallVector<SDValue> SubOps;
41236 SmallVector<int> SubMask, SubScaledMask;
41237 SDValue Sub = peekThroughBitcasts(Ops[i]);
41238 // TODO: Scaling might be easier if we specify the demanded elts.
41239 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41240 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41241 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41242 int Ofs = i * 2;
41243 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41244 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41245 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41246 Updated = true;
41247 }
41248 }
41249 }
41250 if (Updated) {
41251 for (int &M : Mask)
41252 M %= 4;
41253 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41254 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41255 }
41256 }
41257 return SDValue();
41258 }
41259 case X86ISD::VPERMI: {
41260 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41261 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41262 SDValue N0 = N.getOperand(0);
41263 SDValue N1 = N.getOperand(1);
41264 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41265 if (N0.getOpcode() == ISD::BITCAST &&
41266 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41267 SDValue Src = N0.getOperand(0);
41268 EVT SrcVT = Src.getValueType();
41269 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41270 return DAG.getBitcast(VT, Res);
41271 }
41272 return SDValue();
41273 }
41274 case X86ISD::VPERM2X128: {
41275 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41276 SDValue LHS = N->getOperand(0);
41277 SDValue RHS = N->getOperand(1);
41278 if (LHS.getOpcode() == ISD::BITCAST &&
41279 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
41280 EVT SrcVT = LHS.getOperand(0).getValueType();
41281 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
41282 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
41283 DAG.getBitcast(SrcVT, LHS),
41284 DAG.getBitcast(SrcVT, RHS),
41285 N->getOperand(2)));
41286 }
41287 }
41288
41289 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
41290 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
41291 return Res;
41292
41293 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
41294 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
41295 auto FindSubVector128 = [&](unsigned Idx) {
41296 if (Idx > 3)
41297 return SDValue();
41298 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
41299 SmallVector<SDValue> SubOps;
41300 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
41301 return SubOps[Idx & 1];
41302 unsigned NumElts = Src.getValueType().getVectorNumElements();
41303 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
41304 Src.getOperand(1).getValueSizeInBits() == 128 &&
41305 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
41306 return Src.getOperand(1);
41307 }
41308 return SDValue();
41309 };
41310 unsigned Imm = N.getConstantOperandVal(2);
41311 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
41312 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
41313 MVT SubVT = VT.getHalfNumVectorElementsVT();
41314 SubLo = DAG.getBitcast(SubVT, SubLo);
41315 SubHi = DAG.getBitcast(SubVT, SubHi);
41316 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
41317 }
41318 }
41319 return SDValue();
41320 }
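// Illustrative sketch (editorial, not part of the analyzed source): how the
// VPERM2X128 immediate is consumed by the FindSubVector128 lambda above. Each
// nibble selects one 128-bit half of the result: bit 1 picks the source
// operand, bit 0 picks that operand's low/high half, and any higher bit
// (e.g. the zeroing bit) gives Idx > 3, which makes the fold bail out.
#include <cstdio>

static void decodeVPerm2X128Nibble(unsigned Idx) {
  if (Idx > 3) {
    std::printf("nibble %u: zeroed/unsupported half, no fold\n", Idx);
    return;
  }
  std::printf("nibble %u: operand %u, %s 128-bit half\n", Idx,
              Idx < 2 ? 0u : 1u, (Idx & 1) ? "high" : "low");
}

int main() {
  unsigned Imm = 0x31; // selects the high half of op0 and the high half of op1
  decodeVPerm2X128Nibble(Imm & 0x0F);        // low 128 bits of the result
  decodeVPerm2X128Nibble((Imm & 0xF0) >> 4); // high 128 bits of the result
  return 0;
}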
41321 case X86ISD::PSHUFD:
41322 case X86ISD::PSHUFLW:
41323 case X86ISD::PSHUFHW:
41324 Mask = getPSHUFShuffleMask(N);
41325 assert(Mask.size() == 4);
41326 break;
41327 case X86ISD::MOVSD:
41328 case X86ISD::MOVSH:
41329 case X86ISD::MOVSS: {
41330 SDValue N0 = N.getOperand(0);
41331 SDValue N1 = N.getOperand(1);
41332
41333 // Canonicalize scalar FPOps:
41334 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
41335 // If commutable, allow OP(N1[0], N0[0]).
41336 unsigned Opcode1 = N1.getOpcode();
41337 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
41338 Opcode1 == ISD::FDIV) {
41339 SDValue N10 = N1.getOperand(0);
41340 SDValue N11 = N1.getOperand(1);
41341 if (N10 == N0 ||
41342 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
41343 if (N10 != N0)
41344 std::swap(N10, N11);
41345 MVT SVT = VT.getVectorElementType();
41346 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
41347 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
41348 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
41349 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
41350 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
41351 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
41352 }
41353 }
41354
41355 return SDValue();
41356 }
41357 case X86ISD::INSERTPS: {
41358 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
41359 SDValue Op0 = N.getOperand(0);
41360 SDValue Op1 = N.getOperand(1);
41361 unsigned InsertPSMask = N.getConstantOperandVal(2);
41362 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
41363 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
41364 unsigned ZeroMask = InsertPSMask & 0xF;
41365
41366 // If we zero out all elements from Op0 then we don't need to reference it.
41367 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
41368 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
41369 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41370
41371 // If we zero out the element from Op1 then we don't need to reference it.
41372 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
41373 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41374 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41375
41376 // Attempt to merge insertps Op1 with an inner target shuffle node.
41377 SmallVector<int, 8> TargetMask1;
41378 SmallVector<SDValue, 2> Ops1;
41379 APInt KnownUndef1, KnownZero1;
41380 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
41381 KnownZero1)) {
41382 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
41383 // Zero/UNDEF insertion - zero out element and remove dependency.
41384 InsertPSMask |= (1u << DstIdx);
41385 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41386 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41387 }
41388 // Update insertps mask srcidx and reference the source input directly.
41389 int M = TargetMask1[SrcIdx];
41390 assert(0 <= M && M < 8 && "Shuffle index out of range");
41391 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
41392 Op1 = Ops1[M < 4 ? 0 : 1];
41393 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41394 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41395 }
41396
41397 // Attempt to merge insertps Op0 with an inner target shuffle node.
41398 SmallVector<int, 8> TargetMask0;
41399 SmallVector<SDValue, 2> Ops0;
41400 APInt KnownUndef0, KnownZero0;
41401 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
41402 KnownZero0)) {
41403 bool Updated = false;
41404 bool UseInput00 = false;
41405 bool UseInput01 = false;
41406 for (int i = 0; i != 4; ++i) {
41407 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
41408 // No change if element is already zero or the inserted element.
41409 continue;
41410 }
41411
41412 if (KnownUndef0[i] || KnownZero0[i]) {
41413 // If the target mask is undef/zero then we must zero the element.
41414 InsertPSMask |= (1u << i);
41415 Updated = true;
41416 continue;
41417 }
41418
41419 // The input vector element must be inline.
41420 int M = TargetMask0[i];
41421 if (M != i && M != (i + 4))
41422 return SDValue();
41423
41424 // Determine which inputs of the target shuffle we're using.
41425 UseInput00 |= (0 <= M && M < 4);
41426 UseInput01 |= (4 <= M);
41427 }
41428
41429 // If we're not using both inputs of the target shuffle then use the
41430 // referenced input directly.
41431 if (UseInput00 && !UseInput01) {
41432 Updated = true;
41433 Op0 = Ops0[0];
41434 } else if (!UseInput00 && UseInput01) {
41435 Updated = true;
41436 Op0 = Ops0[1];
41437 }
41438
41439 if (Updated)
41440 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41441 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41442 }
41443
41444 // If we're inserting an element from a vbroadcast load, fold the
41445 // load into the X86insertps instruction. We need to convert the scalar
41446 // load to a vector and clear the source lane of the INSERTPS control.
41447 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
41448 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
41449 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41450 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41451 MemIntr->getBasePtr(),
41452 MemIntr->getMemOperand());
41453 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
41454 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
41455 Load),
41456 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
41457 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41458 return Insert;
41459 }
41460 }
41461
41462 return SDValue();
41463 }
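// Illustrative sketch (editorial, not part of the analyzed source): the
// INSERTPS control byte decoded exactly as in the case above - bits [7:6]
// select the source element of Op1, bits [5:4] the destination lane in Op0,
// and bits [3:0] zero out individual result lanes.
#include <cstdio>

int main() {
  unsigned InsertPSMask = 0x8A;                 // 0b10'00'1010
  unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;  // element 2 of Op1
  unsigned DstIdx = (InsertPSMask >> 4) & 0x3;  // written into lane 0 of Op0
  unsigned ZeroMask = InsertPSMask & 0xF;       // lanes 1 and 3 zeroed
  std::printf("src=%u dst=%u zero=0x%X\n", SrcIdx, DstIdx, ZeroMask);
  return 0;
}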
41464 default:
41465 return SDValue();
41466 }
41467
41468 // Nuke no-op shuffles that show up after combining.
41469 if (isNoopShuffleMask(Mask))
41470 return N.getOperand(0);
41471
41472 // Look for simplifications involving one or two shuffle instructions.
41473 SDValue V = N.getOperand(0);
41474 switch (N.getOpcode()) {
41475 default:
41476 break;
41477 case X86ISD::PSHUFLW:
41478 case X86ISD::PSHUFHW:
41479 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
41480
41481 // See if this reduces to a PSHUFD which is no more expensive and can
41482 // combine with more operations. Note that it has to at least flip the
41483 // dwords as otherwise it would have been removed as a no-op.
41484 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
41485 int DMask[] = {0, 1, 2, 3};
41486 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
41487 DMask[DOffset + 0] = DOffset + 1;
41488 DMask[DOffset + 1] = DOffset + 0;
41489 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
41490 V = DAG.getBitcast(DVT, V);
41491 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
41492 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
41493 return DAG.getBitcast(VT, V);
41494 }
41495
41496 // Look for shuffle patterns which can be implemented as a single unpack.
41497 // FIXME: This doesn't handle the location of the PSHUFD generically, and
41498 // only works when we have a PSHUFD followed by two half-shuffles.
41499 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
41500 (V.getOpcode() == X86ISD::PSHUFLW ||
41501 V.getOpcode() == X86ISD::PSHUFHW) &&
41502 V.getOpcode() != N.getOpcode() &&
41503 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
41504 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
41505 if (D.getOpcode() == X86ISD::PSHUFD) {
41506 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41507 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
41508 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41509 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41510 int WordMask[8];
41511 for (int i = 0; i < 4; ++i) {
41512 WordMask[i + NOffset] = Mask[i] + NOffset;
41513 WordMask[i + VOffset] = VMask[i] + VOffset;
41514 }
41515 // Map the word mask through the DWord mask.
41516 int MappedMask[8];
41517 for (int i = 0; i < 8; ++i)
41518 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
41519 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
41520 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
41521 // We can replace all three shuffles with an unpack.
41522 V = DAG.getBitcast(VT, D.getOperand(0));
41523 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
41524 : X86ISD::UNPCKH,
41525 DL, VT, V, V);
41526 }
41527 }
41528 }
41529
41530 break;
41531
41532 case X86ISD::PSHUFD:
41533 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
41534 return NewN;
41535
41536 break;
41537 }
41538
41539 return SDValue();
41540}
41541
41542/// Checks if the shuffle mask takes subsequent elements
41543/// alternately from two vectors.
41544/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
41545static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
41546
41547 int ParitySrc[2] = {-1, -1};
41548 unsigned Size = Mask.size();
41549 for (unsigned i = 0; i != Size; ++i) {
41550 int M = Mask[i];
41551 if (M < 0)
41552 continue;
41553
41554 // Make sure we are using the matching element from the input.
41555 if ((M % Size) != i)
41556 return false;
41557
41558 // Make sure we use the same input for all elements of the same parity.
41559 int Src = M / Size;
41560 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41561 return false;
41562 ParitySrc[i % 2] = Src;
41563 }
41564
41565 // Make sure each input is used.
41566 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41567 return false;
41568
41569 Op0Even = ParitySrc[0] == 0;
41570 return true;
41571}
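// Illustrative standalone sketch (editorial, not part of the analyzed source):
// the parity check above, re-run on the masks quoted in the doc comment plus a
// mask that fails because both parities read the same input.
#include <cassert>
#include <vector>

static bool isAddSubOrSubAddMaskSketch(const std::vector<int> &Mask,
                                       bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  int Size = (int)Mask.size();
  for (int I = 0; I != Size; ++I) {
    int M = Mask[I];
    if (M < 0)
      continue;
    if ((M % Size) != I)  // must use the matching element of the input
      return false;
    int Src = M / Size;   // which of the two inputs this element comes from
    if (ParitySrc[I % 2] >= 0 && ParitySrc[I % 2] != Src)
      return false;
    ParitySrc[I % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even;
  assert(isAddSubOrSubAddMaskSketch({0, 5, 2, 7}, Op0Even) && Op0Even);
  assert(isAddSubOrSubAddMaskSketch({4, 1, 6, 3}, Op0Even) && !Op0Even);
  assert(!isAddSubOrSubAddMaskSketch({0, 1, 2, 3}, Op0Even));
  return 0;
}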
41572
41573 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
41574 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
41575 /// are written to the parameters \p Opnd0 and \p Opnd1.
41576///
41577 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
41578 /// nodes so it is easier to match generically. We also insert dummy vector shuffle
41579 /// nodes for the operands which explicitly discard the lanes which are unused
41580 /// by this operation, so the fact that they're unused can flow through the
41581 /// rest of the combiner.
41582static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41583 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41584 bool &IsSubAdd) {
41585
41586 EVT VT = N->getValueType(0);
41587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41588 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41589 !VT.getSimpleVT().isFloatingPoint())
41590 return false;
41591
41592 // We only handle target-independent shuffles.
41593 // FIXME: It would be easy and harmless to use the target shuffle mask
41594 // extraction tool to support more.
41595 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41596 return false;
41597
41598 SDValue V1 = N->getOperand(0);
41599 SDValue V2 = N->getOperand(1);
41600
41601 // Make sure we have an FADD and an FSUB.
41602 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41603 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41604 V1.getOpcode() == V2.getOpcode())
41605 return false;
41606
41607 // If there are other uses of these operations we can't fold them.
41608 if (!V1->hasOneUse() || !V2->hasOneUse())
41609 return false;
41610
41611 // Ensure that both operations have the same operands. Note that we can
41612 // commute the FADD operands.
41613 SDValue LHS, RHS;
41614 if (V1.getOpcode() == ISD::FSUB) {
41615 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41616 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41617 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41618 return false;
41619 } else {
41620 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41621 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41622 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41623 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41624 return false;
41625 }
41626
41627 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41628 bool Op0Even;
41629 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41630 return false;
41631
41632 // It's a subadd if the vector in the even parity is an FADD.
41633 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41634 : V2->getOpcode() == ISD::FADD;
41635
41636 Opnd0 = LHS;
41637 Opnd1 = RHS;
41638 return true;
41639}
41640
41641/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41642static SDValue combineShuffleToFMAddSub(SDNode *N,
41643 const X86Subtarget &Subtarget,
41644 SelectionDAG &DAG) {
41645 // We only handle target-independent shuffles.
41646 // FIXME: It would be easy and harmless to use the target shuffle mask
41647 // extraction tool to support more.
41648 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41649 return SDValue();
41650
41651 MVT VT = N->getSimpleValueType(0);
41652 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41653 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41654 return SDValue();
41655
41656 // We're trying to match shuffle(fma(a, b, c), X86ISD::FMSUB(a, b, c)).
41657 SDValue Op0 = N->getOperand(0);
41658 SDValue Op1 = N->getOperand(1);
41659 SDValue FMAdd = Op0, FMSub = Op1;
41660 if (FMSub.getOpcode() != X86ISD::FMSUB)
41661 std::swap(FMAdd, FMSub);
41662
41663 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41664 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41665 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41666 FMAdd.getOperand(2) != FMSub.getOperand(2))
41667 return SDValue();
41668
41669 // Check for correct shuffle mask.
41670 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41671 bool Op0Even;
41672 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41673 return SDValue();
41674
41675 // FMAddSub takes zeroth operand from FMSub node.
41676 SDLoc DL(N);
41677 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41678 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41679 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41680 FMAdd.getOperand(2));
41681}
41682
41683/// Try to combine a shuffle into a target-specific add-sub or
41684/// mul-add-sub node.
41685static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
41686 const X86Subtarget &Subtarget,
41687 SelectionDAG &DAG) {
41688 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
41689 return V;
41690
41691 SDValue Opnd0, Opnd1;
41692 bool IsSubAdd;
41693 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41694 return SDValue();
41695
41696 MVT VT = N->getSimpleValueType(0);
41697 SDLoc DL(N);
41698
41699 // Try to generate X86ISD::FMADDSUB node here.
41700 SDValue Opnd2;
41701 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41702 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41703 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41704 }
41705
41706 if (IsSubAdd)
41707 return SDValue();
41708
41709 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41710 // the ADDSUB idiom has been successfully recognized. There are no known
41711 // X86 targets with 512-bit ADDSUB instructions!
41712 if (VT.is512BitVector())
41713 return SDValue();
41714
41715 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41716 // the ADDSUB idiom has been successfully recognized. There are no known
41717 // X86 targets with FP16 ADDSUB instructions!
41718 if (VT.getVectorElementType() == MVT::f16)
41719 return SDValue();
41720
41721 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41722}
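// Illustrative sketch (editorial, not part of the analyzed source): the lane
// behaviour the combine above is targeting. X86 ADDSUB subtracts in the even
// lanes and adds in the odd lanes; the reversed SUBADD form is only emitted
// through FMSUBADD in the function above.
#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40}, R{};
  for (int I = 0; I != 4; ++I)
    R[I] = (I % 2 == 0) ? A[I] - B[I] : A[I] + B[I]; // ADDSUB lane pattern
  assert((R == std::array<float, 4>{-9, 22, -27, 44}));
  return 0;
}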
41723
41724// We are looking for a shuffle where both sources are concatenated with undef
41725// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41726// if we can express this as a single-source shuffle, that's preferable.
41727static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
41728 const X86Subtarget &Subtarget) {
41729 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41730 return SDValue();
41731
41732 EVT VT = N->getValueType(0);
41733
41734 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41735 if (!VT.is128BitVector() && !VT.is256BitVector())
41736 return SDValue();
41737
41738 if (VT.getVectorElementType() != MVT::i32 &&
41739 VT.getVectorElementType() != MVT::i64 &&
41740 VT.getVectorElementType() != MVT::f32 &&
41741 VT.getVectorElementType() != MVT::f64)
41742 return SDValue();
41743
41744 SDValue N0 = N->getOperand(0);
41745 SDValue N1 = N->getOperand(1);
41746
41747 // Check that both sources are concats with undef.
41748 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41749 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41750 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41751 !N1.getOperand(1).isUndef())
41752 return SDValue();
41753
41754 // Construct the new shuffle mask. Elements from the first source retain their
41755 // index, but elements from the second source no longer need to skip an undef.
41756 SmallVector<int, 8> Mask;
41757 int NumElts = VT.getVectorNumElements();
41758
41759 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
41760 for (int Elt : SVOp->getMask())
41761 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41762
41763 SDLoc DL(N);
41764 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41765 N1.getOperand(0));
41766 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41767}
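// Illustrative sketch (editorial, not part of the analyzed source): the mask
// remapping above for a 4-element output where t1 and t2 each supply 2
// elements. Indices that pointed into concat(t2, undef) (>= NumElts) are
// rebased onto the second half of the new concat(t1, t2) source by
// subtracting NumElts / 2.
#include <cassert>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> OldMask = {0, 4, 1, 5}; // interleave t1 and t2
  std::vector<int> NewMask;
  for (int Elt : OldMask)
    NewMask.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  assert((NewMask == std::vector<int>{0, 2, 1, 3}));
  return 0;
}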
41768
41769/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41770/// low half of each source vector and does not set any high half elements in
41771/// the destination vector, narrow the shuffle to half its original size.
41772static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41773 if (!Shuf->getValueType(0).isSimple())
41774 return SDValue();
41775 MVT VT = Shuf->getSimpleValueType(0);
41776 if (!VT.is256BitVector() && !VT.is512BitVector())
41777 return SDValue();
41778
41779 // See if we can ignore all of the high elements of the shuffle.
41780 ArrayRef<int> Mask = Shuf->getMask();
41781 if (!isUndefUpperHalf(Mask))
41782 return SDValue();
41783
41784 // Check if the shuffle mask accesses only the low half of each input vector
41785 // (half-index output is 0 or 2).
41786 int HalfIdx1, HalfIdx2;
41787 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41788 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41789 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41790 return SDValue();
41791
41792 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41793 // The trick is knowing that all of the insert/extract are actually free
41794 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41795 // of narrow inputs into a narrow output, and that is always cheaper than
41796 // the wide shuffle that we started with.
41797 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41798 Shuf->getOperand(1), HalfMask, HalfIdx1,
41799 HalfIdx2, false, DAG, /*UseConcat*/true);
41800}
41801
41802static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41803 TargetLowering::DAGCombinerInfo &DCI,
41804 const X86Subtarget &Subtarget) {
41805 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41806 if (SDValue V = narrowShuffle(Shuf, DAG))
41807 return V;
41808
41809 // If we have legalized the vector types, look for blends of FADD and FSUB
41810 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41811 SDLoc dl(N);
41812 EVT VT = N->getValueType(0);
41813 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41814 if (TLI.isTypeLegal(VT))
41815 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
41816 return AddSub;
41817
41818 // Attempt to combine into a vector load/broadcast.
41819 if (SDValue LD = combineToConsecutiveLoads(
41820 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41821 return LD;
41822
41823 // For AVX2, we sometimes want to combine
41824 // (vector_shuffle <mask> (concat_vectors t1, undef)
41825 // (concat_vectors t2, undef))
41826 // Into:
41827 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41828 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41829 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
41830 return ShufConcat;
41831
41832 if (isTargetShuffle(N->getOpcode())) {
41833 SDValue Op(N, 0);
41834 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
41835 return Shuffle;
41836
41837 // Try recursively combining arbitrary sequences of x86 shuffle
41838 // instructions into higher-order shuffles. We do this after combining
41839 // specific PSHUF instruction sequences into their minimal form so that we
41840 // can evaluate how many specialized shuffle instructions are involved in
41841 // a particular chain.
41842 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41843 return Res;
41844
41845 // Simplify source operands based on shuffle mask.
41846 // TODO - merge this into combineX86ShufflesRecursively.
41847 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41848 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41849 return SDValue(N, 0);
41850
41851 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41852 // Perform this after other shuffle combines to allow inner shuffles to be
41853 // combined away first.
41854 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
41855 return BinOp;
41856 }
41857
41858 return SDValue();
41859}
41860
41861// Simplify variable target shuffle masks based on the demanded elements.
41862// TODO: Handle DemandedBits in mask indices as well?
41863bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41864 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41865 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41866 // If we're demanding all elements don't bother trying to simplify the mask.
41867 unsigned NumElts = DemandedElts.getBitWidth();
41868 if (DemandedElts.isAllOnes())
41869 return false;
41870
41871 SDValue Mask = Op.getOperand(MaskIndex);
41872 if (!Mask.hasOneUse())
41873 return false;
41874
41875 // Attempt to generically simplify the variable shuffle mask.
41876 APInt MaskUndef, MaskZero;
41877 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41878 Depth + 1))
41879 return true;
41880
41881 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41882 // TODO: Support other types from getTargetShuffleMaskIndices?
41883 SDValue BC = peekThroughOneUseBitcasts(Mask);
41884 EVT BCVT = BC.getValueType();
41885 auto *Load = dyn_cast<LoadSDNode>(BC);
41886 if (!Load)
41887 return false;
41888
41889 const Constant *C = getTargetConstantFromNode(Load);
41890 if (!C)
41891 return false;
41892
41893 Type *CTy = C->getType();
41894 if (!CTy->isVectorTy() ||
41895 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41896 return false;
41897
41898 // Handle scaling for i64 elements on 32-bit targets.
41899 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41900 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41901 return false;
41902 unsigned Scale = NumCstElts / NumElts;
41903
41904 // Simplify mask if we have an undemanded element that is not undef.
41905 bool Simplified = false;
41906 SmallVector<Constant *, 32> ConstVecOps;
41907 for (unsigned i = 0; i != NumCstElts; ++i) {
41908 Constant *Elt = C->getAggregateElement(i);
41909 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41910 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41911 Simplified = true;
41912 continue;
41913 }
41914 ConstVecOps.push_back(Elt);
41915 }
41916 if (!Simplified)
41917 return false;
41918
41919 // Generate new constant pool entry + legalize immediately for the load.
41920 SDLoc DL(Op);
41921 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41922 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41923 SDValue NewMask = TLO.DAG.getLoad(
41924 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41925 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41926 Load->getAlign());
41927 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41928}
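// Illustrative sketch (editorial, not part of the analyzed source): the Scale
// handling above for a v2i64 shuffle whose constant mask was emitted as
// 4 x i32 on a 32-bit target. Each demanded 64-bit element covers Scale
// consecutive constant entries, so entry i is kept iff element i / Scale is
// demanded and is otherwise rewritten to undef.
#include <cstdio>

int main() {
  const unsigned NumElts = 2, NumCstElts = 4;
  const unsigned Scale = NumCstElts / NumElts;  // 2
  const bool Demanded[NumElts] = {true, false}; // only element 0 is demanded
  for (unsigned I = 0; I != NumCstElts; ++I)
    std::printf("constant entry %u -> %s\n", I,
                Demanded[I / Scale] ? "kept" : "replaced with undef");
  return 0;
}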
41929
41930bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41931 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41932 TargetLoweringOpt &TLO, unsigned Depth) const {
41933 int NumElts = DemandedElts.getBitWidth();
41934 unsigned Opc = Op.getOpcode();
41935 EVT VT = Op.getValueType();
41936
41937 // Handle special case opcodes.
41938 switch (Opc) {
41939 case X86ISD::PMULDQ:
41940 case X86ISD::PMULUDQ: {
41941 APInt LHSUndef, LHSZero;
41942 APInt RHSUndef, RHSZero;
41943 SDValue LHS = Op.getOperand(0);
41944 SDValue RHS = Op.getOperand(1);
41945 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41946 Depth + 1))
41947 return true;
41948 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41949 Depth + 1))
41950 return true;
41951 // Multiply by zero.
41952 KnownZero = LHSZero | RHSZero;
41953 break;
41954 }
41955 case X86ISD::VPMADDWD: {
41956 APInt LHSUndef, LHSZero;
41957 APInt RHSUndef, RHSZero;
41958 SDValue LHS = Op.getOperand(0);
41959 SDValue RHS = Op.getOperand(1);
41960 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41961
41962 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41963 Depth + 1))
41964 return true;
41965 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41966 Depth + 1))
41967 return true;
41968
41969 // TODO: Multiply by zero.
41970
41971 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41972 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41973 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41974 Depth + 1))
41975 return true;
41976 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41977 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41978 Depth + 1))
41979 return true;
41980 break;
41981 }
41982 case X86ISD::PSADBW: {
41983 SDValue LHS = Op.getOperand(0);
41984 SDValue RHS = Op.getOperand(1);
41985 assert(VT.getScalarType() == MVT::i64 &&
41986 LHS.getValueType() == RHS.getValueType() &&
41987 LHS.getValueType().getScalarType() == MVT::i8 &&
41988 "Unexpected PSADBW types");
41989
41990 // Aggressively peek through ops to get at the demanded elts.
41991 if (!DemandedElts.isAllOnes()) {
41992 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41993 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41994 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41995 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41996 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41997 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41998 if (NewLHS || NewRHS) {
41999 NewLHS = NewLHS ? NewLHS : LHS;
42000 NewRHS = NewRHS ? NewRHS : RHS;
42001 return TLO.CombineTo(
42002 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42003 }
42004 }
42005 break;
42006 }
42007 case X86ISD::VSHL:
42008 case X86ISD::VSRL:
42009 case X86ISD::VSRA: {
42010 // We only need the bottom 64-bits of the (128-bit) shift amount.
42011 SDValue Amt = Op.getOperand(1);
42012 MVT AmtVT = Amt.getSimpleValueType();
42013 assert(AmtVT.is128BitVector() && "Unexpected value type");
42014
42015 // If the shift amount is only ever reused as an SSE shift amount then we know
42016 // that only the bottom 64-bits are ever used.
42017 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42018 unsigned UseOpc = Use->getOpcode();
42019 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42020 UseOpc == X86ISD::VSRA) &&
42021 Use->getOperand(0) != Amt;
42022 });
42023
42024 APInt AmtUndef, AmtZero;
42025 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42026 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42027 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42028 Depth + 1, AssumeSingleUse))
42029 return true;
42030 [[fallthrough]];
42031 }
42032 case X86ISD::VSHLI:
42033 case X86ISD::VSRLI:
42034 case X86ISD::VSRAI: {
42035 SDValue Src = Op.getOperand(0);
42036 APInt SrcUndef;
42037 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42038 Depth + 1))
42039 return true;
42040
42041 // Fold shift(0,x) -> 0
42042 if (DemandedElts.isSubsetOf(KnownZero))
42043 return TLO.CombineTo(
42044 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42045
42046 // Aggressively peek through ops to get at the demanded elts.
42047 if (!DemandedElts.isAllOnes())
42048 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42049 Src, DemandedElts, TLO.DAG, Depth + 1))
42050 return TLO.CombineTo(
42051 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42052 break;
42053 }
42054 case X86ISD::VPSHA:
42055 case X86ISD::VPSHL:
42056 case X86ISD::VSHLV:
42057 case X86ISD::VSRLV:
42058 case X86ISD::VSRAV: {
42059 APInt LHSUndef, LHSZero;
42060 APInt RHSUndef, RHSZero;
42061 SDValue LHS = Op.getOperand(0);
42062 SDValue RHS = Op.getOperand(1);
42063 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42064 Depth + 1))
42065 return true;
42066
42067 // Fold shift(0,x) -> 0
42068 if (DemandedElts.isSubsetOf(LHSZero))
42069 return TLO.CombineTo(
42070 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42071
42072 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42073 Depth + 1))
42074 return true;
42075
42076 KnownZero = LHSZero;
42077 break;
42078 }
42079 case X86ISD::KSHIFTL: {
42080 SDValue Src = Op.getOperand(0);
42081 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42082 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42083 unsigned ShiftAmt = Amt->getZExtValue();
42084
42085 if (ShiftAmt == 0)
42086 return TLO.CombineTo(Op, Src);
42087
42088 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42089 // single shift. We can do this if the bottom bits (which are shifted
42090 // out) are never demanded.
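        // For example, kshiftl(kshiftr(X, 2), 5) with the low 5 elements
        // undemanded becomes kshiftl(X, 3); if C1 > ShAmt the merged node is a
        // kshiftr by the difference instead.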
42091 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42092 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42093 unsigned C1 = Src.getConstantOperandVal(1);
42094 unsigned NewOpc = X86ISD::KSHIFTL;
42095 int Diff = ShiftAmt - C1;
42096 if (Diff < 0) {
42097 Diff = -Diff;
42098 NewOpc = X86ISD::KSHIFTR;
42099 }
42100
42101 SDLoc dl(Op);
42102 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42103 return TLO.CombineTo(
42104 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42105 }
42106 }
42107
42108 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42109 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42110 Depth + 1))
42111 return true;
42112
42113 KnownUndef <<= ShiftAmt;
42114 KnownZero <<= ShiftAmt;
42115 KnownZero.setLowBits(ShiftAmt);
42116 break;
42117 }
42118 case X86ISD::KSHIFTR: {
42119 SDValue Src = Op.getOperand(0);
42120 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42121 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42122 unsigned ShiftAmt = Amt->getZExtValue();
42123
42124 if (ShiftAmt == 0)
42125 return TLO.CombineTo(Op, Src);
42126
42127 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42128 // single shift. We can do this if the top bits (which are shifted
42129 // out) are never demanded.
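        // For example, kshiftr(kshiftl(X, 2), 5) with the top 5 elements
        // undemanded becomes kshiftr(X, 3).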
42130 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42131 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42132 unsigned C1 = Src.getConstantOperandVal(1);
42133 unsigned NewOpc = X86ISD::KSHIFTR;
42134 int Diff = ShiftAmt - C1;
42135 if (Diff < 0) {
42136 Diff = -Diff;
42137 NewOpc = X86ISD::KSHIFTL;
42138 }
42139
42140 SDLoc dl(Op);
42141 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42142 return TLO.CombineTo(
42143 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42144 }
42145 }
42146
42147 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42148 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42149 Depth + 1))
42150 return true;
42151
42152 KnownUndef.lshrInPlace(ShiftAmt);
42153 KnownZero.lshrInPlace(ShiftAmt);
42154 KnownZero.setHighBits(ShiftAmt);
42155 break;
42156 }
42157 case X86ISD::ANDNP: {
42158 // ANDNP = (~LHS & RHS);
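        // If one operand is constant, only the bits/elements that can survive
        // the AND are demanded from the other operand: a zero RHS element (or
        // an all-ones LHS element, since the LHS is inverted) means the
        // matching element of the other operand isn't demanded at all.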
42159 SDValue LHS = Op.getOperand(0);
42160 SDValue RHS = Op.getOperand(1);
42161
42162 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42163 APInt UndefElts;
42164 SmallVector<APInt> EltBits;
42165 int NumElts = VT.getVectorNumElements();
42166 int EltSizeInBits = VT.getScalarSizeInBits();
42167 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42168 APInt OpElts = DemandedElts;
42169 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42170 EltBits)) {
42171 OpBits.clearAllBits();
42172 OpElts.clearAllBits();
42173 for (int I = 0; I != NumElts; ++I) {
42174 if (!DemandedElts[I])
42175 continue;
42176 if (UndefElts[I]) {
42177 // We can't assume an undef src element gives an undef dst - the
42178 // other src might be zero.
42179 OpBits.setAllBits();
42180 OpElts.setBit(I);
42181 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42182 (!Invert && !EltBits[I].isZero())) {
42183 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42184 OpElts.setBit(I);
42185 }
42186 }
42187 }
42188 return std::make_pair(OpBits, OpElts);
42189 };
42190 APInt BitsLHS, EltsLHS;
42191 APInt BitsRHS, EltsRHS;
42192 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42193 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42194
42195 APInt LHSUndef, LHSZero;
42196 APInt RHSUndef, RHSZero;
42197 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42198 Depth + 1))
42199 return true;
42200 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42201 Depth + 1))
42202 return true;
42203
42204 if (!DemandedElts.isAllOnes()) {
42205 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42206 TLO.DAG, Depth + 1);
42207 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42208 TLO.DAG, Depth + 1);
42209 if (NewLHS || NewRHS) {
42210 NewLHS = NewLHS ? NewLHS : LHS;
42211 NewRHS = NewRHS ? NewRHS : RHS;
42212 return TLO.CombineTo(
42213 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42214 }
42215 }
42216 break;
42217 }
42218 case X86ISD::CVTSI2P:
42219 case X86ISD::CVTUI2P: {
42220 SDValue Src = Op.getOperand(0);
42221 MVT SrcVT = Src.getSimpleValueType();
42222 APInt SrcUndef, SrcZero;
42223 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42224 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42225 Depth + 1))
42226 return true;
42227 break;
42228 }
42229 case X86ISD::PACKSS:
42230 case X86ISD::PACKUS: {
42231 SDValue N0 = Op.getOperand(0);
42232 SDValue N1 = Op.getOperand(1);
42233
42234 APInt DemandedLHS, DemandedRHS;
42235 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42236
42237 APInt LHSUndef, LHSZero;
42238 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42239 Depth + 1))
42240 return true;
42241 APInt RHSUndef, RHSZero;
42242 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42243 Depth + 1))
42244 return true;
42245
42246 // TODO - pass on known zero/undef.
42247
42248 // Aggressively peek through ops to get at the demanded elts.
42249 // TODO - we should do this for all target/faux shuffles ops.
42250 if (!DemandedElts.isAllOnes()) {
42251 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42252 TLO.DAG, Depth + 1);
42253 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42254 TLO.DAG, Depth + 1);
42255 if (NewN0 || NewN1) {
42256 NewN0 = NewN0 ? NewN0 : N0;
42257 NewN1 = NewN1 ? NewN1 : N1;
42258 return TLO.CombineTo(Op,
42259 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42260 }
42261 }
42262 break;
42263 }
42264 case X86ISD::HADD:
42265 case X86ISD::HSUB:
42266 case X86ISD::FHADD:
42267 case X86ISD::FHSUB: {
42268 SDValue N0 = Op.getOperand(0);
42269 SDValue N1 = Op.getOperand(1);
42270
42271 APInt DemandedLHS, DemandedRHS;
42272 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42273
42274 APInt LHSUndef, LHSZero;
42275 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42276 Depth + 1))
42277 return true;
42278 APInt RHSUndef, RHSZero;
42279 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42280 Depth + 1))
42281 return true;
42282
42283 // TODO - pass on known zero/undef.
42284
42285 // Aggressively peek through ops to get at the demanded elts.
42286 // TODO: Handle repeated operands.
42287 if (N0 != N1 && !DemandedElts.isAllOnes()) {
42288 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42289 TLO.DAG, Depth + 1);
42290 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42291 TLO.DAG, Depth + 1);
42292 if (NewN0 || NewN1) {
42293 NewN0 = NewN0 ? NewN0 : N0;
42294 NewN1 = NewN1 ? NewN1 : N1;
42295 return TLO.CombineTo(Op,
42296 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42297 }
42298 }
42299 break;
42300 }
42301 case X86ISD::VTRUNC:
42302 case X86ISD::VTRUNCS:
42303 case X86ISD::VTRUNCUS: {
42304 SDValue Src = Op.getOperand(0);
42305 MVT SrcVT = Src.getSimpleValueType();
42306 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42307 APInt SrcUndef, SrcZero;
42308 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
42309 Depth + 1))
42310 return true;
42311 KnownZero = SrcZero.zextOrTrunc(NumElts);
42312 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
42313 break;
42314 }
42315 case X86ISD::BLENDV: {
42316 APInt SelUndef, SelZero;
42317 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
42318 SelZero, TLO, Depth + 1))
42319 return true;
42320
42321 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
42322 APInt LHSUndef, LHSZero;
42323 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
42324 LHSZero, TLO, Depth + 1))
42325 return true;
42326
42327 APInt RHSUndef, RHSZero;
42328 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
42329 RHSZero, TLO, Depth + 1))
42330 return true;
42331
42332 KnownZero = LHSZero & RHSZero;
42333 KnownUndef = LHSUndef & RHSUndef;
42334 break;
42335 }
42336 case X86ISD::VZEXT_MOVL: {
42337 // If upper demanded elements are already zero then we have nothing to do.
42338 SDValue Src = Op.getOperand(0);
42339 APInt DemandedUpperElts = DemandedElts;
42340 DemandedUpperElts.clearLowBits(1);
42341 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
42342 return TLO.CombineTo(Op, Src);
42343 break;
42344 }
42345 case X86ISD::VBROADCAST: {
42346 SDValue Src = Op.getOperand(0);
42347 MVT SrcVT = Src.getSimpleValueType();
42348 if (!SrcVT.isVector())
42349 break;
42350 // Don't bother broadcasting if we just need the 0'th element.
42351 if (DemandedElts == 1) {
42352 if (Src.getValueType() != VT)
42353 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
42354 SDLoc(Op));
42355 return TLO.CombineTo(Op, Src);
42356 }
42357 APInt SrcUndef, SrcZero;
42358 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
42359 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42360 Depth + 1))
42361 return true;
42362 // Aggressively peek through src to get at the demanded elt.
42363 // TODO - we should do this for all target/faux shuffles ops.
42364 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42365 Src, SrcElts, TLO.DAG, Depth + 1))
42366 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42367 break;
42368 }
42369 case X86ISD::VPERMV:
42370 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
42371 Depth))
42372 return true;
42373 break;
42374 case X86ISD::PSHUFB:
42375 case X86ISD::VPERMV3:
42376 case X86ISD::VPERMILPV:
42377 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
42378 Depth))
42379 return true;
42380 break;
42381 case X86ISD::VPPERM:
42382 case X86ISD::VPERMIL2:
42383 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
42384 Depth))
42385 return true;
42386 break;
42387 }
42388
42389 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42390 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42391 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42392 if ((VT.is256BitVector() || VT.is512BitVector()) &&
42393 DemandedElts.lshr(NumElts / 2) == 0) {
42394 unsigned SizeInBits = VT.getSizeInBits();
42395 unsigned ExtSizeInBits = SizeInBits / 2;
42396
42397 // See if 512-bit ops only use the bottom 128-bits.
42398 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
42399 ExtSizeInBits = SizeInBits / 4;
42400
42401 switch (Opc) {
42402 // Scalar broadcast.
42403 case X86ISD::VBROADCAST: {
42404 SDLoc DL(Op);
42405 SDValue Src = Op.getOperand(0);
42406 if (Src.getValueSizeInBits() > ExtSizeInBits)
42407 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
42408 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42409 ExtSizeInBits / VT.getScalarSizeInBits());
42410 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
42411 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42412 TLO.DAG, DL, ExtSizeInBits));
42413 }
42414 case X86ISD::VBROADCAST_LOAD: {
42415 SDLoc DL(Op);
42416 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42417 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42418 ExtSizeInBits / VT.getScalarSizeInBits());
42419 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
42420 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42421 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
42422 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42423 MemIntr->getMemOperand());
42424 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42425 Bcst.getValue(1));
42426 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42427 TLO.DAG, DL, ExtSizeInBits));
42428 }
42429 // Subvector broadcast.
42430 case X86ISD::SUBV_BROADCAST_LOAD: {
42431 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42432 EVT MemVT = MemIntr->getMemoryVT();
42433 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
42434 SDLoc DL(Op);
42435 SDValue Ld =
42436 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42437 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42438 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42439 Ld.getValue(1));
42440 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
42441 TLO.DAG, DL, ExtSizeInBits));
42442 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
42443 SDLoc DL(Op);
42444 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42445 ExtSizeInBits / VT.getScalarSizeInBits());
42446 if (SDValue BcstLd =
42447 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
42448 return TLO.CombineTo(Op,
42449 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
42450 TLO.DAG, DL, ExtSizeInBits));
42451 }
42452 break;
42453 }
42454 // Byte shifts by immediate.
42455 case X86ISD::VSHLDQ:
42456 case X86ISD::VSRLDQ:
42457 // Shift by uniform.
42458 case X86ISD::VSHL:
42459 case X86ISD::VSRL:
42460 case X86ISD::VSRA:
42461 // Shift by immediate.
42462 case X86ISD::VSHLI:
42463 case X86ISD::VSRLI:
42464 case X86ISD::VSRAI: {
42465 SDLoc DL(Op);
42466 SDValue Ext0 =
42467 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
42468 SDValue ExtOp =
42469 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
42470 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42471 SDValue Insert =
42472 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42473 return TLO.CombineTo(Op, Insert);
42474 }
42475 case X86ISD::VPERMI: {
42476 // Simplify PERMPD/PERMQ to extract_subvector.
42477 // TODO: This should be done in shuffle combining.
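        // For example, a v4i64 VPERMQ with immediate 0xEE (mask <2,3,2,3>)
        // whose upper elements aren't demanded becomes an extract of the high
        // 128-bit subvector.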
42478 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
42479 SmallVector<int, 4> Mask;
42480 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
42481 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
42482 SDLoc DL(Op);
42483 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
42484 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42485 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
42486 return TLO.CombineTo(Op, Insert);
42487 }
42488 }
42489 break;
42490 }
42491 case X86ISD::VPERM2X128: {
42492 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
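        // Only the low 4 immediate bits matter here: bit 3 zeroes the low
        // result lane, bit 1 selects the source operand, and bit 0 selects its
        // upper or lower 128-bit half.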
42493 SDLoc DL(Op);
42494 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
42495 if (LoMask & 0x8)
42496 return TLO.CombineTo(
42497 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
42498 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
42499 unsigned SrcIdx = (LoMask & 0x2) >> 1;
42500 SDValue ExtOp =
42501 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
42502 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42503 SDValue Insert =
42504 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42505 return TLO.CombineTo(Op, Insert);
42506 }
42507 // Zero upper elements.
42508 case X86ISD::VZEXT_MOVL:
42509 // Target unary shuffles by immediate:
42510 case X86ISD::PSHUFD:
42511 case X86ISD::PSHUFLW:
42512 case X86ISD::PSHUFHW:
42513 case X86ISD::VPERMILPI:
42514 // (Non-Lane Crossing) Target Shuffles.
42515 case X86ISD::VPERMILPV:
42516 case X86ISD::VPERMIL2:
42517 case X86ISD::PSHUFB:
42518 case X86ISD::UNPCKL:
42519 case X86ISD::UNPCKH:
42520 case X86ISD::BLENDI:
42521 // Integer ops.
42522 case X86ISD::PACKSS:
42523 case X86ISD::PACKUS:
42524 // Horizontal Ops.
42525 case X86ISD::HADD:
42526 case X86ISD::HSUB:
42527 case X86ISD::FHADD:
42528 case X86ISD::FHSUB: {
42529 SDLoc DL(Op);
42530 SmallVector<SDValue, 4> Ops;
42531 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42532 SDValue SrcOp = Op.getOperand(i);
42533 EVT SrcVT = SrcOp.getValueType();
42534 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42535 "Unsupported vector size");
42536 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42537 ExtSizeInBits)
42538 : SrcOp);
42539 }
42540 MVT ExtVT = VT.getSimpleVT();
42541 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42542 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42543 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42544 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42545 SDValue Insert =
42546 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42547 return TLO.CombineTo(Op, Insert);
42548 }
42549 }
42550 }
42551
42552 // For splats, unless we *only* demand the 0'th element,
42553 // stop attempts at simplification here; we aren't going to improve things,
42554 // and this is better than any potential shuffle.
42555 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42556 return false;
42557
42558 // Get target/faux shuffle mask.
42559 APInt OpUndef, OpZero;
42560 SmallVector<int, 64> OpMask;
42561 SmallVector<SDValue, 2> OpInputs;
42562 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42563 OpZero, TLO.DAG, Depth, false))
42564 return false;
42565
42566 // Shuffle inputs must be the same size as the result.
42567 if (OpMask.size() != (unsigned)NumElts ||
42568 llvm::any_of(OpInputs, [VT](SDValue V) {
42569 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42570 !V.getValueType().isVector();
42571 }))
42572 return false;
42573
42574 KnownZero = OpZero;
42575 KnownUndef = OpUndef;
42576
42577 // Check if shuffle mask can be simplified to undef/zero/identity.
42578 int NumSrcs = OpInputs.size();
42579 for (int i = 0; i != NumElts; ++i)
42580 if (!DemandedElts[i])
42581 OpMask[i] = SM_SentinelUndef;
42582
42583 if (isUndefInRange(OpMask, 0, NumElts)) {
42584 KnownUndef.setAllBits();
42585 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42586 }
42587 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42588 KnownZero.setAllBits();
42589 return TLO.CombineTo(
42590 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42591 }
42592 for (int Src = 0; Src != NumSrcs; ++Src)
42593 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42594 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42595
42596 // Attempt to simplify inputs.
42597 for (int Src = 0; Src != NumSrcs; ++Src) {
42598 // TODO: Support inputs of different types.
42599 if (OpInputs[Src].getValueType() != VT)
42600 continue;
42601
42602 int Lo = Src * NumElts;
42603 APInt SrcElts = APInt::getZero(NumElts);
42604 for (int i = 0; i != NumElts; ++i)
42605 if (DemandedElts[i]) {
42606 int M = OpMask[i] - Lo;
42607 if (0 <= M && M < NumElts)
42608 SrcElts.setBit(M);
42609 }
42610
42611 // TODO - Propagate input undef/zero elts.
42612 APInt SrcUndef, SrcZero;
42613 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42614 TLO, Depth + 1))
42615 return true;
42616 }
42617
42618 // If we don't demand all elements, then attempt to combine to a simpler
42619 // shuffle.
42620 // We need to convert the depth to something combineX86ShufflesRecursively
42621 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
42622 // to match. This prevents combineX86ShuffleChain from returning a
42623 // combined shuffle that's the same as the original root, causing an
42624 // infinite loop.
42625 if (!DemandedElts.isAllOnes()) {
42626 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42627
42628 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42629 for (int i = 0; i != NumElts; ++i)
42630 if (DemandedElts[i])
42631 DemandedMask[i] = i;
42632
42633 SDValue NewShuffle = combineX86ShufflesRecursively(
42634 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42635 /*HasVarMask*/ false,
42636 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42637 Subtarget);
42638 if (NewShuffle)
42639 return TLO.CombineTo(Op, NewShuffle);
42640 }
42641
42642 return false;
42643}
42644
42645bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42646 SDValue Op, const APInt &OriginalDemandedBits,
42647 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42648 unsigned Depth) const {
42649 EVT VT = Op.getValueType();
42650 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42651 unsigned Opc = Op.getOpcode();
42652 switch(Opc) {
42653 case X86ISD::VTRUNC: {
42654 KnownBits KnownOp;
42655 SDValue Src = Op.getOperand(0);
42656 MVT SrcVT = Src.getSimpleValueType();
42657
42658 // Simplify the input, using demanded bit information.
42659 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42660 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42661 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42662 return true;
42663 break;
42664 }
42665 case X86ISD::PMULDQ:
42666 case X86ISD::PMULUDQ: {
42667 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42668 KnownBits KnownLHS, KnownRHS;
42669 SDValue LHS = Op.getOperand(0);
42670 SDValue RHS = Op.getOperand(1);
42671
42672 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42673 // FIXME: Can we bound this better?
42674 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42675 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42676 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42677
42678 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42679 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42680 DemandedMaskLHS = DemandedMask;
42681 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42682 DemandedMaskRHS = DemandedMask;
42683
42684 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42685 KnownLHS, TLO, Depth + 1))
42686 return true;
42687 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42688 KnownRHS, TLO, Depth + 1))
42689 return true;
42690
42691 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42692 KnownRHS = KnownRHS.trunc(32);
42693 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42694 KnownRHS.getConstant().isOne()) {
42695 SDLoc DL(Op);
42696 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42697 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42698 }
42699
42700 // Aggressively peek through ops to get at the demanded low bits.
42701 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42702 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42703 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42704 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42705 if (DemandedLHS || DemandedRHS) {
42706 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42707 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42708 return TLO.CombineTo(
42709 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42710 }
42711 break;
42712 }
42713 case X86ISD::VSHLI: {
42714 SDValue Op0 = Op.getOperand(0);
42715
42716 unsigned ShAmt = Op.getConstantOperandVal(1);
42717 if (ShAmt >= BitWidth)
42718 break;
42719
42720 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42721
42722 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42723 // single shift. We can do this if the bottom bits (which are shifted
42724 // out) are never demanded.
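       // For example, vshli(vsrli(X, 3), 5) where the low 5 bits aren't
       // demanded simplifies to vshli(X, 2); equal shift amounts cancel.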
42725 if (Op0.getOpcode() == X86ISD::VSRLI &&
42726 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
42727 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42728 if (Shift2Amt < BitWidth) {
42729 int Diff = ShAmt - Shift2Amt;
42730 if (Diff == 0)
42731 return TLO.CombineTo(Op, Op0.getOperand(0));
42732
42733 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42734 SDValue NewShift = TLO.DAG.getNode(
42735 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42736 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42737 return TLO.CombineTo(Op, NewShift);
42738 }
42739 }
42740
42741 // If we are only demanding sign bits then we can use the shift source directly.
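       // (e.g. if Op0 has 20 sign bits and ShAmt is 4, the top 16 bits of both
       // Op0 and the shifted value are copies of the sign bit, so Op0 can be
       // used when only those top bits are demanded.)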
42742 unsigned NumSignBits =
42743 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42744 unsigned UpperDemandedBits =
42745 BitWidth - OriginalDemandedBits.countTrailingZeros();
42746 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42747 return TLO.CombineTo(Op, Op0);
42748
42749 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42750 TLO, Depth + 1))
42751 return true;
42752
42753 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42754 Known.Zero <<= ShAmt;
42755 Known.One <<= ShAmt;
42756
42757 // Low bits known zero.
42758 Known.Zero.setLowBits(ShAmt);
42759 return false;
42760 }
42761 case X86ISD::VSRLI: {
42762 unsigned ShAmt = Op.getConstantOperandVal(1);
42763 if (ShAmt >= BitWidth)
42764 break;
42765
42766 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42767
42768 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42769 OriginalDemandedElts, Known, TLO, Depth + 1))
42770 return true;
42771
42772 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42773 Known.Zero.lshrInPlace(ShAmt);
42774 Known.One.lshrInPlace(ShAmt);
42775
42776 // High bits known zero.
42777 Known.Zero.setHighBits(ShAmt);
42778 return false;
42779 }
42780 case X86ISD::VSRAI: {
42781 SDValue Op0 = Op.getOperand(0);
42782 SDValue Op1 = Op.getOperand(1);
42783
42784 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
42785 if (ShAmt >= BitWidth)
42786 break;
42787
42788 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42789
42790 // If we just want the sign bit then we don't need to shift it.
42791 if (OriginalDemandedBits.isSignMask())
42792 return TLO.CombineTo(Op, Op0);
42793
42794 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42795 if (Op0.getOpcode() == X86ISD::VSHLI &&
42796 Op.getOperand(1) == Op0.getOperand(1)) {
42797 SDValue Op00 = Op0.getOperand(0);
42798 unsigned NumSignBits =
42799 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42800 if (ShAmt < NumSignBits)
42801 return TLO.CombineTo(Op, Op00);
42802 }
42803
42804 // If any of the demanded bits are produced by the sign extension, we also
42805 // demand the input sign bit.
42806 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
42807 DemandedMask.setSignBit();
42808
42809 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42810 TLO, Depth + 1))
42811 return true;
42812
42813 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42814 Known.Zero.lshrInPlace(ShAmt);
42815 Known.One.lshrInPlace(ShAmt);
42816
42817 // If the input sign bit is known to be zero, or if none of the top bits
42818 // are demanded, turn this into an unsigned shift right.
42819 if (Known.Zero[BitWidth - ShAmt - 1] ||
42820 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
42821 return TLO.CombineTo(
42822 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42823
42824 // High bits are known one.
42825 if (Known.One[BitWidth - ShAmt - 1])
42826 Known.One.setHighBits(ShAmt);
42827 return false;
42828 }
42829 case X86ISD::BLENDV: {
42830 SDValue Sel = Op.getOperand(0);
42831 SDValue LHS = Op.getOperand(1);
42832 SDValue RHS = Op.getOperand(2);
42833
42834 APInt SignMask = APInt::getSignMask(BitWidth);
42835 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42836 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42837 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42838 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42839 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42840 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42841
42842 if (NewSel || NewLHS || NewRHS) {
42843 NewSel = NewSel ? NewSel : Sel;
42844 NewLHS = NewLHS ? NewLHS : LHS;
42845 NewRHS = NewRHS ? NewRHS : RHS;
42846 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42847 NewSel, NewLHS, NewRHS));
42848 }
42849 break;
42850 }
42851 case X86ISD::PEXTRB:
42852 case X86ISD::PEXTRW: {
42853 SDValue Vec = Op.getOperand(0);
42854 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42855 MVT VecVT = Vec.getSimpleValueType();
42856 unsigned NumVecElts = VecVT.getVectorNumElements();
42857
42858 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42859 unsigned Idx = CIdx->getZExtValue();
42860 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42861
42862 // If we demand no bits from the vector then we must have demanded
42863 // bits from the implicit zext - simplify to zero.
42864 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42865 if (DemandedVecBits == 0)
42866 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42867
42868 APInt KnownUndef, KnownZero;
42869 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42870 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42871 KnownZero, TLO, Depth + 1))
42872 return true;
42873
42874 KnownBits KnownVec;
42875 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42876 KnownVec, TLO, Depth + 1))
42877 return true;
42878
42879 if (SDValue V = SimplifyMultipleUseDemandedBits(
42880 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42881 return TLO.CombineTo(
42882 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42883
42884 Known = KnownVec.zext(BitWidth);
42885 return false;
42886 }
42887 break;
42888 }
42889 case X86ISD::PINSRB:
42890 case X86ISD::PINSRW: {
42891 SDValue Vec = Op.getOperand(0);
42892 SDValue Scl = Op.getOperand(1);
42893 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42894 MVT VecVT = Vec.getSimpleValueType();
42895
42896 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42897 unsigned Idx = CIdx->getZExtValue();
42898 if (!OriginalDemandedElts[Idx])
42899 return TLO.CombineTo(Op, Vec);
42900
42901 KnownBits KnownVec;
42902 APInt DemandedVecElts(OriginalDemandedElts);
42903 DemandedVecElts.clearBit(Idx);
42904 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42905 KnownVec, TLO, Depth + 1))
42906 return true;
42907
42908 KnownBits KnownScl;
42909 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42910 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42911 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42912 return true;
42913
42914 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42915 Known = KnownBits::commonBits(KnownVec, KnownScl);
42916 return false;
42917 }
42918 break;
42919 }
42920 case X86ISD::PACKSS:
42921 // PACKSS saturates to MIN/MAX integer values. So if we only want the
42922 // sign bit then we can just ask for the source operands' sign bits.
42923 // TODO - add known bits handling.
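     // (e.g. for PACKSSWB the sign bit of each i8 result equals the sign bit
     // of its i16 source element, since signed saturation preserves the sign.)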
42924 if (OriginalDemandedBits.isSignMask()) {
42925 APInt DemandedLHS, DemandedRHS;
42926 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42927
42928 KnownBits KnownLHS, KnownRHS;
42929 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42930 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42931 KnownLHS, TLO, Depth + 1))
42932 return true;
42933 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42934 KnownRHS, TLO, Depth + 1))
42935 return true;
42936
42937 // Attempt to avoid multi-use ops if we don't need anything from them.
42938 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42939 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42940 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42941 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42942 if (DemandedOp0 || DemandedOp1) {
42943 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42944 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42945 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42946 }
42947 }
42948 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42949 break;
42950 case X86ISD::VBROADCAST: {
42951 SDValue Src = Op.getOperand(0);
42952 MVT SrcVT = Src.getSimpleValueType();
42953 APInt DemandedElts = APInt::getOneBitSet(
42954 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42955 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42956 TLO, Depth + 1))
42957 return true;
42958 // If we don't need the upper bits, attempt to narrow the broadcast source.
42959 // Don't attempt this on AVX512 as it might affect broadcast folding.
42960 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
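     // (e.g. a broadcast of an i64 whose top 32 bits aren't demanded becomes a
     // broadcast of the truncated i32 value, bitcast back to the original type.)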
42961 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42962 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
42963 Src->hasOneUse()) {
42964 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42965 SDValue NewSrc =
42966 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42967 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42968 SDValue NewBcst =
42969 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42970 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42971 }
42972 break;
42973 }
42974 case X86ISD::PCMPGT:
42975 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42976 // iff we only need the sign bit then we can use R directly.
42977 if (OriginalDemandedBits.isSignMask() &&
42978 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42979 return TLO.CombineTo(Op, Op.getOperand(1));
42980 break;
42981 case X86ISD::MOVMSK: {
42982 SDValue Src = Op.getOperand(0);
42983 MVT SrcVT = Src.getSimpleValueType();
42984 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42985 unsigned NumElts = SrcVT.getVectorNumElements();
42986
42987 // If we don't need the sign bits at all just return zero.
42988 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
42989 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42990
42991 // See if we only demand bits from the lower 128-bit vector.
42992 if (SrcVT.is256BitVector() &&
42993 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42994 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42995 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42996 }
42997
42998 // Only demand the vector elements of the sign bits we need.
42999 APInt KnownUndef, KnownZero;
43000 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43001 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43002 TLO, Depth + 1))
43003 return true;
43004
43005 Known.Zero = KnownZero.zext(BitWidth);
43006 Known.Zero.setHighBits(BitWidth - NumElts);
43007
43008 // MOVMSK only uses the MSB from each vector element.
43009 KnownBits KnownSrc;
43010 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43011 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43012 Depth + 1))
43013 return true;
43014
43015 if (KnownSrc.One[SrcBits - 1])
43016 Known.One.setLowBits(NumElts);
43017 else if (KnownSrc.Zero[SrcBits - 1])
43018 Known.Zero.setLowBits(NumElts);
43019
43020 // Attempt to avoid multi-use ops if we don't need anything from them.
43021 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43022 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43023 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43024 return false;
43025 }
43026 case X86ISD::BEXTR:
43027 case X86ISD::BEXTRI: {
43028 SDValue Op0 = Op.getOperand(0);
43029 SDValue Op1 = Op.getOperand(1);
43030
43031 // Only bottom 16-bits of the control bits are required.
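     // (The control word encodes the extract start position in bits [7:0] and
     // the extract length in bits [15:8], as decoded below.)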
43032 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43033 // NOTE: SimplifyDemandedBits won't do this for constants.
43034 uint64_t Val1 = Cst1->getZExtValue();
43035 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43036 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43037 SDLoc DL(Op);
43038 return TLO.CombineTo(
43039 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43040 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43041 }
43042
43043 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43044 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43045
43046 // If the length is 0, the result is 0.
43047 if (Length == 0) {
43048 Known.setAllZero();
43049 return false;
43050 }
43051
43052 if ((Shift + Length) <= BitWidth) {
43053 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43054 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43055 return true;
43056
43057 Known = Known.extractBits(Length, Shift);
43058 Known = Known.zextOrTrunc(BitWidth);
43059 return false;
43060 }
43061 } else {
43062 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43063 KnownBits Known1;
43064 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43065 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43066 return true;
43067
43068 // If the length is 0, replace with 0.
43069 KnownBits LengthBits = Known1.extractBits(8, 8);
43070 if (LengthBits.isZero())
43071 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43072 }
43073
43074 break;
43075 }
43076 case X86ISD::PDEP: {
43077 SDValue Op0 = Op.getOperand(0);
43078 SDValue Op1 = Op.getOperand(1);
43079
43080 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
43081 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43082
43083 // If the demanded bits have leading zeroes, we don't demand those from the
43084 // mask.
43085 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43086 return true;
43087
43088 // The number of possible 1s in the mask determines the number of LSBs of
43089 // operand 0 used. Undemanded bits from the mask don't matter so filter
43090 // them before counting.
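     // (e.g. a mask with at most 3 possible set bits means only the low 3 bits
     // of operand 0 can ever be deposited into the result.)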
43091 KnownBits Known2;
43092 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
43093 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43094 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43095 return true;
43096
43097 // Zeroes are retained from the mask, but not ones.
43098 Known.One.clearAllBits();
43099 // The result will have at least as many trailing zeros as the non-mask
43100 // operand since bits can only map to the same or higher bit position.
43101 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43102 return false;
43103 }
43104 }
43105
43106 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43107 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43108}
43109
43110SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43111 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43112 SelectionDAG &DAG, unsigned Depth) const {
43113 int NumElts = DemandedElts.getBitWidth();
43114 unsigned Opc = Op.getOpcode();
43115 EVT VT = Op.getValueType();
43116
43117 switch (Opc) {
43118 case X86ISD::PINSRB:
43119 case X86ISD::PINSRW: {
43120 // If we don't demand the inserted element, return the base vector.
43121 SDValue Vec = Op.getOperand(0);
43122 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43123 MVT VecVT = Vec.getSimpleValueType();
43124 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43125 !DemandedElts[CIdx->getZExtValue()])
43126 return Vec;
43127 break;
43128 }
43129 case X86ISD::VSHLI: {
43130 // If we are only demanding sign bits then we can use the shift source
43131 // directly.
43132 SDValue Op0 = Op.getOperand(0);
43133 unsigned ShAmt = Op.getConstantOperandVal(1);
43134 unsigned BitWidth = DemandedBits.getBitWidth();
43135 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43136 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
43137 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43138 return Op0;
43139 break;
43140 }
43141 case X86ISD::VSRAI:
43142 // iff we only need the sign bit then we can use the source directly.
43143 // TODO: generalize where we only demand extended signbits.
43144 if (DemandedBits.isSignMask())
43145 return Op.getOperand(0);
43146 break;
43147 case X86ISD::PCMPGT:
43148 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43149 // iff we only need the sign bit then we can use R directly.
43150 if (DemandedBits.isSignMask() &&
43151 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43152 return Op.getOperand(1);
43153 break;
43154 case X86ISD::ANDNP: {
43155 // ANDNP = (~LHS & RHS);
43156 SDValue LHS = Op.getOperand(0);
43157 SDValue RHS = Op.getOperand(1);
43158
43159 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43160 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43161
43162 // If every demanded bit is known 0 in either the LHS or the RHS, then the
43163 // (inverted) LHS bits cannot contribute to the result of the 'andn' in
43164 // this context, so return RHS.
43165 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43166 return RHS;
43167 break;
43168 }
43169 }
43170
43171 APInt ShuffleUndef, ShuffleZero;
43172 SmallVector<int, 16> ShuffleMask;
43173 SmallVector<SDValue, 2> ShuffleOps;
43174 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43175 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43176 // If all the demanded elts are from one operand and are inline,
43177 // then we can use the operand directly.
43178 int NumOps = ShuffleOps.size();
43179 if (ShuffleMask.size() == (unsigned)NumElts &&
43180 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43181 return VT.getSizeInBits() == V.getValueSizeInBits();
43182 })) {
43183
43184 if (DemandedElts.isSubsetOf(ShuffleUndef))
43185 return DAG.getUNDEF(VT);
43186 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43187 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43188
43189 // Bitmask that indicates which ops have only been accessed 'inline'.
43190 APInt IdentityOp = APInt::getAllOnes(NumOps);
43191 for (int i = 0; i != NumElts; ++i) {
43192 int M = ShuffleMask[i];
43193 if (!DemandedElts[i] || ShuffleUndef[i])
43194 continue;
43195 int OpIdx = M / NumElts;
43196 int EltIdx = M % NumElts;
43197 if (M < 0 || EltIdx != i) {
43198 IdentityOp.clearAllBits();
43199 break;
43200 }
43201 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43202 if (IdentityOp == 0)
43203 break;
43204 }
43205 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
43206 "Multiple identity shuffles detected");
43207
43208 if (IdentityOp != 0)
43209 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
43210 }
43211 }
43212
43213 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43214 Op, DemandedBits, DemandedElts, DAG, Depth);
43215}
43216
43217bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43218 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43219 bool PoisonOnly, unsigned Depth) const {
43220 unsigned EltsBits = Op.getScalarValueSizeInBits();
43221 unsigned NumElts = DemandedElts.getBitWidth();
43222
43223 // TODO: Add more target shuffles.
43224 switch (Op.getOpcode()) {
43225 case X86ISD::PSHUFD:
43226 case X86ISD::VPERMILPI: {
43227 SmallVector<int, 8> Mask;
43228 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43229
43230 APInt DemandedSrcElts = APInt::getZero(NumElts);
43231 for (unsigned I = 0; I != NumElts; ++I)
43232 if (DemandedElts[I])
43233 DemandedSrcElts.setBit(Mask[I]);
43234
43235 return DAG.isGuaranteedNotToBeUndefOrPoison(
43236 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
43237 }
43238 }
43239 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43240 Op, DemandedElts, DAG, PoisonOnly, Depth);
43241}
43242
43243bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43244 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43245 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43246
43247 // TODO: Add more target shuffles.
43248 switch (Op.getOpcode()) {
43249 case X86ISD::PSHUFD:
43250 case X86ISD::VPERMILPI:
43251 return false;
43252 }
43253 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43254 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43255}
43256
43257bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43258 const APInt &DemandedElts,
43259 APInt &UndefElts,
43260 const SelectionDAG &DAG,
43261 unsigned Depth) const {
43262 unsigned NumElts = DemandedElts.getBitWidth();
43263 unsigned Opc = Op.getOpcode();
43264
43265 switch (Opc) {
43266 case X86ISD::VBROADCAST:
43267 case X86ISD::VBROADCAST_LOAD:
43268 UndefElts = APInt::getNullValue(NumElts);
43269 return true;
43270 }
43271
43272 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
43273 DAG, Depth);
43274}
43275
43276// Helper to peek through bitops/trunc/setcc to determine size of source vector.
43277// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
43278static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
43279 bool AllowTruncate) {
43280 switch (Src.getOpcode()) {
43281 case ISD::TRUNCATE:
43282 if (!AllowTruncate)
43283 return false;
43284 [[fallthrough]];
43285 case ISD::SETCC:
43286 return Src.getOperand(0).getValueSizeInBits() == Size;
43287 case ISD::AND:
43288 case ISD::XOR:
43289 case ISD::OR:
43290 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
43291 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
43292 case ISD::VSELECT:
43293 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
43294 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
43295 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
43296 case ISD::BUILD_VECTOR:
43297 return ISD::isBuildVectorAllZeros(Src.getNode());
43298
43299 }
43300 return false;
43301}
43302
43303// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
43304static unsigned getAltBitOpcode(unsigned Opcode) {
43305 switch(Opcode) {
43306 case ISD::AND: return X86ISD::FAND;
43307 case ISD::OR: return X86ISD::FOR;
43308 case ISD::XOR: return X86ISD::FXOR;
43309 case X86ISD::ANDNP: return X86ISD::FANDN;
43310 }
43311 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43311)
;
43312}
43313
43314// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43315static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
43316 const SDLoc &DL) {
43317 EVT SrcVT = Src.getValueType();
43318 if (SrcVT != MVT::v4i1)
43319 return SDValue();
43320
43321 switch (Src.getOpcode()) {
43322 case ISD::SETCC:
43323 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
43324 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
43325 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43326 SDValue Op0 = Src.getOperand(0);
43327 if (ISD::isNormalLoad(Op0.getNode()))
43328 return DAG.getBitcast(MVT::v4f32, Op0);
43329 if (Op0.getOpcode() == ISD::BITCAST &&
43330 Op0.getOperand(0).getValueType() == MVT::v4f32)
43331 return Op0.getOperand(0);
43332 }
43333 break;
43334 case ISD::AND:
43335 case ISD::XOR:
43336 case ISD::OR: {
43337 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
43338 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
43339 if (Op0 && Op1)
43340 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
43341 Op1);
43342 break;
43343 }
43344 }
43345 return SDValue();
43346}
43347
43348// Helper to push sign extension of vXi1 SETCC result through bitops.
43349static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
43350 SDValue Src, const SDLoc &DL) {
43351 switch (Src.getOpcode()) {
43352 case ISD::SETCC:
43353 case ISD::TRUNCATE:
43354 case ISD::BUILD_VECTOR:
43355 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43356 case ISD::AND:
43357 case ISD::XOR:
43358 case ISD::OR:
43359 return DAG.getNode(
43360 Src.getOpcode(), DL, SExtVT,
43361 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
43362 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
43363 case ISD::VSELECT:
43364 return DAG.getSelect(
43365 DL, SExtVT, Src.getOperand(0),
43366 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
43367 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
43368 }
43369 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 43369)
;
43370}
43371
43372// Try to match patterns such as
43373// (i16 bitcast (v16i1 x))
43374// ->
43375// (i16 movmsk (16i8 sext (v16i1 x)))
43376// before the illegal vector is scalarized on subtargets that don't have legal
43377// vxi1 types.
43378static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
43379 const SDLoc &DL,
43380 const X86Subtarget &Subtarget) {
43381 EVT SrcVT = Src.getValueType();
43382 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
43383 return SDValue();
43384
43385 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
43386 // legalization destroys the v4i32 type.
43387 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
43388 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
43389 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
43390 DAG.getBitcast(MVT::v4f32, V));
43391 return DAG.getZExtOrTrunc(V, DL, VT);
43392 }
43393 }
43394
43395 // If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
43396 // movmskb even with avx512. This will be better than truncating to vXi1 and
43397 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
43398 // vpcmpeqb/vpcmpgtb.
43399 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
43400 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
43401 Src.getOperand(0).getValueType() == MVT::v32i8 ||
43402 Src.getOperand(0).getValueType() == MVT::v64i8);
43403
43404 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
43405 // directly with vpmovmskb/vmovmskps/vmovmskpd.
43406 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
43407 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43408 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
43409 EVT CmpVT = Src.getOperand(0).getValueType();
43410 EVT EltVT = CmpVT.getVectorElementType();
43411 if (CmpVT.getSizeInBits() <= 256 &&
43412 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
43413 PreferMovMsk = true;
43414 }
43415
43416 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43417 // MOVMSK is supported in SSE2 or later.
43418 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
43419 return SDValue();
43420
43421 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
43422 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
43423 // v8i16 and v16i16.
43424 // For these two cases, we can shuffle the upper element bytes to a
43425 // consecutive sequence at the start of the vector and treat the results as
43426 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43427 // for v16i16 this is not the case, because the shuffle is expensive, so we
43428 // avoid sign-extending to this type entirely.
43429 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43430 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43431 MVT SExtVT;
43432 bool PropagateSExt = false;
43433 switch (SrcVT.getSimpleVT().SimpleTy) {
43434 default:
43435 return SDValue();
43436 case MVT::v2i1:
43437 SExtVT = MVT::v2i64;
43438 break;
43439 case MVT::v4i1:
43440 SExtVT = MVT::v4i32;
43441 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43442 // sign-extend to a 256-bit operation to avoid truncation.
43443 if (Subtarget.hasAVX() &&
43444 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43445 SExtVT = MVT::v4i64;
43446 PropagateSExt = true;
43447 }
43448 break;
43449 case MVT::v8i1:
43450 SExtVT = MVT::v8i16;
43451 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43452 // sign-extend to a 256-bit operation to match the compare.
43453 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43454 // 256-bit because the shuffle is cheaper than sign extending the result of
43455 // the compare.
43456 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43457 checkBitcastSrcVectorSize(Src, 512, true))) {
43458 SExtVT = MVT::v8i32;
43459 PropagateSExt = true;
43460 }
43461 break;
43462 case MVT::v16i1:
43463 SExtVT = MVT::v16i8;
43464 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43465 // it is not profitable to sign-extend to 256-bit because this will
43466 // require an extra cross-lane shuffle which is more expensive than
43467 // truncating the result of the compare to 128-bits.
43468 break;
43469 case MVT::v32i1:
43470 SExtVT = MVT::v32i8;
43471 break;
43472 case MVT::v64i1:
43473    // If we have AVX512F but not AVX512BW, the input must be the truncate from
43474    // v64i8 checked earlier; split the input and emit two PMOVMSKBs.
43475 if (Subtarget.hasAVX512()) {
43476 if (Subtarget.hasBWI())
43477 return SDValue();
43478 SExtVT = MVT::v64i8;
43479 break;
43480 }
43481 // Split if this is a <64 x i8> comparison result.
43482 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43483 SExtVT = MVT::v64i8;
43484 break;
43485 }
43486 return SDValue();
43487 };
43488
43489 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43490 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43491
43492 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43493 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43494 } else {
43495 if (SExtVT == MVT::v8i16)
43496 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
43497 DAG.getUNDEF(MVT::v8i16));
43498 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43499 }
43500
43501 EVT IntVT =
43502 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43503 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43504 return DAG.getBitcast(VT, V);
43505}
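// Illustrative sketch of the combine above, assuming AVX and a 256-bit compare:
//   (i8 bitcast (v8i1 setcc (v8i32 a), (v8i32 b), setlt))
// is rewritten to roughly
//   (i8 trunc (i32 X86ISD::MOVMSK (v8i32 sign-extended compare)))
// which should select to a single vmovmskps-style instruction instead of
// scalarizing the v8i1 value.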
43506
43507// Convert a vXi1 constant build vector to the same width scalar integer.
43508static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43509 EVT SrcVT = Op.getValueType();
43510  assert(SrcVT.getVectorElementType() == MVT::i1 &&
43511         "Expected a vXi1 vector");
43512  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
43513         "Expected a constant build vector");
43514
43515 APInt Imm(SrcVT.getVectorNumElements(), 0);
43516 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43517 SDValue In = Op.getOperand(Idx);
43518 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
43519 Imm.setBit(Idx);
43520 }
43521 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43522 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43523}
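// Sketch: (v4i1 build_vector 1, 0, 1, undef) becomes the i4 constant 0b0101,
// with undef elements treated as 0.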
43524
43525static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43526 TargetLowering::DAGCombinerInfo &DCI,
43527 const X86Subtarget &Subtarget) {
43528  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43529
43530 if (!DCI.isBeforeLegalizeOps())
43531 return SDValue();
43532
43533 // Only do this if we have k-registers.
43534 if (!Subtarget.hasAVX512())
43535 return SDValue();
43536
43537 EVT DstVT = N->getValueType(0);
43538 SDValue Op = N->getOperand(0);
43539 EVT SrcVT = Op.getValueType();
43540
43541 if (!Op.hasOneUse())
43542 return SDValue();
43543
43544 // Look for logic ops.
43545 if (Op.getOpcode() != ISD::AND &&
43546 Op.getOpcode() != ISD::OR &&
43547 Op.getOpcode() != ISD::XOR)
43548 return SDValue();
43549
43550 // Make sure we have a bitcast between mask registers and a scalar type.
43551 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43552 DstVT.isScalarInteger()) &&
43553 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43554 SrcVT.isScalarInteger()))
43555 return SDValue();
43556
43557 SDValue LHS = Op.getOperand(0);
43558 SDValue RHS = Op.getOperand(1);
43559
43560 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43561 LHS.getOperand(0).getValueType() == DstVT)
43562 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43563 DAG.getBitcast(DstVT, RHS));
43564
43565 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43566 RHS.getOperand(0).getValueType() == DstVT)
43567 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43568 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43569
43570 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43571 // Most of these have to move a constant from the scalar domain anyway.
43572 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43573 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43574 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43575 DAG.getBitcast(DstVT, LHS), RHS);
43576 }
43577
43578 return SDValue();
43579}
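// Illustrative sketch of the combine above (AVX512 assumed):
//   (i16 bitcast (and (v16i1 bitcast (i16 X)), (v16i1 Y)))
// is rewritten to
//   (and (i16 X), (i16 bitcast (v16i1 Y)))
// so the logic op stays in the scalar domain and one GPR<->k-register
// crossing is removed.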
43580
43581static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43582 const X86Subtarget &Subtarget) {
43583 SDLoc DL(BV);
43584 unsigned NumElts = BV->getNumOperands();
43585 SDValue Splat = BV->getSplatValue();
43586
43587 // Build MMX element from integer GPR or SSE float values.
43588 auto CreateMMXElement = [&](SDValue V) {
43589 if (V.isUndef())
43590 return DAG.getUNDEF(MVT::x86mmx);
43591 if (V.getValueType().isFloatingPoint()) {
43592 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43593 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43594 V = DAG.getBitcast(MVT::v2i64, V);
43595 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43596 }
43597 V = DAG.getBitcast(MVT::i32, V);
43598 } else {
43599 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43600 }
43601 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43602 };
43603
43604 // Convert build vector ops to MMX data in the bottom elements.
43605 SmallVector<SDValue, 8> Ops;
43606
43607 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43608
43609 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43610 if (Splat) {
43611 if (Splat.isUndef())
43612 return DAG.getUNDEF(MVT::x86mmx);
43613
43614 Splat = CreateMMXElement(Splat);
43615
43616 if (Subtarget.hasSSE1()) {
43617 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43618 if (NumElts == 8)
43619 Splat = DAG.getNode(
43620 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43621 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43622 TLI.getPointerTy(DAG.getDataLayout())),
43623 Splat, Splat);
43624
43625 // Use PSHUFW to repeat 16-bit elements.
43626 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43627 return DAG.getNode(
43628 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43629 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43630 TLI.getPointerTy(DAG.getDataLayout())),
43631 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43632 }
43633 Ops.append(NumElts, Splat);
43634 } else {
43635 for (unsigned i = 0; i != NumElts; ++i)
43636 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43637 }
43638
43639 // Use tree of PUNPCKLs to build up general MMX vector.
43640 while (Ops.size() > 1) {
43641 unsigned NumOps = Ops.size();
43642 unsigned IntrinOp =
43643 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43644 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43645 : Intrinsic::x86_mmx_punpcklbw));
43646 SDValue Intrin = DAG.getTargetConstant(
43647 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43648 for (unsigned i = 0; i != NumOps; i += 2)
43649 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43650 Ops[i], Ops[i + 1]);
43651 Ops.resize(NumOps / 2);
43652 }
43653
43654 return Ops[0];
43655}
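// Illustrative sketch of the splat path above for a v4i16 build vector with
// SSE1 available:
//   mm = MMX_MOVW2D(anyext of the splat value to i32)
//   mm = pshufw mm, 0x00        ; broadcast the low 16-bit element
// Non-splat inputs instead go through the PUNPCKL tree at the end.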
43656
43657// Recursive function that attempts to find if a bool vector node was originally
43658// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43659// integer. If so, replace the scalar ops with bool vector equivalents back down
43660// the chain.
43661static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43662 SelectionDAG &DAG,
43663 const X86Subtarget &Subtarget) {
43664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43665 unsigned Opc = V.getOpcode();
43666 switch (Opc) {
43667 case ISD::BITCAST: {
43668 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43669 SDValue Src = V.getOperand(0);
43670 EVT SrcVT = Src.getValueType();
43671 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43672 return DAG.getBitcast(VT, Src);
43673 break;
43674 }
43675 case ISD::TRUNCATE: {
43676 // If we find a suitable source, a truncated scalar becomes a subvector.
43677 SDValue Src = V.getOperand(0);
43678 EVT NewSrcVT =
43679 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43680 if (TLI.isTypeLegal(NewSrcVT))
43681 if (SDValue N0 =
43682 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43683 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43684 DAG.getIntPtrConstant(0, DL));
43685 break;
43686 }
43687 case ISD::ANY_EXTEND:
43688 case ISD::ZERO_EXTEND: {
43689 // If we find a suitable source, an extended scalar becomes a subvector.
43690 SDValue Src = V.getOperand(0);
43691 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43692 Src.getScalarValueSizeInBits());
43693 if (TLI.isTypeLegal(NewSrcVT))
43694 if (SDValue N0 =
43695 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43696 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43697 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43698 : DAG.getConstant(0, DL, VT),
43699 N0, DAG.getIntPtrConstant(0, DL));
43700 break;
43701 }
43702 case ISD::OR: {
43703 // If we find suitable sources, we can just move an OR to the vector domain.
43704 SDValue Src0 = V.getOperand(0);
43705 SDValue Src1 = V.getOperand(1);
43706 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43707 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43708 return DAG.getNode(Opc, DL, VT, N0, N1);
43709 break;
43710 }
43711 case ISD::SHL: {
43712 // If we find a suitable source, a SHL becomes a KSHIFTL.
43713 SDValue Src0 = V.getOperand(0);
43714 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43715 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43716 break;
43717
43718 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43719 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43720 return DAG.getNode(
43721 X86ISD::KSHIFTL, DL, VT, N0,
43722 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43723 break;
43724 }
43725 }
43726 return SDValue();
43727}
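// Sketch: with a legal v16i1 type (AVX512),
//   (v16i1 bitcast (i16 or (i16 bitcast (v16i1 X)), (i16 bitcast (v16i1 Y))))
// can be rebuilt as (v16i1 or X, Y), keeping the value in a k-register rather
// than bouncing through a GPR.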
43728
43729static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43730 TargetLowering::DAGCombinerInfo &DCI,
43731 const X86Subtarget &Subtarget) {
43732 SDValue N0 = N->getOperand(0);
43733 EVT VT = N->getValueType(0);
43734 EVT SrcVT = N0.getValueType();
43735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43736
43737 // Try to match patterns such as
43738 // (i16 bitcast (v16i1 x))
43739 // ->
43740 // (i16 movmsk (16i8 sext (v16i1 x)))
43741 // before the setcc result is scalarized on subtargets that don't have legal
43742 // vxi1 types.
43743 if (DCI.isBeforeLegalize()) {
43744 SDLoc dl(N);
43745 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43746 return V;
43747
43748 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43749 // type, widen both sides to avoid a trip through memory.
43750 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43751 Subtarget.hasAVX512()) {
43752 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43753 N0 = DAG.getBitcast(MVT::v8i1, N0);
43754 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43755 DAG.getIntPtrConstant(0, dl));
43756 }
43757
43758 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43759 // type, widen both sides to avoid a trip through memory.
43760 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43761 Subtarget.hasAVX512()) {
43762 // Use zeros for the widening if we already have some zeroes. This can
43763 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43764 // stream of this.
43765 // FIXME: It might make sense to detect a concat_vectors with a mix of
43766 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43767 // a separate combine. What we can't do is canonicalize the operands of
43768 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43769 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43770 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43771 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43772 SrcVT = LastOp.getValueType();
43773 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43774 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43775 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43776 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43777 N0 = DAG.getBitcast(MVT::i8, N0);
43778 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43779 }
43780 }
43781
43782 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43783 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43784 Ops[0] = N0;
43785 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43786 N0 = DAG.getBitcast(MVT::i8, N0);
43787 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43788 }
43789 } else {
43790 // If we're bitcasting from iX to vXi1, see if the integer originally
43791 // began as a vXi1 and whether we can remove the bitcast entirely.
43792 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43793 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43794 if (SDValue V =
43795 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43796 return V;
43797 }
43798 }
43799
43800 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43801 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43802 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43803 // we can help with known bits propagation from the vXi1 domain to the
43804 // scalar domain.
43805 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43806 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43807 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43808 isNullConstant(N0.getOperand(1)))
43809 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43810 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43811
43812 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43813 // and the vbroadcast_load are both integer or both fp. In some cases this
43814 // will remove the bitcast entirely.
43815 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43816 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43817 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43818 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43819 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43820    // Don't swap i8/i16 since we don't have fp types of that size.
43821 if (MemSize >= 32) {
43822 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43823 : MVT::getIntegerVT(MemSize);
43824 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43825 : MVT::getIntegerVT(SrcVTSize);
43826 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43827
43828 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43829 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43830 SDValue ResNode =
43831 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43832 MemVT, BCast->getMemOperand());
43833 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43834 return DAG.getBitcast(VT, ResNode);
43835 }
43836 }
43837
43838 // Since MMX types are special and don't usually play with other vector types,
43839 // it's better to handle them early to be sure we emit efficient code by
43840 // avoiding store-load conversions.
43841 if (VT == MVT::x86mmx) {
43842 // Detect MMX constant vectors.
43843 APInt UndefElts;
43844 SmallVector<APInt, 1> EltBits;
43845 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
43846 SDLoc DL(N0);
43847 // Handle zero-extension of i32 with MOVD.
43848 if (EltBits[0].countLeadingZeros() >= 32)
43849 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43850 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43851 // Else, bitcast to a double.
43852 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43853 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43854 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43855 }
43856
43857 // Detect bitcasts to x86mmx low word.
43858 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43859 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43860 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43861 bool LowUndef = true, AllUndefOrZero = true;
43862 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43863 SDValue Op = N0.getOperand(i);
43864 LowUndef &= Op.isUndef() || (i >= e/2);
43865 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
43866 }
43867 if (AllUndefOrZero) {
43868 SDValue N00 = N0.getOperand(0);
43869 SDLoc dl(N00);
43870 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43871 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43872 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43873 }
43874 }
43875
43876 // Detect bitcasts of 64-bit build vectors and convert to a
43877 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43878 // lowest element.
43879 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43880 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43881 SrcVT == MVT::v8i8))
43882 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43883
43884 // Detect bitcasts between element or subvector extraction to x86mmx.
43885 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43886 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43887 isNullConstant(N0.getOperand(1))) {
43888 SDValue N00 = N0.getOperand(0);
43889 if (N00.getValueType().is128BitVector())
43890 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43891 DAG.getBitcast(MVT::v2i64, N00));
43892 }
43893
43894 // Detect bitcasts from FP_TO_SINT to x86mmx.
43895 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43896 SDLoc DL(N0);
43897 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43898 DAG.getUNDEF(MVT::v2i32));
43899 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43900 DAG.getBitcast(MVT::v2i64, Res));
43901 }
43902 }
43903
43904 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43905 // most of these to scalar anyway.
43906 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43907 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43908 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43909 return combinevXi1ConstantToInteger(N0, DAG);
43910 }
43911
43912 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43913 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43914 isa<ConstantSDNode>(N0)) {
43915 auto *C = cast<ConstantSDNode>(N0);
43916 if (C->isAllOnes())
43917 return DAG.getConstant(1, SDLoc(N0), VT);
43918 if (C->isZero())
43919 return DAG.getConstant(0, SDLoc(N0), VT);
43920 }
43921
43922 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43923 // Turn it into a sign bit compare that produces a k-register. This avoids
43924 // a trip through a GPR.
43925 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43926 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43927 isPowerOf2_32(VT.getVectorNumElements())) {
43928 unsigned NumElts = VT.getVectorNumElements();
43929 SDValue Src = N0;
43930
43931 // Peek through truncate.
43932 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43933 Src = N0.getOperand(0);
43934
43935 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43936 SDValue MovmskIn = Src.getOperand(0);
43937 MVT MovmskVT = MovmskIn.getSimpleValueType();
43938 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43939
43940 // We allow extra bits of the movmsk to be used since they are known zero.
43941 // We can't convert a VPMOVMSKB without avx512bw.
43942 if (MovMskElts <= NumElts &&
43943 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43944 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43945 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43946 SDLoc dl(N);
43947 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43948 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
43949 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
43950 if (EVT(CmpVT) == VT)
43951 return Cmp;
43952
43953 // Pad with zeroes up to original VT to replace the zeroes that were
43954 // being used from the MOVMSK.
43955 unsigned NumConcats = NumElts / MovMskElts;
43956 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
43957 Ops[0] = Cmp;
43958 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
43959 }
43960 }
43961 }
43962
43963 // Try to remove bitcasts from input and output of mask arithmetic to
43964 // remove GPR<->K-register crossings.
43965 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
43966 return V;
43967
43968 // Convert a bitcasted integer logic operation that has one bitcasted
43969 // floating-point operand into a floating-point logic operation. This may
43970 // create a load of a constant, but that is cheaper than materializing the
43971 // constant in an integer register and transferring it to an SSE register or
43972 // transferring the SSE operand to integer register and back.
43973 unsigned FPOpcode;
43974 switch (N0.getOpcode()) {
43975 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43976 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43977 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43978 default: return SDValue();
43979 }
43980
43981 // Check if we have a bitcast from another integer type as well.
43982 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43983 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43984 (Subtarget.hasFP16() && VT == MVT::f16) ||
43985 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
43986 TLI.isTypeLegal(VT))))
43987 return SDValue();
43988
43989 SDValue LogicOp0 = N0.getOperand(0);
43990 SDValue LogicOp1 = N0.getOperand(1);
43991 SDLoc DL0(N0);
43992
43993 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
43994 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
43995 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
43996 LogicOp0.getOperand(0).getValueType() == VT &&
43997 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
43998 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
43999 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44000 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44001 }
44002 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44003 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44004 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44005 LogicOp1.getOperand(0).getValueType() == VT &&
44006 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44007 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44008 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44009 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44010 }
44011
44012 return SDValue();
44013}
44014
44015// (mul (zext a), (sext b))
44016static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44017 SDValue &Op1) {
44018 Op0 = Mul.getOperand(0);
44019 Op1 = Mul.getOperand(1);
44020
44021  // Canonicalize so that Op1 is the sign-extended operand.
44022 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44023 std::swap(Op0, Op1);
44024
44025 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44026 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44027 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44028 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44029 return true;
44030
44031 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44032 return (BV && BV->isConstant());
44033 };
44034
44035  // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
44036  // so check that Op0 is a zero-extended value; Op1 must be a signed value,
44037  // so it is enough to bound its number of significant (sign) bits.
44038 if ((IsFreeTruncation(Op0) &&
44039 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44040 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44041 return true;
44042
44043 return false;
44044}
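// Sketch: this matches e.g. (mul (zext v16i8 a to v16i32), (sext v16i8 b to
// v16i32)), returning the zero-extended (unsigned) multiplicand in Op0 and the
// sign-extended one in Op1, which is the operand order VPDPBUSD expects.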
44045
44046// Given a ABS node, detect the following pattern:
44047// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44048// This is useful as it is the input into a SAD pattern.
44049static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44050 SDValue AbsOp1 = Abs->getOperand(0);
44051 if (AbsOp1.getOpcode() != ISD::SUB)
44052 return false;
44053
44054 Op0 = AbsOp1.getOperand(0);
44055 Op1 = AbsOp1.getOperand(1);
44056
44057 // Check if the operands of the sub are zero-extended from vectors of i8.
44058 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44059 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44060 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44061 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44062 return false;
44063
44064 return true;
44065}
44066
44067static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44068 unsigned &LogBias, const SDLoc &DL,
44069 const X86Subtarget &Subtarget) {
44070 // Extend or truncate to MVT::i8 first.
44071 MVT Vi8VT =
44072 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44073 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44074 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44075
44076  // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element i:
44077  //   C[i] += A[4i]*B[4i] + A[4i+1]*B[4i+1] + A[4i+2]*B[4i+2] + A[4i+3]*B[4i+3].
44078  // The src A, B element type is i8, but the dst C element type is i32.
44079  // When we build the reduction stages we use the vXi8 source vector type,
44080  // so we need a log-bias of 2 to avoid two extra shuffle+add stages.
44081 LogBias = 2;
44082
44083 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44084 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44085 RegSize = std::max(512u, RegSize);
44086
44087 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44088 // fill in the missing vector elements with 0.
44089 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44090 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44091 Ops[0] = LHS;
44092 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44093 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44094 Ops[0] = RHS;
44095 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44096
44097 // Actually build the DotProduct, split as 256/512 bits for
44098 // AVXVNNI/AVX512VNNI.
44099 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44100 ArrayRef<SDValue> Ops) {
44101 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44102 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44103 };
44104 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44105 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44106
44107 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44108 DpBuilder, false);
44109}
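// Illustrative sketch, assuming AVX512VNNI without VLX and v16i8 inputs: the
// operands are widened to v64i8 by concatenating zero vectors and the node
// built is roughly
//   (v16i32 X86ISD::VPDPBUSD (v16i32 zero), (v64i8 lhs), (v64i8 rhs))
// Each i32 lane accumulates four i8*i8 products, which is where LogBias = 2
// comes from: the caller's reduction tree needs two fewer shuffle+add stages.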
44110
44111// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44112// to these zexts.
44113static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44114 const SDValue &Zext1, const SDLoc &DL,
44115 const X86Subtarget &Subtarget) {
44116 // Find the appropriate width for the PSADBW.
44117 EVT InVT = Zext0.getOperand(0).getValueType();
44118 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44119
44120 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44121 // fill in the missing vector elements with 0.
44122 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44123 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44124 Ops[0] = Zext0.getOperand(0);
44125 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44126 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44127 Ops[0] = Zext1.getOperand(0);
44128 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44129
44130 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44131 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44132 ArrayRef<SDValue> Ops) {
44133 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44134 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44135 };
44136 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44137 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44138 PSADBWBuilder);
44139}
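// Sketch: for two (v16i32 zext (v16i8 x)) inputs this builds
//   (v2i64 X86ISD::PSADBW (v16i8 a), (v16i8 b))
// where each i64 lane holds the sum of absolute differences of one group of
// eight byte pairs.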
44140
44141// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44142// PHMINPOSUW.
44143static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44144 const X86Subtarget &Subtarget) {
44145 // Bail without SSE41.
44146 if (!Subtarget.hasSSE41())
44147 return SDValue();
44148
44149 EVT ExtractVT = Extract->getValueType(0);
44150 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44151 return SDValue();
44152
44153 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44154 ISD::NodeType BinOp;
44155 SDValue Src = DAG.matchBinOpReduction(
44156 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44157 if (!Src)
44158 return SDValue();
44159
44160 EVT SrcVT = Src.getValueType();
44161 EVT SrcSVT = SrcVT.getScalarType();
44162 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44163 return SDValue();
44164
44165 SDLoc DL(Extract);
44166 SDValue MinPos = Src;
44167
44168 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44169 while (SrcVT.getSizeInBits() > 128) {
44170 SDValue Lo, Hi;
44171 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44172 SrcVT = Lo.getValueType();
44173 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44174 }
44175  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44176          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44177         "Unexpected value type");
44178
44179 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
44180 // to flip the value accordingly.
44181 SDValue Mask;
44182 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44183 if (BinOp == ISD::SMAX)
44184 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44185 else if (BinOp == ISD::SMIN)
44186 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44187 else if (BinOp == ISD::UMAX)
44188 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44189
44190 if (Mask)
44191 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44192
44193 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44194 // shuffling each upper element down and insert zeros. This means that the
44195 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
44196 // ready for the PHMINPOS.
44197 if (ExtractVT == MVT::i8) {
44198 SDValue Upper = DAG.getVectorShuffle(
44199 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44200 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44201 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44202 }
44203
44204  // Perform the PHMINPOS on a v8i16 vector.
44205 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44206 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44207 MinPos = DAG.getBitcast(SrcVT, MinPos);
44208
44209 if (Mask)
44210 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44211
44212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44213 DAG.getIntPtrConstant(0, DL));
44214}
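// Illustrative sketch of the smax case above for a v8i16 reduction:
//   t0 = xor x, <8 x i16 0x7fff>      ; maps signed-max onto unsigned-min
//   t1 = phminposuw t0
//   t2 = xor t1, <8 x i16 0x7fff>
//   result = extract_vector_elt t2, 0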
44215
44216// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44217static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44218 const X86Subtarget &Subtarget) {
44219 // Bail without SSE2.
44220 if (!Subtarget.hasSSE2())
44221 return SDValue();
44222
44223 EVT ExtractVT = Extract->getValueType(0);
44224 unsigned BitWidth = ExtractVT.getSizeInBits();
44225 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44226 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44227 return SDValue();
44228
44229 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44230 ISD::NodeType BinOp;
44231 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44232 if (!Match && ExtractVT == MVT::i1)
44233 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44234 if (!Match)
44235 return SDValue();
44236
44237 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44238 // which we can't support here for now.
44239 if (Match.getScalarValueSizeInBits() != BitWidth)
44240 return SDValue();
44241
44242 SDValue Movmsk;
44243 SDLoc DL(Extract);
44244 EVT MatchVT = Match.getValueType();
44245 unsigned NumElts = MatchVT.getVectorNumElements();
44246 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
44247 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44248
44249 if (ExtractVT == MVT::i1) {
44250 // Special case for (pre-legalization) vXi1 reductions.
44251 if (NumElts > 64 || !isPowerOf2_32(NumElts))
44252 return SDValue();
44253 if (TLI.isTypeLegal(MatchVT)) {
44254 // If this is a legal AVX512 predicate type then we can just bitcast.
44255 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44256 Movmsk = DAG.getBitcast(MovmskVT, Match);
44257 } else {
44258 // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
44259 if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
44260 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
44261 ISD::CondCode::SETEQ) {
44262 EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
44263 if (VecSVT != MVT::i8 && (VecSVT.getSizeInBits() % 8) == 0) {
44264 NumElts *= VecSVT.getSizeInBits() / 8;
44265 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
44266 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44267 Match = DAG.getSetCC(
44268 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
44269 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
44270 }
44271 }
44272
44273 // Use combineBitcastvxi1 to create the MOVMSK.
44274 while (NumElts > MaxElts) {
44275 SDValue Lo, Hi;
44276 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44277 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44278 NumElts /= 2;
44279 }
44280 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44281 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
44282 }
44283 if (!Movmsk)
44284 return SDValue();
44285 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
44286 } else {
44287 // FIXME: Better handling of k-registers or 512-bit vectors?
44288 unsigned MatchSizeInBits = Match.getValueSizeInBits();
44289 if (!(MatchSizeInBits == 128 ||
44290 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
44291 return SDValue();
44292
44293 // Make sure this isn't a vector of 1 element. The perf win from using
44294    // MOVMSK diminishes with fewer elements in the reduction, but it is
44295 // generally better to get the comparison over to the GPRs as soon as
44296 // possible to reduce the number of vector ops.
44297 if (Match.getValueType().getVectorNumElements() < 2)
44298 return SDValue();
44299
44300 // Check that we are extracting a reduction of all sign bits.
44301 if (DAG.ComputeNumSignBits(Match) != BitWidth)
44302 return SDValue();
44303
44304 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
44305 SDValue Lo, Hi;
44306 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44307 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44308 MatchSizeInBits = Match.getValueSizeInBits();
44309 }
44310
44311 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
44312 MVT MaskSrcVT;
44313 if (64 == BitWidth || 32 == BitWidth)
44314 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
44315 MatchSizeInBits / BitWidth);
44316 else
44317 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
44318
44319 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
44320 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
44321 NumElts = MaskSrcVT.getVectorNumElements();
44322 }
44323  assert((NumElts <= 32 || NumElts == 64) &&
44324         "Not expecting more than 64 elements");
44325
44326 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
44327 if (BinOp == ISD::XOR) {
44328 // parity -> (PARITY(MOVMSK X))
44329 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
44330 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
44331 }
44332
44333 SDValue CmpC;
44334 ISD::CondCode CondCode;
44335 if (BinOp == ISD::OR) {
44336 // any_of -> MOVMSK != 0
44337 CmpC = DAG.getConstant(0, DL, CmpVT);
44338 CondCode = ISD::CondCode::SETNE;
44339 } else {
44340 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44341 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
44342 DL, CmpVT);
44343 CondCode = ISD::CondCode::SETEQ;
44344 }
44345
44346 // The setcc produces an i8 of 0/1, so extend that to the result width and
44347 // negate to get the final 0/-1 mask value.
44348 EVT SetccVT =
44349 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
44350 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
44351 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
44352 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
44353 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
44354}
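// Illustrative sketch of the non-i1 path above: an AND (all_of) reduction over
// a v4i32 compare mask (each lane 0 or -1) becomes roughly
//   t0 = movmskps mask                ; one bit per lane
//   t1 = (t0 == 0xf) ? 1 : 0
//   result = 0 - (i32 zext t1)        ; 0 or -1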
44355
44356static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
44357 const X86Subtarget &Subtarget) {
44358 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
44359 return SDValue();
44360
44361 EVT ExtractVT = Extract->getValueType(0);
44362 // Verify the type we're extracting is i32, as the output element type of
44363 // vpdpbusd is i32.
44364 if (ExtractVT != MVT::i32)
44365 return SDValue();
44366
44367 EVT VT = Extract->getOperand(0).getValueType();
44368 if (!isPowerOf2_32(VT.getVectorNumElements()))
44369 return SDValue();
44370
44371 // Match shuffle + add pyramid.
44372 ISD::NodeType BinOp;
44373 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44374
44375 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
44376 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
44377 // before adding into the accumulator.
44378 // TODO:
44379 // We also need to verify that the multiply has at least 2x the number of bits
44380 // of the input. We shouldn't match
44381 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
44382 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
44383 // Root = Root.getOperand(0);
44384
44385 // If there was a match, we want Root to be a mul.
44386 if (!Root || Root.getOpcode() != ISD::MUL)
44387 return SDValue();
44388
44389 // Check whether we have an extend and mul pattern
44390 SDValue LHS, RHS;
44391 if (!detectExtMul(DAG, Root, LHS, RHS))
44392 return SDValue();
44393
44394 // Create the dot product instruction.
44395 SDLoc DL(Extract);
44396 unsigned StageBias;
44397 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
44398
44399 // If the original vector was wider than 4 elements, sum over the results
44400 // in the DP vector.
44401 unsigned Stages = Log2_32(VT.getVectorNumElements());
44402 EVT DpVT = DP.getValueType();
44403
44404 if (Stages > StageBias) {
44405 unsigned DpElems = DpVT.getVectorNumElements();
44406
44407 for (unsigned i = Stages - StageBias; i > 0; --i) {
44408 SmallVector<int, 16> Mask(DpElems, -1);
44409 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44410 Mask[j] = MaskEnd + j;
44411
44412 SDValue Shuffle =
44413 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
44414 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
44415 }
44416 }
44417
44418 // Return the lowest ExtractSizeInBits bits.
44419 EVT ResVT =
44420 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44421 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44422 DP = DAG.getBitcast(ResVT, DP);
44423 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44424 Extract->getOperand(1));
44425}
44426
44427static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44428 const X86Subtarget &Subtarget) {
44429 // PSADBW is only supported on SSE2 and up.
44430 if (!Subtarget.hasSSE2())
44431 return SDValue();
44432
44433 EVT ExtractVT = Extract->getValueType(0);
44434 // Verify the type we're extracting is either i32 or i64.
44435 // FIXME: Could support other types, but this is what we have coverage for.
44436 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44437 return SDValue();
44438
44439 EVT VT = Extract->getOperand(0).getValueType();
44440 if (!isPowerOf2_32(VT.getVectorNumElements()))
44441 return SDValue();
44442
44443 // Match shuffle + add pyramid.
44444 ISD::NodeType BinOp;
44445 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44446
44447 // The operand is expected to be zero extended from i8
44448 // (verified in detectZextAbsDiff).
44449 // In order to convert to i64 and above, additional any/zero/sign
44450 // extend is expected.
44451 // The zero extend from 32 bit has no mathematical effect on the result.
44452 // Also the sign extend is basically zero extend
44453 // (extends the sign bit which is zero).
44454 // So it is correct to skip the sign/zero extend instruction.
44455 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44456 Root.getOpcode() == ISD::ZERO_EXTEND ||
44457 Root.getOpcode() == ISD::ANY_EXTEND))
44458 Root = Root.getOperand(0);
44459
44460 // If there was a match, we want Root to be a select that is the root of an
44461 // abs-diff pattern.
44462 if (!Root || Root.getOpcode() != ISD::ABS)
44463 return SDValue();
44464
44465 // Check whether we have an abs-diff pattern feeding into the select.
44466 SDValue Zext0, Zext1;
44467 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44468 return SDValue();
44469
44470 // Create the SAD instruction.
44471 SDLoc DL(Extract);
44472 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44473
44474 // If the original vector was wider than 8 elements, sum over the results
44475 // in the SAD vector.
44476 unsigned Stages = Log2_32(VT.getVectorNumElements());
44477 EVT SadVT = SAD.getValueType();
44478 if (Stages > 3) {
44479 unsigned SadElems = SadVT.getVectorNumElements();
44480
44481 for(unsigned i = Stages - 3; i > 0; --i) {
44482 SmallVector<int, 16> Mask(SadElems, -1);
44483 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44484 Mask[j] = MaskEnd + j;
44485
44486 SDValue Shuffle =
44487 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44488 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44489 }
44490 }
44491
44492 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44493 // Return the lowest ExtractSizeInBits bits.
44494 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44495 SadVT.getSizeInBits() / ExtractSizeInBits);
44496 SAD = DAG.getBitcast(ResVT, SAD);
44497 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44498 Extract->getOperand(1));
44499}
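// Sketch: for a v16i32 add-reduction of abs(zext(a[i]) - zext(b[i])) with
// <16 x i8> a and b, the code above emits one PSADBW (two partial sums in
// v2i64), one shuffle+add stage to combine them, and an extract of lane 0.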
44500
44501// Attempt to peek through a target shuffle and extract the scalar from the
44502// source.
44503static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44504 TargetLowering::DAGCombinerInfo &DCI,
44505 const X86Subtarget &Subtarget) {
44506 if (DCI.isBeforeLegalizeOps())
44507 return SDValue();
44508
44509 SDLoc dl(N);
44510 SDValue Src = N->getOperand(0);
44511 SDValue Idx = N->getOperand(1);
44512
44513 EVT VT = N->getValueType(0);
44514 EVT SrcVT = Src.getValueType();
44515 EVT SrcSVT = SrcVT.getVectorElementType();
44516 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44517 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44518
44519 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44520 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44521 return SDValue();
44522
44523 const APInt &IdxC = N->getConstantOperandAPInt(1);
44524 if (IdxC.uge(NumSrcElts))
44525 return SDValue();
44526
44527 SDValue SrcBC = peekThroughBitcasts(Src);
44528
44529 // Handle extract(bitcast(broadcast(scalar_value))).
44530 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44531 SDValue SrcOp = SrcBC.getOperand(0);
44532 EVT SrcOpVT = SrcOp.getValueType();
44533 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44534 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44535 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44536 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44537 // TODO support non-zero offsets.
44538 if (Offset == 0) {
44539 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44540 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44541 return SrcOp;
44542 }
44543 }
44544 }
44545
44546 // If we're extracting a single element from a broadcast load and there are
44547 // no other users, just create a single load.
44548 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44549 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44550 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44551 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44552 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44553 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44554 MemIntr->getBasePtr(),
44555 MemIntr->getPointerInfo(),
44556 MemIntr->getOriginalAlign(),
44557 MemIntr->getMemOperand()->getFlags());
44558 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44559 return Load;
44560 }
44561 }
44562
44563 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44564 // TODO: Move to DAGCombine?
44565 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44566 SrcBC.getValueType().isInteger() &&
44567 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44568 SrcBC.getScalarValueSizeInBits() ==
44569 SrcBC.getOperand(0).getValueSizeInBits()) {
44570 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44571 if (IdxC.ult(Scale)) {
44572 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44573 SDValue Scl = SrcBC.getOperand(0);
44574 EVT SclVT = Scl.getValueType();
44575 if (Offset) {
44576 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44577 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44578 }
44579 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44580 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44581 return Scl;
44582 }
44583 }
44584
44585 // Handle extract(truncate(x)) for 0'th index.
44586 // TODO: Treat this as a faux shuffle?
44587 // TODO: When can we use this for general indices?
44588 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44589 (SrcVT.getSizeInBits() % 128) == 0) {
44590 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44591 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44592 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44593 Idx);
44594 }
44595
44596 // We can only legally extract other elements from 128-bit vectors and in
44597 // certain circumstances, depending on SSE-level.
44598 // TODO: Investigate float/double extraction if it will be just stored.
44599 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44600 unsigned Idx) {
44601 EVT VecSVT = VecVT.getScalarType();
44602 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44603 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44604 VecSVT == MVT::i64)) {
44605 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44606 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44607 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44608 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44609 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44610 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44611 Idx &= (NumEltsPerLane - 1);
44612 }
44613 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44614 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44615 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44616 DAG.getBitcast(VecVT, Vec),
44617 DAG.getIntPtrConstant(Idx, dl));
44618 }
44619 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44620 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44621 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44622 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44623 DAG.getTargetConstant(Idx, dl, MVT::i8));
44624 }
44625 return SDValue();
44626 };
44627
44628 // Resolve the target shuffle inputs and mask.
44629 SmallVector<int, 16> Mask;
44630 SmallVector<SDValue, 2> Ops;
44631 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44632 return SDValue();
44633
44634 // Shuffle inputs must be the same size as the result.
44635 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44636 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44637 }))
44638 return SDValue();
44639
44640 // Attempt to narrow/widen the shuffle mask to the correct size.
44641 if (Mask.size() != NumSrcElts) {
44642 if ((NumSrcElts % Mask.size()) == 0) {
44643 SmallVector<int, 16> ScaledMask;
44644 int Scale = NumSrcElts / Mask.size();
44645 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44646 Mask = std::move(ScaledMask);
44647 } else if ((Mask.size() % NumSrcElts) == 0) {
44648 // Simplify Mask based on demanded element.
44649 int ExtractIdx = (int)IdxC.getZExtValue();
44650 int Scale = Mask.size() / NumSrcElts;
44651 int Lo = Scale * ExtractIdx;
44652 int Hi = Scale * (ExtractIdx + 1);
44653 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44654 if (i < Lo || Hi <= i)
44655 Mask[i] = SM_SentinelUndef;
44656
44657 SmallVector<int, 16> WidenedMask;
44658 while (Mask.size() > NumSrcElts &&
44659 canWidenShuffleElements(Mask, WidenedMask))
44660 Mask = std::move(WidenedMask);
44661 }
44662 }
44663
44664 // If narrowing/widening failed, see if we can extract+zero-extend.
44665 int ExtractIdx;
44666 EVT ExtractVT;
44667 if (Mask.size() == NumSrcElts) {
44668 ExtractIdx = Mask[IdxC.getZExtValue()];
44669 ExtractVT = SrcVT;
44670 } else {
44671 unsigned Scale = Mask.size() / NumSrcElts;
44672 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44673 return SDValue();
44674 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44675 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44676 return SDValue();
44677 ExtractIdx = Mask[ScaledIdx];
44678 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44679 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44680    assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44681           "Failed to widen vector type");
44682 }
44683
44684 // If the shuffle source element is undef/zero then we can just accept it.
44685 if (ExtractIdx == SM_SentinelUndef)
44686 return DAG.getUNDEF(VT);
44687
44688 if (ExtractIdx == SM_SentinelZero)
44689 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44690 : DAG.getConstant(0, dl, VT);
44691
44692 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44693 ExtractIdx = ExtractIdx % Mask.size();
44694 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44695 return DAG.getZExtOrTrunc(V, dl, VT);
44696
44697 return SDValue();
44698}
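// Sketch, assuming SSE4.1:
//   (i32 extract_vector_elt (v4i32 X86ISD::PSHUFD X, 0x1b), 0)
// resolves through the shuffle mask to
//   (i32 extract_vector_elt X, 3)
// i.e. a single pextrd from the original source.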
44699
44700/// Extracting a scalar FP value from vector element 0 is free, so extract each
44701/// operand first, then perform the math as a scalar op.
44702static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44703 const X86Subtarget &Subtarget) {
44704  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44705 SDValue Vec = ExtElt->getOperand(0);
44706 SDValue Index = ExtElt->getOperand(1);
44707 EVT VT = ExtElt->getValueType(0);
44708 EVT VecVT = Vec.getValueType();
44709
44710 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44711 // non-zero element because the shuffle+scalar op will be cheaper?
44712 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44713 return SDValue();
44714
44715 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44716 // extract, the condition code), so deal with those as a special-case.
44717 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44718 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44719 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44720 return SDValue();
44721
44722 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44723 SDLoc DL(ExtElt);
44724 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44725 Vec.getOperand(0), Index);
44726 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44727 Vec.getOperand(1), Index);
44728 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44729 }
44730
44731 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44732 VT != MVT::f64)
44733 return SDValue();
44734
44735 // Vector FP selects don't fit the pattern of FP math ops (because the
44736 // condition has a different type and we have to change the opcode), so deal
44737 // with those here.
44738 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44739 // has i1 elements. If we loosen this we need to convert vector bool to a
44740 // scalar bool.
44741 if (Vec.getOpcode() == ISD::VSELECT &&
44742 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44743 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44744 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44745 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44746 SDLoc DL(ExtElt);
44747 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44748 Vec.getOperand(0).getValueType().getScalarType(),
44749 Vec.getOperand(0), Index);
44750 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44751 Vec.getOperand(1), Index);
44752 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44753 Vec.getOperand(2), Index);
44754 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44755 }
44756
44757 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44758 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44759 // missed load folding and fma+fneg combining.
44760 switch (Vec.getOpcode()) {
44761 case ISD::FMA: // Begin 3 operands
44762 case ISD::FMAD:
44763 case ISD::FADD: // Begin 2 operands
44764 case ISD::FSUB:
44765 case ISD::FMUL:
44766 case ISD::FDIV:
44767 case ISD::FREM:
44768 case ISD::FCOPYSIGN:
44769 case ISD::FMINNUM:
44770 case ISD::FMAXNUM:
44771 case ISD::FMINNUM_IEEE:
44772 case ISD::FMAXNUM_IEEE:
44773 case ISD::FMAXIMUM:
44774 case ISD::FMINIMUM:
44775 case X86ISD::FMAX:
44776 case X86ISD::FMIN:
44777 case ISD::FABS: // Begin 1 operand
44778 case ISD::FSQRT:
44779 case ISD::FRINT:
44780 case ISD::FCEIL:
44781 case ISD::FTRUNC:
44782 case ISD::FNEARBYINT:
44783 case ISD::FROUND:
44784 case ISD::FFLOOR:
44785 case X86ISD::FRCP:
44786 case X86ISD::FRSQRT: {
44787 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44788 SDLoc DL(ExtElt);
44789 SmallVector<SDValue, 4> ExtOps;
44790 for (SDValue Op : Vec->ops())
44791 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44792 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44793 }
44794 default:
44795 return SDValue();
44796 }
44797  llvm_unreachable("All opcodes should return within switch");
44798}
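// Illustrative example of the transform above (DAG notation; v4f32 assumed):
//   f32 r = extract_vector_elt (fadd v4f32:X, v4f32:Y), 0
// becomes
//   f32 r = fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0)
// since extracting lane 0 of an FP vector is free on x86.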
44799
44800/// Try to convert a vector reduction sequence composed of binops and shuffles
44801/// into horizontal ops.
44802static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44803 const X86Subtarget &Subtarget) {
44804  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44805
44806  // We need at least SSE2 to do anything here.
44807 if (!Subtarget.hasSSE2())
44808 return SDValue();
44809
44810 ISD::NodeType Opc;
44811 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44812 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44813 if (!Rdx)
44814 return SDValue();
44815
44816 SDValue Index = ExtElt->getOperand(1);
44817  assert(isNullConstant(Index) &&
44818         "Reduction doesn't end in an extract from index 0");
44819
44820 EVT VT = ExtElt->getValueType(0);
44821 EVT VecVT = Rdx.getValueType();
44822 if (VecVT.getScalarType() != VT)
44823 return SDValue();
44824
44825 SDLoc DL(ExtElt);
44826 unsigned NumElts = VecVT.getVectorNumElements();
44827 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44828
44829 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44830 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44831 if (V.getValueType() == MVT::v4i8) {
44832 if (ZeroExtend && Subtarget.hasSSE41()) {
44833 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44834 DAG.getConstant(0, DL, MVT::v4i32),
44835 DAG.getBitcast(MVT::i32, V),
44836 DAG.getIntPtrConstant(0, DL));
44837 return DAG.getBitcast(MVT::v16i8, V);
44838 }
44839 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44840 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44841 : DAG.getUNDEF(MVT::v4i8));
44842 }
44843 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44844 DAG.getUNDEF(MVT::v8i8));
44845 };
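  // Sketch of the lambda above on a non-SSE4.1 path (v4i8 input assumed):
  //   WidenToV16I8(v4i8:V, /*ZeroExtend=*/false)
  //     -> concat_vectors(concat_vectors(V, undef:v4i8), undef:v8i8) : v16i8
  // so V occupies the low 32 bits and the remaining lanes are undef (bits
  // 32-63 become zero instead when ZeroExtend is set).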
44846
44847 // vXi8 mul reduction - promote to vXi16 mul reduction.
44848 if (Opc == ISD::MUL) {
44849 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44850 return SDValue();
44851 if (VecVT.getSizeInBits() >= 128) {
44852 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44853 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44854 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44855 Lo = DAG.getBitcast(WideVT, Lo);
44856 Hi = DAG.getBitcast(WideVT, Hi);
44857 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44858 while (Rdx.getValueSizeInBits() > 128) {
44859 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44860 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44861 }
44862 } else {
44863 Rdx = WidenToV16I8(Rdx, false);
44864 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44865 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44866 }
44867 if (NumElts >= 8)
44868 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44869 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44870 {4, 5, 6, 7, -1, -1, -1, -1}));
44871 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44872 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44873 {2, 3, -1, -1, -1, -1, -1, -1}));
44874 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44875 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44876 {1, -1, -1, -1, -1, -1, -1, -1}));
44877 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44879 }
44880
44881  // vXi8 add reduction - sub-128-bit vector.
44882 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44883 Rdx = WidenToV16I8(Rdx, true);
44884 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44885 DAG.getConstant(0, DL, MVT::v16i8));
44886 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44887 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44888 }
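  // Sketch of why PSADBW works here: psadbw against an all-zero vector sums
  // each group of eight byte lanes into its 64-bit lane, so the sum of the
  // low eight bytes (the original v4i8/v8i8 values, zero-padded as needed)
  // lands in the low i64 lane; bitcasting to v16i8 and extracting lane 0
  // yields the i8 add-reduction result (modulo 256).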
44889
44890 // Must be a >=128-bit vector with pow2 elements.
44891 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44892 return SDValue();
44893
44894 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44895 if (VT == MVT::i8) {
44896 while (Rdx.getValueSizeInBits() > 128) {
44897 SDValue Lo, Hi;
44898 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44899 VecVT = Lo.getValueType();
44900 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44901 }
44902    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
44903
44904 SDValue Hi = DAG.getVectorShuffle(
44905 MVT::v16i8, DL, Rdx, Rdx,
44906 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
44907 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
44908 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44909 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
44910 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44911 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44912 }
44913
44914 // See if we can use vXi8 PSADBW add reduction for larger zext types.
44915 // If the source vector values are 0-255, then we can use PSADBW to
44916 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
44917  // TODO: See if it's worth avoiding vXi16/i32 truncations?
44918 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
44919 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
44920 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
44921 Subtarget.hasAVX512())) {
44922 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
44923 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
44924 if (ByteVT.getSizeInBits() < 128)
44925 Rdx = WidenToV16I8(Rdx, true);
44926
44927 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44928 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44929 ArrayRef<SDValue> Ops) {
44930 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44931 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
44932 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
44933 };
44934 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
44935 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
44936
44937 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
44938 while (Rdx.getValueSizeInBits() > 128) {
44939 SDValue Lo, Hi;
44940 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44941 VecVT = Lo.getValueType();
44942 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44943 }
44944    assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
44945
44946 if (NumElts > 8) {
44947 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
44948 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
44949 }
44950
44951 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
44952 Rdx = DAG.getBitcast(VecVT, Rdx);
44953 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44954 }
44955
44956  // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
44957 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
44958 return SDValue();
44959
44960 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
44961
44962 // 256-bit horizontal instructions operate on 128-bit chunks rather than
44963 // across the whole vector, so we need an extract + hop preliminary stage.
44964 // This is the only step where the operands of the hop are not the same value.
44965 // TODO: We could extend this to handle 512-bit or even longer vectors.
44966 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
44967 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
44968 unsigned NumElts = VecVT.getVectorNumElements();
44969 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
44970 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
44971 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
44972 VecVT = Rdx.getValueType();
44973 }
44974 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
44975 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
44976 return SDValue();
44977
44978 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
44979 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
44980 for (unsigned i = 0; i != ReductionSteps; ++i)
44981 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
44982
44983 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44984}
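// Illustrative example of the HADD path above (v4i32 with SSSE3 assumed):
//   t1 = PHADDD X, X    ; lanes = {x0+x1, x2+x3, x0+x1, x2+x3}
//   t2 = PHADDD t1, t1  ; lane 0 = x0+x1+x2+x3
//   i32 r = extract_vector_elt t2, 0
// i.e. Log2(4) == 2 hop steps replace the shuffle+add reduction tree.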
44985
44986/// Detect vector gather/scatter index generation and convert it from being a
44987/// bunch of shuffles and extracts into a somewhat faster sequence.
44988/// For i686, the best sequence is apparently storing the value and loading
44989/// scalars back, while for x64 we should use 64-bit extracts and shifts.
44990static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
44991 TargetLowering::DAGCombinerInfo &DCI,
44992 const X86Subtarget &Subtarget) {
44993 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
44994 return NewOp;
44995
44996 SDValue InputVector = N->getOperand(0);
44997 SDValue EltIdx = N->getOperand(1);
44998 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
44999
45000 EVT SrcVT = InputVector.getValueType();
45001 EVT VT = N->getValueType(0);
45002 SDLoc dl(InputVector);
45003 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45004 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45005 unsigned NumEltBits = VT.getScalarSizeInBits();
45006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45007
45008 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45009 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45010
45011 // Integer Constant Folding.
45012 if (CIdx && VT.isInteger()) {
45013 APInt UndefVecElts;
45014 SmallVector<APInt, 16> EltBits;
45015 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45016 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45017 EltBits, true, false)) {
45018 uint64_t Idx = CIdx->getZExtValue();
45019 if (UndefVecElts[Idx])
45020 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45021 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45022 }
45023
45024    // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45025    // Improves lowering of bool masks in Rust, which splits them into a byte array.
45026 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45027 SDValue Src = peekThroughBitcasts(InputVector);
45028 if (Src.getValueType().getScalarType() == MVT::i1 &&
45029 TLI.isTypeLegal(Src.getValueType())) {
45030 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45031 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45032 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45033 return DAG.getBitcast(VT, Sub);
45034 }
45035 }
45036 }
45037
45038 if (IsPextr) {
45039 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45040 DCI))
45041 return SDValue(N, 0);
45042
45043 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45044 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45045 InputVector.getOpcode() == X86ISD::PINSRW) &&
45046 InputVector.getOperand(2) == EltIdx) {
45047      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45048             "Vector type mismatch");
45049 SDValue Scl = InputVector.getOperand(1);
45050 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45051 return DAG.getZExtOrTrunc(Scl, dl, VT);
45052 }
45053
45054 // TODO - Remove this once we can handle the implicit zero-extension of
45055 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45056 // combineBasicSADPattern.
45057 return SDValue();
45058 }
45059
45060  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
45061 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
45062 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
45063 SDValue MMXSrc = InputVector.getOperand(0);
45064
45065 // The bitcast source is a direct mmx result.
45066 if (MMXSrc.getValueType() == MVT::x86mmx)
45067 return DAG.getBitcast(VT, InputVector);
45068 }
45069
45070 // Detect mmx to i32 conversion through a v2i32 elt extract.
45071 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
45072 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
45073 SDValue MMXSrc = InputVector.getOperand(0);
45074
45075 // The bitcast source is a direct mmx result.
45076 if (MMXSrc.getValueType() == MVT::x86mmx)
45077 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
45078 }
45079
45080 // Check whether this extract is the root of a sum of absolute differences
45081 // pattern. This has to be done here because we really want it to happen
45083  // pre-legalization.
45083 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45084 return SAD;
45085
45086 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45087 return VPDPBUSD;
45088
45089 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45090 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45091 return Cmp;
45092
45093 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45094 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45095 return MinMax;
45096
45097 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45098 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45099 return V;
45100
45101 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45102 return V;
45103
45104 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
45105 // and then testing the relevant element.
45106 //
45107 // Note that we only combine extracts on the *same* result number, i.e.
45108 // t0 = merge_values a0, a1, a2, a3
45109 // i1 = extract_vector_elt t0, Constant:i64<2>
45110 // i1 = extract_vector_elt t0, Constant:i64<3>
45111 // but not
45112 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45113 // since the latter would need its own MOVMSK.
45114 if (SrcVT.getScalarType() == MVT::i1) {
45115 bool IsVar = !CIdx;
45116 SmallVector<SDNode *, 16> BoolExtracts;
45117 unsigned ResNo = InputVector.getResNo();
45118 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45119 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45120 Use->getOperand(0).getResNo() == ResNo &&
45121 Use->getValueType(0) == MVT::i1) {
45122 BoolExtracts.push_back(Use);
45123 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45124 return true;
45125 }
45126 return false;
45127 };
45128 // TODO: Can we drop the oneuse check for constant extracts?
45129 if (all_of(InputVector->uses(), IsBoolExtract) &&
45130 (IsVar || BoolExtracts.size() > 1)) {
45131 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45132 if (SDValue BC =
45133 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45134 for (SDNode *Use : BoolExtracts) {
45135 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45136 // Mask = 1 << MaskIdx
45137 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45138 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45139 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45140 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45141 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45142 DCI.CombineTo(Use, Res);
45143 }
45144 return SDValue(N, 0);
45145 }
45146 }
45147 }
45148
45149 // If this extract is from a loaded vector value and will be used as an
45150 // integer, that requires a potentially expensive XMM -> GPR transfer.
45151 // Additionally, if we can convert to a scalar integer load, that will likely
45152 // be folded into a subsequent integer op.
45153 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45154 // to a single-use of the loaded vector. For the reasons above, we
45155 // expect this to be profitable even if it creates an extra load.
45156 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45157 return Use->getOpcode() == ISD::STORE ||
45158 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45159 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45160 });
45161 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45162 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45163 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45164 !LikelyUsedAsVector && LoadVec->isSimple()) {
45165 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45166 SDValue NewPtr =
45167 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45168 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45169 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45170 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45171 SDValue Load =
45172 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45173 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45174 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45175 return Load;
45176 }
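  // Illustrative example of the fold above (v4i32 load, constant index 2
  // assumed): instead of loading the whole vector and doing an XMM->GPR
  // extract, emit a scalar i32 load from BasePtr + 8 (PtrOff = 32 * 2 / 8)
  // on the same chain, with the alignment reduced via commonAlignment().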
45177
45178 return SDValue();
45179}
45180
45181// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45182// This is more or less the reverse of combineBitcastvxi1.
45183static SDValue combineToExtendBoolVectorInReg(
45184 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45185 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45186 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45187 Opcode != ISD::ANY_EXTEND)
45188 return SDValue();
45189 if (!DCI.isBeforeLegalizeOps())
45190 return SDValue();
45191 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45192 return SDValue();
45193
45194 EVT SVT = VT.getScalarType();
45195 EVT InSVT = N0.getValueType().getScalarType();
45196 unsigned EltSizeInBits = SVT.getSizeInBits();
45197
45198 // Input type must be extending a bool vector (bit-casted from a scalar
45199 // integer) to legal integer types.
45200 if (!VT.isVector())
45201 return SDValue();
45202 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45203 return SDValue();
45204 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45205 return SDValue();
45206
45207 SDValue N00 = N0.getOperand(0);
45208 EVT SclVT = N00.getValueType();
45209 if (!SclVT.isScalarInteger())
45210 return SDValue();
45211
45212 SDValue Vec;
45213 SmallVector<int> ShuffleMask;
45214 unsigned NumElts = VT.getVectorNumElements();
45215  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45216
45217 // Broadcast the scalar integer to the vector elements.
45218 if (NumElts > EltSizeInBits) {
45219 // If the scalar integer is greater than the vector element size, then we
45220 // must split it down into sub-sections for broadcasting. For example:
45221 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45222 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45223    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45224 unsigned Scale = NumElts / EltSizeInBits;
45225 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45226 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45227 Vec = DAG.getBitcast(VT, Vec);
45228
45229 for (unsigned i = 0; i != Scale; ++i)
45230 ShuffleMask.append(EltSizeInBits, i);
45231 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45232 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45233 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45234 // If we have register broadcast instructions, use the scalar size as the
45235 // element type for the shuffle. Then cast to the wider element type. The
45236 // widened bits won't be used, and this might allow the use of a broadcast
45237 // load.
45238    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45239 unsigned Scale = EltSizeInBits / NumElts;
45240 EVT BroadcastVT =
45241 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45242 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45243 ShuffleMask.append(NumElts * Scale, 0);
45244 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45245 Vec = DAG.getBitcast(VT, Vec);
45246 } else {
45247    // For a smaller scalar integer, we can simply any-extend it to the vector
45248    // element size (we don't care about the upper bits) and broadcast it to
45249    // all elements.
45250 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
45251 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
45252 ShuffleMask.append(NumElts, 0);
45253 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45254 }
45255
45256 // Now, mask the relevant bit in each element.
45257 SmallVector<SDValue, 32> Bits;
45258 for (unsigned i = 0; i != NumElts; ++i) {
45259 int BitIdx = (i % EltSizeInBits);
45260 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
45261 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
45262 }
45263 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
45264 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
45265
45266 // Compare against the bitmask and extend the result.
45267 EVT CCVT = VT.changeVectorElementType(MVT::i1);
45268 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
45269 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
45270
45271 // For SEXT, this is now done, otherwise shift the result down for
45272 // zero-extension.
45273 if (Opcode == ISD::SIGN_EXTEND)
45274 return Vec;
45275 return DAG.getNode(ISD::SRL, DL, VT, Vec,
45276 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45277}
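// Illustrative example of the combine above (i8 mask, pre-AVX512 assumed):
//   v8i16 r = sign_extend (v8i1 (bitcast i8:M))
// becomes roughly: broadcast M to every v8i16 lane, AND lane i with (1 << i),
// SETEQ the result against that same bit mask (all-ones where the bit was
// set), then sign-extend; a zero_extend additionally shifts right by 15.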
45278
45279/// If a vector select has an operand that is -1 or 0, try to simplify the
45280/// select to a bitwise logic operation.
45281/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
45282static SDValue
45283combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
45284 TargetLowering::DAGCombinerInfo &DCI,
45285 const X86Subtarget &Subtarget) {
45286 SDValue Cond = N->getOperand(0);
45287 SDValue LHS = N->getOperand(1);
45288 SDValue RHS = N->getOperand(2);
45289 EVT VT = LHS.getValueType();
45290 EVT CondVT = Cond.getValueType();
45291 SDLoc DL(N);
45292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45293
45294 if (N->getOpcode() != ISD::VSELECT)
45295 return SDValue();
45296
45297  assert(CondVT.isVector() && "Vector select expects a vector selector!");
45298
45299 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
45300 // TODO: Can we assert that both operands are not zeros (because that should
45301 // get simplified at node creation time)?
45302 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
45303 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
45304
45305 // If both inputs are 0/undef, create a complete zero vector.
45306 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
45307 if (TValIsAllZeros && FValIsAllZeros) {
45308 if (VT.isFloatingPoint())
45309 return DAG.getConstantFP(0.0, DL, VT);
45310 return DAG.getConstant(0, DL, VT);
45311 }
45312
45313 // To use the condition operand as a bitwise mask, it must have elements that
45314  // are the same size as the select elements, i.e., the condition operand must
45315 // have already been promoted from the IR select condition type <N x i1>.
45316 // Don't check if the types themselves are equal because that excludes
45317 // vector floating-point selects.
45318 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
45319 return SDValue();
45320
45321 // Try to invert the condition if true value is not all 1s and false value is
45322 // not all 0s. Only do this if the condition has one use.
45323 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
45324 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
45325 // Check if the selector will be produced by CMPP*/PCMP*.
45326 Cond.getOpcode() == ISD::SETCC &&
45327 // Check if SETCC has already been promoted.
45328 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
45329 CondVT) {
45330 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
45331
45332 if (TValIsAllZeros || FValIsAllOnes) {
45333 SDValue CC = Cond.getOperand(2);
45334 ISD::CondCode NewCC = ISD::getSetCCInverse(
45335 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45336 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
45337 NewCC);
45338 std::swap(LHS, RHS);
45339 TValIsAllOnes = FValIsAllOnes;
45340 FValIsAllZeros = TValIsAllZeros;
45341 }
45342 }
45343
45344 // Cond value must be 'sign splat' to be converted to a logical op.
45345 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
45346 return SDValue();
45347
45348 // vselect Cond, 111..., 000... -> Cond
45349 if (TValIsAllOnes && FValIsAllZeros)
45350 return DAG.getBitcast(VT, Cond);
45351
45352 if (!TLI.isTypeLegal(CondVT))
45353 return SDValue();
45354
45355 // vselect Cond, 111..., X -> or Cond, X
45356 if (TValIsAllOnes) {
45357 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45358 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
45359 return DAG.getBitcast(VT, Or);
45360 }
45361
45362 // vselect Cond, X, 000... -> and Cond, X
45363 if (FValIsAllZeros) {
45364 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
45365 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
45366 return DAG.getBitcast(VT, And);
45367 }
45368
45369 // vselect Cond, 000..., X -> andn Cond, X
45370 if (TValIsAllZeros) {
45371 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45372 SDValue AndN;
45373 // The canonical form differs for i1 vectors - x86andnp is not used
45374 if (CondVT.getScalarType() == MVT::i1)
45375 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
45376 CastRHS);
45377 else
45378 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45379 return DAG.getBitcast(VT, AndN);
45380 }
45381
45382 return SDValue();
45383}
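// Illustrative example of the folds above (v4i32 assumed, Cond known to be
// all-ones/zero per lane): vselect Cond, X, zeroinitializer becomes
// (and Cond, X), since an all-ones lane keeps X and an all-zeros lane
// produces zero, matching the select semantics bit-for-bit.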
45384
45385/// If both arms of a vector select are concatenated vectors, split the select,
45386/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45387/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45388/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45389static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45390 const X86Subtarget &Subtarget) {
45391 unsigned Opcode = N->getOpcode();
45392 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45393 return SDValue();
45394
45395 // TODO: Split 512-bit vectors too?
45396 EVT VT = N->getValueType(0);
45397 if (!VT.is256BitVector())
45398 return SDValue();
45399
45400 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45401 SDValue Cond = N->getOperand(0);
45402 SDValue TVal = N->getOperand(1);
45403 SDValue FVal = N->getOperand(2);
45404 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
45405 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45406 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
45407 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
45408 return SDValue();
45409
45410 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45411 ArrayRef<SDValue> Ops) {
45412 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45413 };
45414 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45415 makeBlend, /*CheckBWI*/ false);
45416}
45417
45418static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45419 SDValue Cond = N->getOperand(0);
45420 SDValue LHS = N->getOperand(1);
45421 SDValue RHS = N->getOperand(2);
45422 SDLoc DL(N);
45423
45424 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45425 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45426 if (!TrueC || !FalseC)
45427 return SDValue();
45428
45429 // Don't do this for crazy integer types.
45430 EVT VT = N->getValueType(0);
45431 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45432 return SDValue();
45433
45434 // We're going to use the condition bit in math or logic ops. We could allow
45435 // this with a wider condition value (post-legalization it becomes an i8),
45436 // but if nothing is creating selects that late, it doesn't matter.
45437 if (Cond.getValueType() != MVT::i1)
45438 return SDValue();
45439
45440 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45441 // 3, 5, or 9 with i32/i64, so those get transformed too.
45442 // TODO: For constants that overflow or do not differ by power-of-2 or small
45443 // multiplier, convert to 'and' + 'add'.
45444 const APInt &TrueVal = TrueC->getAPIntValue();
45445 const APInt &FalseVal = FalseC->getAPIntValue();
45446
45447 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45448 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45449 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45450 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45451 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45452 return SDValue();
45453 }
45454
45455 bool OV;
45456 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45457 if (OV)
45458 return SDValue();
45459
45460 APInt AbsDiff = Diff.abs();
45461 if (AbsDiff.isPowerOf2() ||
45462 ((VT == MVT::i32 || VT == MVT::i64) &&
45463 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45464
45465 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45466 // of the condition can usually be folded into a compare predicate, but even
45467 // without that, the sequence should be cheaper than a CMOV alternative.
45468 if (TrueVal.slt(FalseVal)) {
45469 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45470 std::swap(TrueC, FalseC);
45471 }
45472
45473 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
45474 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45475
45476 // Multiply condition by the difference if non-one.
45477 if (!AbsDiff.isOne())
45478 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45479
45480 // Add the base if non-zero.
45481 if (!FalseC->isZero())
45482 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45483
45484 return R;
45485 }
45486
45487 return SDValue();
45488}
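// Illustrative example of the transform above (i32 constants assumed):
//   select i1 %c, i32 7, i32 3
// has AbsDiff = 4 (a power of two), so it becomes (zext %c) * 4 + 3, i.e. a
// shift plus an add instead of a CMOV; %c == 1 yields 7 and %c == 0 yields 3.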
45489
45490/// If this is a *dynamic* select (non-constant condition) and we can match
45491/// this node with one of the variable blend instructions, restructure the
45492/// condition so that blends can use the high (sign) bit of each element.
45493/// This function will also call SimplifyDemandedBits on already created
45494/// BLENDV to perform additional simplifications.
45495static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45496 TargetLowering::DAGCombinerInfo &DCI,
45497 const X86Subtarget &Subtarget) {
45498 SDValue Cond = N->getOperand(0);
45499 if ((N->getOpcode() != ISD::VSELECT &&
45500 N->getOpcode() != X86ISD::BLENDV) ||
45501 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45502 return SDValue();
45503
45504 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45505 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45506 EVT VT = N->getValueType(0);
45507
45508 // We can only handle the cases where VSELECT is directly legal on the
45509 // subtarget. We custom lower VSELECT nodes with constant conditions and
45510 // this makes it hard to see whether a dynamic VSELECT will correctly
45511 // lower, so we both check the operation's status and explicitly handle the
45512 // cases where a *dynamic* blend will fail even though a constant-condition
45513 // blend could be custom lowered.
45514 // FIXME: We should find a better way to handle this class of problems.
45515 // Potentially, we should combine constant-condition vselect nodes
45516 // pre-legalization into shuffles and not mark as many types as custom
45517 // lowered.
45518 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45519 return SDValue();
45520 // FIXME: We don't support i16-element blends currently. We could and
45521 // should support them by making *all* the bits in the condition be set
45522 // rather than just the high bit and using an i8-element blend.
45523 if (VT.getVectorElementType() == MVT::i16)
45524 return SDValue();
45525 // Dynamic blending was only available from SSE4.1 onward.
45526 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45527 return SDValue();
45528 // Byte blends are only available in AVX2
45529 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45530 return SDValue();
45531 // There are no 512-bit blend instructions that use sign bits.
45532 if (VT.is512BitVector())
45533 return SDValue();
45534
45535 // Don't optimize before the condition has been transformed to a legal type
45536 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45537 if (BitWidth < 8 || BitWidth > 64)
45538 return SDValue();
45539
45540 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45541 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45542 UI != UE; ++UI)
45543 if ((UI->getOpcode() != ISD::VSELECT &&
45544 UI->getOpcode() != X86ISD::BLENDV) ||
45545 UI.getOperandNo() != 0)
45546 return false;
45547
45548 return true;
45549 };
45550
45551 APInt DemandedBits(APInt::getSignMask(BitWidth));
45552
45553 if (OnlyUsedAsSelectCond(Cond)) {
45554 KnownBits Known;
45555 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45556 !DCI.isBeforeLegalizeOps());
45557 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45558 return SDValue();
45559
45560 // If we changed the computation somewhere in the DAG, this change will
45561 // affect all users of Cond. Update all the nodes so that we do not use
45562 // the generic VSELECT anymore. Otherwise, we may perform wrong
45563 // optimizations as we messed with the actual expectation for the vector
45564 // boolean values.
45565 for (SDNode *U : Cond->uses()) {
45566 if (U->getOpcode() == X86ISD::BLENDV)
45567 continue;
45568
45569 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45570 Cond, U->getOperand(1), U->getOperand(2));
45571 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45572 DCI.AddToWorklist(U);
45573 }
45574 DCI.CommitTargetLoweringOpt(TLO);
45575 return SDValue(N, 0);
45576 }
45577
45578 // Otherwise we can still at least try to simplify multiple use bits.
45579 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45580 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45581 N->getOperand(1), N->getOperand(2));
45582
45583 return SDValue();
45584}
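// Sketch of the demanded-bits step above: only the sign bit of each condition
// element is demanded, so e.g. a condition of the form (sra X, 31) that merely
// replicates X's sign bit can typically be simplified back to X (an assumed,
// common case), and every VSELECT user of the condition is then rewritten to
// X86ISD::BLENDV.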
45585
45586// Try to match:
45587// (or (and (M, (sub 0, X)), (pandn M, X)))
45588// which is a special case of:
45589// (select M, (sub 0, X), X)
45590// Per:
45591// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45592// We know that, if fNegate is 0 or 1:
45593// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45594//
45595// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45596// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45597// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45598// This lets us transform our vselect to:
45599// (add (xor X, M), (and M, 1))
45600// And further to:
45601// (sub (xor X, M), M)
45602static SDValue combineLogicBlendIntoConditionalNegate(
45603 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45604 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45605 EVT MaskVT = Mask.getValueType();
45606  assert(MaskVT.isInteger() &&
45607         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45608         "Mask must be zero/all-bits");
45609
45610 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45611 return SDValue();
45612 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
45613 return SDValue();
45614
45615 auto IsNegV = [](SDNode *N, SDValue V) {
45616 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45617 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45618 };
45619
45620 SDValue V;
45621 if (IsNegV(Y.getNode(), X))
45622 V = X;
45623 else if (IsNegV(X.getNode(), Y))
45624 V = Y;
45625 else
45626 return SDValue();
45627
45628 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45629 SDValue SubOp2 = Mask;
45630
45631 // If the negate was on the false side of the select, then
45632 // the operands of the SUB need to be swapped. PR 27251.
45633 // This is because the pattern being matched above is
45634 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45635 // but if the pattern matched was
45636 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
45637 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45638 // pattern also needs to be a negation of the replacement pattern above.
45639 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45640 // sub accomplishes the negation of the replacement pattern.
45641 if (V == Y)
45642 std::swap(SubOp1, SubOp2);
45643
45644 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45645 return DAG.getBitcast(VT, Res);
45646}
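// Illustrative check of the identity used above (all-ones/zero mask M assumed):
//   vselect M, (sub 0, X), X  -->  sub (xor X, M), M
// M == -1: (X ^ -1) - (-1) = ~X + 1 = -X;  M == 0: (X ^ 0) - 0 = X.
// When the negate is on the false arm, the SUB operands are swapped as noted.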
45647
45648/// Do target-specific dag combines on SELECT and VSELECT nodes.
45649static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45650 TargetLowering::DAGCombinerInfo &DCI,
45651 const X86Subtarget &Subtarget) {
45652 SDLoc DL(N);
45653 SDValue Cond = N->getOperand(0);
45654 SDValue LHS = N->getOperand(1);
45655 SDValue RHS = N->getOperand(2);
45656
45657 // Try simplification again because we use this function to optimize
45658 // BLENDV nodes that are not handled by the generic combiner.
45659 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45660 return V;
45661
45662 EVT VT = LHS.getValueType();
45663 EVT CondVT = Cond.getValueType();
45664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45665 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45666
45667 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45668 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45669 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45670 if (CondVT.isVector() && CondVT.isInteger() &&
45671 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45672 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45673 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45674 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45675 DL, DAG, Subtarget))
45676 return V;
45677
45678 // Convert vselects with constant condition into shuffles.
45679 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45680 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45681 SmallVector<int, 64> Mask;
45682 if (createShuffleMaskFromVSELECT(Mask, Cond,
45683 N->getOpcode() == X86ISD::BLENDV))
45684 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45685 }
45686
45687 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45688 // by forcing the unselected elements to zero.
45689 // TODO: Can we handle more shuffles with this?
45690 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45691 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45692 LHS.hasOneUse() && RHS.hasOneUse()) {
45693 MVT SimpleVT = VT.getSimpleVT();
45694 SmallVector<SDValue, 1> LHSOps, RHSOps;
45695 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45696 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45697 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
45698 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
45699 int NumElts = VT.getVectorNumElements();
45700 for (int i = 0; i != NumElts; ++i) {
45701 // getConstVector sets negative shuffle mask values as undef, so ensure
45702 // we hardcode SM_SentinelZero values to zero (0x80).
45703 if (CondMask[i] < NumElts) {
45704 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45705 RHSMask[i] = 0x80;
45706 } else {
45707 LHSMask[i] = 0x80;
45708 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45709 }
45710 }
45711 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45712 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45713 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45714 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45715 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45716 }
45717 }
45718
45719 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45720 // instructions match the semantics of the common C idiom x<y?x:y but not
45721 // x<=y?x:y, because of how they handle negative zero (which can be
45722 // ignored in unsafe-math mode).
45723 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
45724 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45725 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
45726 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45727 (Subtarget.hasSSE2() ||
45728 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45729 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45730
45731 unsigned Opcode = 0;
45732 // Check for x CC y ? x : y.
45733 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45734 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45735 switch (CC) {
45736 default: break;
45737 case ISD::SETULT:
45738 // Converting this to a min would handle NaNs incorrectly, and swapping
45739 // the operands would cause it to handle comparisons between positive
45740 // and negative zero incorrectly.
45741 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45742 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45743 !(DAG.isKnownNeverZeroFloat(LHS) ||
45744 DAG.isKnownNeverZeroFloat(RHS)))
45745 break;
45746 std::swap(LHS, RHS);
45747 }
45748 Opcode = X86ISD::FMIN;
45749 break;
45750 case ISD::SETOLE:
45751 // Converting this to a min would handle comparisons between positive
45752 // and negative zero incorrectly.
45753 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45754 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45755 break;
45756 Opcode = X86ISD::FMIN;
45757 break;
45758 case ISD::SETULE:
45759 // Converting this to a min would handle both negative zeros and NaNs
45760 // incorrectly, but we can swap the operands to fix both.
45761 std::swap(LHS, RHS);
45762 [[fallthrough]];
45763 case ISD::SETOLT:
45764 case ISD::SETLT:
45765 case ISD::SETLE:
45766 Opcode = X86ISD::FMIN;
45767 break;
45768
45769 case ISD::SETOGE:
45770 // Converting this to a max would handle comparisons between positive
45771 // and negative zero incorrectly.
45772 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45773 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45774 break;
45775 Opcode = X86ISD::FMAX;
45776 break;
45777 case ISD::SETUGT:
45778 // Converting this to a max would handle NaNs incorrectly, and swapping
45779 // the operands would cause it to handle comparisons between positive
45780 // and negative zero incorrectly.
45781 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45782 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45783 !(DAG.isKnownNeverZeroFloat(LHS) ||
45784 DAG.isKnownNeverZeroFloat(RHS)))
45785 break;
45786 std::swap(LHS, RHS);
45787 }
45788 Opcode = X86ISD::FMAX;
45789 break;
45790 case ISD::SETUGE:
45791 // Converting this to a max would handle both negative zeros and NaNs
45792 // incorrectly, but we can swap the operands to fix both.
45793 std::swap(LHS, RHS);
45794 [[fallthrough]];
45795 case ISD::SETOGT:
45796 case ISD::SETGT:
45797 case ISD::SETGE:
45798 Opcode = X86ISD::FMAX;
45799 break;
45800 }
45801 // Check for x CC y ? y : x -- a min/max with reversed arms.
45802 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45803 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45804 switch (CC) {
45805 default: break;
45806 case ISD::SETOGE:
45807 // Converting this to a min would handle comparisons between positive
45808 // and negative zero incorrectly, and swapping the operands would
45809 // cause it to handle NaNs incorrectly.
45810 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45811 !(DAG.isKnownNeverZeroFloat(LHS) ||
45812 DAG.isKnownNeverZeroFloat(RHS))) {
45813 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45814 break;
45815 std::swap(LHS, RHS);
45816 }
45817 Opcode = X86ISD::FMIN;
45818 break;
45819 case ISD::SETUGT:
45820 // Converting this to a min would handle NaNs incorrectly.
45821 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45822 break;
45823 Opcode = X86ISD::FMIN;
45824 break;
45825 case ISD::SETUGE:
45826 // Converting this to a min would handle both negative zeros and NaNs
45827 // incorrectly, but we can swap the operands to fix both.
45828 std::swap(LHS, RHS);
45829 [[fallthrough]];
45830 case ISD::SETOGT:
45831 case ISD::SETGT:
45832 case ISD::SETGE:
45833 Opcode = X86ISD::FMIN;
45834 break;
45835
45836 case ISD::SETULT:
45837 // Converting this to a max would handle NaNs incorrectly.
45838 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45839 break;
45840 Opcode = X86ISD::FMAX;
45841 break;
45842 case ISD::SETOLE:
45843 // Converting this to a max would handle comparisons between positive
45844 // and negative zero incorrectly, and swapping the operands would
45845 // cause it to handle NaNs incorrectly.
45846 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45847 !DAG.isKnownNeverZeroFloat(LHS) &&
45848 !DAG.isKnownNeverZeroFloat(RHS)) {
45849 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45850 break;
45851 std::swap(LHS, RHS);
45852 }
45853 Opcode = X86ISD::FMAX;
45854 break;
45855 case ISD::SETULE:
45856 // Converting this to a max would handle both negative zeros and NaNs
45857 // incorrectly, but we can swap the operands to fix both.
45858 std::swap(LHS, RHS);
45859 [[fallthrough]];
45860 case ISD::SETOLT:
45861 case ISD::SETLT:
45862 case ISD::SETLE:
45863 Opcode = X86ISD::FMAX;
45864 break;
45865 }
45866 }
45867
45868 if (Opcode)
45869 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45870 }
45871
45872 // Some mask scalar intrinsics rely on checking if only one bit is set
45873 // and implement it in C code like this:
45874 // A[0] = (U & 1) ? A[0] : W[0];
45875 // This creates some redundant instructions that break pattern matching.
45876  // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
45877 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
45878 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
45879 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45880 SDValue AndNode = Cond.getOperand(0);
45881 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
45882 isNullConstant(Cond.getOperand(1)) &&
45883 isOneConstant(AndNode.getOperand(1))) {
45884      // LHS and RHS are swapped because the setcc outputs 1 when the AND
45885      // result is 0, and vice versa.
45886 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
45887 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
45888 }
45889 }
45890
45891 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
45892 // lowering on KNL. In this case we convert it to
45893  // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
45894  // The same applies to all vectors of i8 and i16 elements without BWI.
45895 // Make sure we extend these even before type legalization gets a chance to
45896 // split wide vectors.
45897 // Since SKX these selects have a proper lowering.
45898 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
45899 CondVT.getVectorElementType() == MVT::i1 &&
45900 (VT.getVectorElementType() == MVT::i8 ||
45901 VT.getVectorElementType() == MVT::i16)) {
45902 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
45903 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
45904 }
45905
45906 // AVX512 - Extend select with zero to merge with target shuffle.
45907 // select(mask, extract_subvector(shuffle(x)), zero) -->
45908 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
45909  // TODO - support non-target shuffles as well.
45910 if (Subtarget.hasAVX512() && CondVT.isVector() &&
45911 CondVT.getVectorElementType() == MVT::i1) {
45912 auto SelectableOp = [&TLI](SDValue Op) {
45913 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45914 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
45915 isNullConstant(Op.getOperand(1)) &&
45916 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
45917 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
45918 };
45919
45920 bool SelectableLHS = SelectableOp(LHS);
45921 bool SelectableRHS = SelectableOp(RHS);
45922 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
45923 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
45924
45925 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
45926 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
45927 : RHS.getOperand(0).getValueType();
45928 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
45929 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
45930 VT.getSizeInBits());
45931 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
45932 VT.getSizeInBits());
45933 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
45934 DAG.getUNDEF(SrcCondVT), Cond,
45935 DAG.getIntPtrConstant(0, DL));
45936 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
45937 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
45938 }
45939 }
45940
45941 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
45942 return V;
45943
45944 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
45945 Cond.hasOneUse()) {
45946 EVT CondVT = Cond.getValueType();
45947 SDValue Cond0 = Cond.getOperand(0);
45948 SDValue Cond1 = Cond.getOperand(1);
45949 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45950
45951 // Canonicalize min/max:
45952 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
45953 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
45954 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
45955 // the need for an extra compare against zero. e.g.
45956 // ((a - b) > 0) ? (a - b) : 0 -> ((a - b) >= 0) ? (a - b) : 0
45957 // subl %esi, %edi
45958 // testl %edi, %edi
45959 // movl $0, %eax
45960 // cmovgl %edi, %eax
45961 // =>
45962 // xorl %eax, %eax
45963 // subl %esi, %edi
45964 // cmovsl %eax, %edi
45965 //
45966 // We can also canonicalize
45967 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
45968 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
45969 // This allows the use of a test instruction for the compare.
45970 if (LHS == Cond0 && RHS == Cond1) {
45971 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
45972 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
45973 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
45974 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45975 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45976 }
45977 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
45978 ISD::CondCode NewCC = ISD::SETUGE;
45979 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45980 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45981 }
45982 }
45983
45984 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
45985 // fold eq + gt/lt nested selects into ge/le selects
45986 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
45987 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
45988 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
45989 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
45990 // .. etc ..
45991 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
45992 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
45993 SDValue InnerSetCC = RHS.getOperand(0);
45994 ISD::CondCode InnerCC =
45995 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
45996 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
45997 Cond0 == InnerSetCC.getOperand(0) &&
45998 Cond1 == InnerSetCC.getOperand(1)) {
45999 ISD::CondCode NewCC;
46000 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46001 case ISD::SETGT: NewCC = ISD::SETGE; break;
46002 case ISD::SETLT: NewCC = ISD::SETLE; break;
46003 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46004 case ISD::SETULT: NewCC = ISD::SETULE; break;
46005 default: NewCC = ISD::SETCC_INVALID; break;
46006 }
46007 if (NewCC != ISD::SETCC_INVALID) {
46008 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46009 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46010 }
46011 }
46012 }
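// Illustrative aside (not from the original file): a quick case check of the
// eq + ugt fold, with Cond0 = a and Cond1 = b:
//   a == b : outer eq selects LHS;        a u>= b also selects LHS.
//   a u> b : eq fails, inner ugt -> LHS;  a u>= b also selects LHS.
//   a u< b : eq and ugt both fail -> Y;   a u>= b also selects Y.
// So the nested selects collapse to a single select on the uge compare.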
46013 }
46014
46015 // Check if the first operand is all zeros and Cond type is vXi1.
46016 // If this is an AVX512 target we can improve the use of zero masking by
46017 // swapping the operands and inverting the condition.
46018 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46019 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46020 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46021 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46022 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46023 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46024 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46025 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46026 }
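// Illustrative aside (not from the original file): with a vXi1 mask,
//   vselect cond, zero, X  ==  vselect not(cond), X, zero
// and the right-hand form maps onto AVX-512 zero-masking, e.g. a move such as
//   vmovdqa32 %zmm1, %zmm0 {%k1} {z}
// which keeps the lanes where the (inverted) mask is set and zeroes the rest.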
46027
46028 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46029 // get split by legalization.
46030 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46031 CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
46032 TLI.isTypeLegal(VT.getScalarType())) {
46033 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46034 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46035 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46036 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46037 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46038 }
46039 }
46040
46041 // Early exit check
46042 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46043 return SDValue();
46044
46045 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46046 return V;
46047
46048 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46049 return V;
46050
46051 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46052 return V;
46053
46054 // select(~Cond, X, Y) -> select(Cond, Y, X)
46055 if (CondVT.getScalarType() != MVT::i1) {
46056 if (SDValue CondNot = IsNOT(Cond, DAG))
46057 return DAG.getNode(N->getOpcode(), DL, VT,
46058 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46059
46060 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse()) {
46061 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46062 // signbit.
46063 if (ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
46064 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46065 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46066 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46067 }
46068
46069 // smin(LHS, RHS) : select(pcmpgt(RHS, LHS), LHS, RHS)
46070 // -> select(pcmpgt(LHS, RHS), RHS, LHS)
46071 // iff the commuted pcmpgt() already exists.
46072 // TODO: Could DAGCombiner::combine do a CSE search for SETCC nodes, like it
46073 // does for commutative binops?
46074 if (Cond.getOperand(0) == RHS && Cond.getOperand(1) == LHS) {
46075 if (SDNode *FlipCond =
46076 DAG.getNodeIfExists(X86ISD::PCMPGT, DAG.getVTList(CondVT),
46077 {Cond.getOperand(1), Cond.getOperand(0)})) {
46078 return DAG.getNode(N->getOpcode(), DL, VT, SDValue(FlipCond, 0), RHS,
46079 LHS);
46080 }
46081 }
46082 }
46083 }
46084
46085 // Try to optimize vXi1 selects if both operands are either all constants or
46086 // bitcasts from scalar integer type. In that case we can convert the operands
46087 // to integer and use an integer select which will be converted to a CMOV.
46088 // We need to take a little bit of care to avoid creating an i64 type after
46089 // type legalization.
46090 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46091 VT.getVectorElementType() == MVT::i1 &&
46092 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46093 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46094 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46095 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46096
46097 if ((LHSIsConst ||
46098 (LHS.getOpcode() == ISD::BITCAST &&
46099 LHS.getOperand(0).getValueType() == IntVT)) &&
46100 (RHSIsConst ||
46101 (RHS.getOpcode() == ISD::BITCAST &&
46102 RHS.getOperand(0).getValueType() == IntVT))) {
46103 if (LHSIsConst)
46104 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46105 else
46106 LHS = LHS.getOperand(0);
46107
46108 if (RHSIsConst)
46109 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46110 else
46111 RHS = RHS.getOperand(0);
46112
46113 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46114 return DAG.getBitcast(VT, Select);
46115 }
46116 }
46117
46118 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46119 // single bits, then invert the predicate and swap the select operands.
46120 // This can lower using a vector shift bit-hack rather than mask and compare.
46121 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46122 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46123 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46124 Cond.getOperand(0).getOpcode() == ISD::AND &&
46125 isNullOrNullSplat(Cond.getOperand(1)) &&
46126 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46127 Cond.getOperand(0).getValueType() == VT) {
46128 // The 'and' mask must be composed of power-of-2 constants.
46129 SDValue And = Cond.getOperand(0);
46130 auto *C = isConstOrConstSplat(And.getOperand(1));
46131 if (C && C->getAPIntValue().isPowerOf2()) {
46132 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46133 SDValue NotCond =
46134 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46135 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46136 }
46137
46138 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46139 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46140 // 16-bit lacks a proper blendv.
46141 unsigned EltBitWidth = VT.getScalarSizeInBits();
46142 bool CanShiftBlend =
46143 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46144 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46145 (Subtarget.hasXOP()));
46146 if (CanShiftBlend &&
46147 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46148 return C->getAPIntValue().isPowerOf2();
46149 })) {
46150 // Create a left-shift constant to get the mask bits over to the sign-bit.
46151 SDValue Mask = And.getOperand(1);
46152 SmallVector<int, 32> ShlVals;
46153 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46154 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46155 ShlVals.push_back(EltBitWidth - 1 -
46156 MaskVal->getAPIntValue().exactLogBase2());
46157 }
46158 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46159 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46160 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46161 SDValue NewCond =
46162 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46163 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46164 }
46165 }
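// Illustrative aside (not from the original file): for 32-bit elements and a
// mask element C = 0x10 (bit 4), the shift amount is 31 - log2(0x10) = 27, so
//   (x & 0x10) == 0 ? L : R   becomes   (x << 27) < 0 ? R : L
// because shifting bit 4 into the sign bit lets the signed compare read that bit.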
46166
46167 return SDValue();
46168}
46169
46170/// Combine:
46171/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46172/// to:
46173/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46174/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46175/// Note that this is only legal for some op/cc combinations.
46176static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46177 SelectionDAG &DAG,
46178 const X86Subtarget &Subtarget) {
46179 // This combine only operates on CMP-like nodes.
46180 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46181 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46182 return SDValue();
46183
46184 // Can't replace the cmp if it has more uses than the one we're looking at.
46185 // FIXME: We would like to be able to handle this, but would need to make sure
46186 // all uses were updated.
46187 if (!Cmp.hasOneUse())
46188 return SDValue();
46189
46190 // This only applies to variations of the common case:
46191 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46192 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46193 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46194 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46195 // Using the proper condcodes (see below), overflow is checked for.
46196
46197 // FIXME: We can generalize both constraints:
46198 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46199 // - LHS != 1
46200 // if the result is compared.
46201
46202 SDValue CmpLHS = Cmp.getOperand(0);
46203 SDValue CmpRHS = Cmp.getOperand(1);
46204 EVT CmpVT = CmpLHS.getValueType();
46205
46206 if (!CmpLHS.hasOneUse())
46207 return SDValue();
46208
46209 unsigned Opc = CmpLHS.getOpcode();
46210 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46211 return SDValue();
46212
46213 SDValue OpRHS = CmpLHS.getOperand(2);
46214 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46215 if (!OpRHSC)
46216 return SDValue();
46217
46218 APInt Addend = OpRHSC->getAPIntValue();
46219 if (Opc == ISD::ATOMIC_LOAD_SUB)
46220 Addend = -Addend;
46221
46222 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46223 if (!CmpRHSC)
46224 return SDValue();
46225
46226 APInt Comparison = CmpRHSC->getAPIntValue();
46227 APInt NegAddend = -Addend;
46228
46229 // See if we can adjust the CC to make the comparison match the negated
46230 // addend.
46231 if (Comparison != NegAddend) {
46232 APInt IncComparison = Comparison + 1;
46233 if (IncComparison == NegAddend) {
46234 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46235 Comparison = IncComparison;
46236 CC = X86::COND_AE;
46237 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46238 Comparison = IncComparison;
46239 CC = X86::COND_L;
46240 }
46241 }
46242 APInt DecComparison = Comparison - 1;
46243 if (DecComparison == NegAddend) {
46244 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46245 Comparison = DecComparison;
46246 CC = X86::COND_A;
46247 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46248 Comparison = DecComparison;
46249 CC = X86::COND_LE;
46250 }
46251 }
46252 }
46253
46254 // If the addend is the negation of the comparison value, then we can do
46255 // a full comparison by emitting the atomic arithmetic as a locked sub.
46256 if (Comparison == NegAddend) {
46257 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46258 // atomic sub.
46259 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46260 auto AtomicSub = DAG.getAtomic(
46261 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46262 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46263 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46264 AN->getMemOperand());
46265 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
46266 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46267 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46268 return LockOp;
46269 }
46270
46271 // We can handle comparisons with zero in a number of cases by manipulating
46272 // the CC used.
46273 if (!Comparison.isZero())
46274 return SDValue();
46275
46276 if (CC == X86::COND_S && Addend == 1)
46277 CC = X86::COND_LE;
46278 else if (CC == X86::COND_NS && Addend == 1)
46279 CC = X86::COND_G;
46280 else if (CC == X86::COND_G && Addend == -1)
46281 CC = X86::COND_GE;
46282 else if (CC == X86::COND_LE && Addend == -1)
46283 CC = X86::COND_L;
46284 else
46285 return SDValue();
46286
46287 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
46288 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46289 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46290 return LockOp;
46291}
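// Illustrative aside (not from the original file): for the COND_S / Addend == 1
// case, a source pattern like
//   if (atomic_fetch_add(&v, 1) < 0) ...
// tests the *old* value, and "old < 0" is exactly "old <= -1", which is what JLE
// reads from the flags of "lock addl $1, (v)" (ZF: old == -1, SF^OF: old < -1),
// so the separate CMP against zero can be dropped and the LOCKed add's EFLAGS reused.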
46292
46293// Check whether a boolean test is testing a boolean value generated by
46294// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
46295// code.
46296//
46297// Simplify the following patterns:
46298// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
46299// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
46300// to (Op EFLAGS Cond)
46301//
46302// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
46303// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
46304// to (Op EFLAGS !Cond)
46305//
46306// where Op could be BRCOND or CMOV.
46307//
46308static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
46309 // This combine only operates on CMP-like nodes.
46310 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46311 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46312 return SDValue();
46313
46314 // Quit if not used as a boolean value.
46315 if (CC != X86::COND_E && CC != X86::COND_NE)
46316 return SDValue();
46317
46318 // Check CMP operands. One of them should be 0 or 1 and the other should be
46319 // a SetCC or a value extended from it.
46320 SDValue Op1 = Cmp.getOperand(0);
46321 SDValue Op2 = Cmp.getOperand(1);
46322
46323 SDValue SetCC;
46324 const ConstantSDNode* C = nullptr;
46325 bool needOppositeCond = (CC == X86::COND_E);
46326 bool checkAgainstTrue = false; // Is it a comparison against 1?
46327
46328 if ((C = dyn_cast<ConstantSDNode>(Op1)))
46329 SetCC = Op2;
46330 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
46331 SetCC = Op1;
46332 else // Quit if neither operand is a constant.
46333 return SDValue();
46334
46335 if (C->getZExtValue() == 1) {
46336 needOppositeCond = !needOppositeCond;
46337 checkAgainstTrue = true;
46338 } else if (C->getZExtValue() != 0)
46339 // Quit if the constant is neither 0 nor 1.
46340 return SDValue();
46341
46342 bool truncatedToBoolWithAnd = false;
46343 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
46344 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
46345 SetCC.getOpcode() == ISD::TRUNCATE ||
46346 SetCC.getOpcode() == ISD::AND) {
46347 if (SetCC.getOpcode() == ISD::AND) {
46348 int OpIdx = -1;
46349 if (isOneConstant(SetCC.getOperand(0)))
46350 OpIdx = 1;
46351 if (isOneConstant(SetCC.getOperand(1)))
46352 OpIdx = 0;
46353 if (OpIdx < 0)
46354 break;
46355 SetCC = SetCC.getOperand(OpIdx);
46356 truncatedToBoolWithAnd = true;
46357 } else
46358 SetCC = SetCC.getOperand(0);
46359 }
46360
46361 switch (SetCC.getOpcode()) {
46362 case X86ISD::SETCC_CARRY:
46363 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46364 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46365 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46366 // truncated to i1 using 'and'.
46367 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46368 break;
46369     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46370            "Invalid use of SETCC_CARRY!");
46371 [[fallthrough]];
46372 case X86ISD::SETCC:
46373 // Set the condition code or opposite one if necessary.
46374 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46375 if (needOppositeCond)
46376 CC = X86::GetOppositeBranchCondition(CC);
46377 return SetCC.getOperand(1);
46378 case X86ISD::CMOV: {
46379 // Check whether false/true value has canonical one, i.e. 0 or 1.
46380 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46381 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46382 // Quit if true value is not a constant.
46383 if (!TVal)
46384 return SDValue();
46385 // Quit if false value is not a constant.
46386 if (!FVal) {
46387 SDValue Op = SetCC.getOperand(0);
46388 // Skip 'zext' or 'trunc' node.
46389 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46390 Op.getOpcode() == ISD::TRUNCATE)
46391 Op = Op.getOperand(0);
46392 // A special case for rdrand/rdseed, where 0 is set if false cond is
46393 // found.
46394 if ((Op.getOpcode() != X86ISD::RDRAND &&
46395 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46396 return SDValue();
46397 }
46398 // Quit if false value is not the constant 0 or 1.
46399 bool FValIsFalse = true;
46400 if (FVal && FVal->getZExtValue() != 0) {
46401 if (FVal->getZExtValue() != 1)
46402 return SDValue();
46403 // If FVal is 1, opposite cond is needed.
46404 needOppositeCond = !needOppositeCond;
46405 FValIsFalse = false;
46406 }
46407 // Quit if TVal is not the constant opposite of FVal.
46408 if (FValIsFalse && TVal->getZExtValue() != 1)
46409 return SDValue();
46410 if (!FValIsFalse && TVal->getZExtValue() != 0)
46411 return SDValue();
46412 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46413 if (needOppositeCond)
46414 CC = X86::GetOppositeBranchCondition(CC);
46415 return SetCC.getOperand(3);
46416 }
46417 }
46418
46419 return SDValue();
46420}
46421
46422/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46423/// Match:
46424/// (X86or (X86setcc) (X86setcc))
46425/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46426static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46427 X86::CondCode &CC1, SDValue &Flags,
46428 bool &isAnd) {
46429 if (Cond->getOpcode() == X86ISD::CMP) {
46430 if (!isNullConstant(Cond->getOperand(1)))
46431 return false;
46432
46433 Cond = Cond->getOperand(0);
46434 }
46435
46436 isAnd = false;
46437
46438 SDValue SetCC0, SetCC1;
46439 switch (Cond->getOpcode()) {
46440 default: return false;
46441 case ISD::AND:
46442 case X86ISD::AND:
46443 isAnd = true;
46444 [[fallthrough]];
46445 case ISD::OR:
46446 case X86ISD::OR:
46447 SetCC0 = Cond->getOperand(0);
46448 SetCC1 = Cond->getOperand(1);
46449 break;
46450 };
46451
46452 // Make sure we have SETCC nodes, using the same flags value.
46453 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46454 SetCC1.getOpcode() != X86ISD::SETCC ||
46455 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46456 return false;
46457
46458 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46459 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46460 Flags = SetCC0->getOperand(1);
46461 return true;
46462}
46463
46464// When legalizing carry, we create carries via add X, -1
46465// If that comes from an actual carry, via setcc, we use the
46466// carry directly.
46467static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46468 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46469 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46470 bool FoundAndLSB = false;
46471 SDValue Carry = EFLAGS.getOperand(0);
46472 while (Carry.getOpcode() == ISD::TRUNCATE ||
46473 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46474 (Carry.getOpcode() == ISD::AND &&
46475 isOneConstant(Carry.getOperand(1)))) {
46476 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46477 Carry = Carry.getOperand(0);
46478 }
46479 if (Carry.getOpcode() == X86ISD::SETCC ||
46480 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46481 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46482 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46483 SDValue CarryOp1 = Carry.getOperand(1);
46484 if (CarryCC == X86::COND_B)
46485 return CarryOp1;
46486 if (CarryCC == X86::COND_A) {
46487 // Try to convert COND_A into COND_B in an attempt to facilitate
46488 // materializing "setb reg".
46489 //
46490 // Do not flip "e > c", where "c" is a constant, because Cmp
46491 // instruction cannot take an immediate as its first operand.
46492 //
46493 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46494 CarryOp1.getNode()->hasOneUse() &&
46495 CarryOp1.getValueType().isInteger() &&
46496 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46497 SDValue SubCommute =
46498 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46499 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46500 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46501 }
46502 }
46503 // If this is a check of the z flag of an add with 1, switch to the
46504 // C flag.
46505 if (CarryCC == X86::COND_E &&
46506 CarryOp1.getOpcode() == X86ISD::ADD &&
46507 isOneConstant(CarryOp1.getOperand(1)))
46508 return CarryOp1;
46509 } else if (FoundAndLSB) {
46510 SDLoc DL(Carry);
46511 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46512 if (Carry.getOpcode() == ISD::SRL) {
46513 BitNo = Carry.getOperand(1);
46514 Carry = Carry.getOperand(0);
46515 }
46516 return getBT(Carry, BitNo, DL, DAG);
46517 }
46518 }
46519 }
46520
46521 return SDValue();
46522}
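// Illustrative aside (not from the original file): why "add X, -1" exposes a carry.
// Adding the all-ones value produces a carry-out exactly when X != 0, and when X is
// a setcc result (0 or 1) the carry after the ADD equals that boolean, so a later
// COND_B user can consume the original carry producer directly.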
46523
46524 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
46525/// to avoid the inversion.
46526static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46527 SelectionDAG &DAG,
46528 const X86Subtarget &Subtarget) {
46529 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46530 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46531 EFLAGS.getOpcode() != X86ISD::TESTP)
46532 return SDValue();
46533
46534 // PTEST/TESTP sets EFLAGS as:
46535 // TESTZ: ZF = (Op0 & Op1) == 0
46536 // TESTC: CF = (~Op0 & Op1) == 0
46537 // TESTNZC: ZF == 0 && CF == 0
46538 EVT VT = EFLAGS.getValueType();
46539 SDValue Op0 = EFLAGS.getOperand(0);
46540 SDValue Op1 = EFLAGS.getOperand(1);
46541 EVT OpVT = Op0.getValueType();
46542
46543 // TEST*(~X,Y) == TEST*(X,Y)
46544 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46545 X86::CondCode InvCC;
46546 switch (CC) {
46547 case X86::COND_B:
46548 // testc -> testz.
46549 InvCC = X86::COND_E;
46550 break;
46551 case X86::COND_AE:
46552 // !testc -> !testz.
46553 InvCC = X86::COND_NE;
46554 break;
46555 case X86::COND_E:
46556 // testz -> testc.
46557 InvCC = X86::COND_B;
46558 break;
46559 case X86::COND_NE:
46560 // !testz -> !testc.
46561 InvCC = X86::COND_AE;
46562 break;
46563 case X86::COND_A:
46564 case X86::COND_BE:
46565 // testnzc -> testnzc (no change).
46566 InvCC = CC;
46567 break;
46568 default:
46569 InvCC = X86::COND_INVALID;
46570 break;
46571 }
46572
46573 if (InvCC != X86::COND_INVALID) {
46574 CC = InvCC;
46575 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46576 DAG.getBitcast(OpVT, NotOp0), Op1);
46577 }
46578 }
46579
46580 if (CC == X86::COND_E || CC == X86::COND_NE) {
46581 // TESTZ(X,~Y) == TESTC(Y,X)
46582 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46583 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46584 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46585 DAG.getBitcast(OpVT, NotOp1), Op0);
46586 }
46587
46588 if (Op0 == Op1) {
46589 SDValue BC = peekThroughBitcasts(Op0);
46590 EVT BCVT = BC.getValueType();
46591       assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
46592              "Unexpected vector type");
46593
46594 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46595 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46596 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46597 DAG.getBitcast(OpVT, BC.getOperand(0)),
46598 DAG.getBitcast(OpVT, BC.getOperand(1)));
46599 }
46600
46601 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46602 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46603 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46604 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46605 DAG.getBitcast(OpVT, BC.getOperand(0)),
46606 DAG.getBitcast(OpVT, BC.getOperand(1)));
46607 }
46608
46609 // If every element is an all-sign value, see if we can use MOVMSK to
46610 // more efficiently extract the sign bits and compare that.
46611 // TODO: Handle TESTC with comparison inversion.
46612 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46613 // MOVMSK combines to make sure it's never worse than PTEST?
46614 unsigned EltBits = BCVT.getScalarSizeInBits();
46615 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46616       assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46617 APInt SignMask = APInt::getSignMask(EltBits);
46618 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46619 if (SDValue Res =
46620 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46621 // For vXi16 cases we need to use pmovmskb and extract every other
46622 // sign bit.
46623 SDLoc DL(EFLAGS);
46624 if (EltBits == 16) {
46625 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46626 Res = DAG.getBitcast(MovmskVT, Res);
46627 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46628 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46629 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46630 } else {
46631 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46632 }
46633 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46634 DAG.getConstant(0, DL, MVT::i32));
46635 }
46636 }
46637 }
46638
46639 // TESTZ(-1,X) == TESTZ(X,X)
46640 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46641 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46642
46643 // TESTZ(X,-1) == TESTZ(X,X)
46644 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46645 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46646
46647 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46648 // TODO: Add COND_NE handling?
46649 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46650 SDValue Src0 = peekThroughBitcasts(Op0);
46651 SDValue Src1 = peekThroughBitcasts(Op1);
46652 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46653 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46654 peekThroughBitcasts(Src0.getOperand(1)), true);
46655 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46656 peekThroughBitcasts(Src1.getOperand(1)), true);
46657 if (Src0 && Src1)
46658 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46659 DAG.getBitcast(MVT::v4i64, Src0),
46660 DAG.getBitcast(MVT::v4i64, Src1));
46661 }
46662 }
46663 }
46664
46665 return SDValue();
46666}
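// Illustrative aside (not from the original file): PTEST defines
//   ZF = ((Op0 & Op1) == 0)  and  CF = ((~Op0 & Op1) == 0),
// so ptest(~X, Y) computes ZF = ((~X & Y) == 0), which is the CF of ptest(X, Y).
// E.g. X = 0b1100, Y = 0b0100: ptest(X, Y) has ZF=0, CF=1; ptest(~X, Y) has ZF=1.
// That is why an E/NE (ZF) test on the inverted operand becomes a B/AE (CF) test above.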
46667
46668// Attempt to simplify the MOVMSK input based on the comparison type.
46669static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46670 SelectionDAG &DAG,
46671 const X86Subtarget &Subtarget) {
46672 // Handle eq/ne against zero (any_of).
46673 // Handle eq/ne against -1 (all_of).
46674 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46675 return SDValue();
46676 if (EFLAGS.getValueType() != MVT::i32)
46677 return SDValue();
46678 unsigned CmpOpcode = EFLAGS.getOpcode();
46679 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46680 return SDValue();
46681 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46682 if (!CmpConstant)
46683 return SDValue();
46684 const APInt &CmpVal = CmpConstant->getAPIntValue();
46685
46686 SDValue CmpOp = EFLAGS.getOperand(0);
46687 unsigned CmpBits = CmpOp.getValueSizeInBits();
46688   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46689
46690 // Peek through any truncate.
46691 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46692 CmpOp = CmpOp.getOperand(0);
46693
46694 // Bail if we don't find a MOVMSK.
46695 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46696 return SDValue();
46697
46698 SDValue Vec = CmpOp.getOperand(0);
46699 MVT VecVT = Vec.getSimpleValueType();
46700   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46701          "Unexpected MOVMSK operand");
46702 unsigned NumElts = VecVT.getVectorNumElements();
46703 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46704
46705 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46706 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46707 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46708 if (!IsAnyOf && !IsAllOf)
46709 return SDValue();
46710
46711 // TODO: Check more combining cases.
46712 // Here we check the number of uses of the cmp to decide whether to combine.
46713 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
46714 // combines are known to benefit from this one-use constraint.
46715 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46716
46717 // See if we can peek through to a vector with a wider element type, if the
46718 // signbits extend down to all the sub-elements as well.
46719 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46720 // potential SimplifyDemandedBits/Elts cases.
46721 // If we looked through a truncate that discards bits, we can't do this
46722 // transform.
46723 // FIXME: We could do this transform for truncates that discarded bits by
46724 // inserting an AND mask between the new MOVMSK and the CMP.
46725 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46726 SDValue BC = peekThroughBitcasts(Vec);
46727 MVT BCVT = BC.getSimpleValueType();
46728 unsigned BCNumElts = BCVT.getVectorNumElements();
46729 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46730 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46731 BCNumEltBits > NumEltBits &&
46732 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46733 SDLoc DL(EFLAGS);
46734 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46735 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46736 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46737 DAG.getConstant(CmpMask, DL, MVT::i32));
46738 }
46739 }
46740
46741 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46742 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46743 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46744 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
46745 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46746 SmallVector<SDValue> Ops;
46747 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46748 Ops.size() == 2) {
46749 SDLoc DL(EFLAGS);
46750 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46751 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46752 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46753 DAG.getBitcast(SubVT, Ops[0]),
46754 DAG.getBitcast(SubVT, Ops[1]));
46755 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46756 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46757 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46758 DAG.getConstant(CmpMask, DL, MVT::i32));
46759 }
46760 }
46761
46762 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46763 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46764 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
46765 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
46766 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46767 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46768 SDValue BC = peekThroughBitcasts(Vec);
46769 // Ensure MOVMSK was testing every signbit of BC.
46770 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46771 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46772 SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
46773 BC.getOperand(0), BC.getOperand(1));
46774 V = DAG.getBitcast(TestVT, V);
46775 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46776 }
46777 // Check for 256-bit split vector cases.
46778 if (BC.getOpcode() == ISD::AND &&
46779 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46780 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46781 SDValue LHS = BC.getOperand(0);
46782 SDValue RHS = BC.getOperand(1);
46783 LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
46784 LHS.getOperand(0), LHS.getOperand(1));
46785 RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
46786 RHS.getOperand(0), RHS.getOperand(1));
46787 LHS = DAG.getBitcast(TestVT, LHS);
46788 RHS = DAG.getBitcast(TestVT, RHS);
46789 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46790 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46791 }
46792 }
46793 }
46794
46795 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46796 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46797 // sign bits prior to the comparison with zero unless we know that
46798 // the vXi16 splats the sign bit down to the lower i8 half.
46799 // TODO: Handle all_of patterns.
46800 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46801 SDValue VecOp0 = Vec.getOperand(0);
46802 SDValue VecOp1 = Vec.getOperand(1);
46803 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46804 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46805 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
46806 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46807 SDLoc DL(EFLAGS);
46808 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46809 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46810 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46811 if (!SignExt0) {
46812 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46813 DAG.getConstant(0xAAAA, DL, MVT::i16));
46814 }
46815 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46816 DAG.getConstant(0, DL, MVT::i16));
46817 }
46818 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46819 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46820 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46821 (IsAnyOf || (SignExt0 && SignExt1))) {
46822 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46823 SDLoc DL(EFLAGS);
46824 SDValue Result = peekThroughBitcasts(Src);
46825 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46826 Result.getValueType().getVectorNumElements() <= NumElts) {
46827 SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
46828 Result.getOperand(0), Result.getOperand(1));
46829 V = DAG.getBitcast(MVT::v4i64, V);
46830 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46831 }
46832 Result = DAG.getBitcast(MVT::v32i8, Result);
46833 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46834 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46835 if (!SignExt0 || !SignExt1) {
46836           assert(IsAnyOf &&
46837                  "Only perform v16i16 signmasks for any_of patterns");
46838 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46839 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46840 }
46841 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46842 DAG.getConstant(CmpMask, DL, MVT::i32));
46843 }
46844 }
46845 }
46846
46847 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46848 SmallVector<int, 32> ShuffleMask;
46849 SmallVector<SDValue, 2> ShuffleInputs;
46850 if (NumElts <= CmpBits &&
46851 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
46852 ShuffleMask, DAG) &&
46853 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
46854 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
46855 unsigned NumShuffleElts = ShuffleMask.size();
46856 APInt DemandedElts = APInt::getZero(NumShuffleElts);
46857 for (int M : ShuffleMask) {
46858       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
46859 DemandedElts.setBit(M);
46860 }
46861 if (DemandedElts.isAllOnes()) {
46862 SDLoc DL(EFLAGS);
46863 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
46864 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46865 Result =
46866 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
46867 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46868 EFLAGS.getOperand(1));
46869 }
46870 }
46871
46872 return SDValue();
46873}
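// Illustrative aside (not from the original file): the all_of cases above rely on
//   MOVMSK(PCMPEQ(X, Y)) == -1   <=>   X == Y in every lane
//                                <=>   (X - Y) is the zero vector
//                                <=>   PTEST(X - Y, X - Y) sets ZF,
// so a "compare-mask-then-compare-against-all-ones" sequence becomes one PTEST.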
46874
46875/// Optimize an EFLAGS definition used according to the condition code \p CC
46876/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
46877/// uses of chain values.
46878static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
46879 SelectionDAG &DAG,
46880 const X86Subtarget &Subtarget) {
46881 if (CC == X86::COND_B)
46882 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
46883 return Flags;
46884
46885 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
46886 return R;
46887
46888 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
46889 return R;
46890
46891 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
46892 return R;
46893
46894 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
46895}
46896
46897/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
46898static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
46899 TargetLowering::DAGCombinerInfo &DCI,
46900 const X86Subtarget &Subtarget) {
46901 SDLoc DL(N);
46902
46903 SDValue FalseOp = N->getOperand(0);
46904 SDValue TrueOp = N->getOperand(1);
46905 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
46906 SDValue Cond = N->getOperand(3);
46907
46908 // cmov X, X, ?, ? --> X
46909 if (TrueOp == FalseOp)
46910 return TrueOp;
46911
46912 // Try to simplify the EFLAGS and condition code operands.
46913 // We can't always do this as FCMOV only supports a subset of X86 cond.
46914 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
46915 if (!(FalseOp.getValueType() == MVT::f80 ||
46916 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
46917 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
46918 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
46919 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
46920 Flags};
46921 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46922 }
46923 }
46924
46925 // If this is a select between two integer constants, try to do some
46926 // optimizations. Note that the operands are ordered the opposite of SELECT
46927 // operands.
46928 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
46929 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
46930 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
46931 // larger than FalseC (the false value).
46932 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
46933 CC = X86::GetOppositeBranchCondition(CC);
46934 std::swap(TrueC, FalseC);
46935 std::swap(TrueOp, FalseOp);
46936 }
46937
46938 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
46939 // This is efficient for any integer data type (including i8/i16) and
46940 // shift amount.
46941 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
46942 Cond = getSETCC(CC, Cond, DL, DAG);
46943
46944 // Zero extend the condition if needed.
46945 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
46946
46947 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
46948 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
46949 DAG.getConstant(ShAmt, DL, MVT::i8));
46950 return Cond;
46951 }
46952
46953 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
46954 // for any integer data type, including i8/i16.
46955 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
46956 Cond = getSETCC(CC, Cond, DL, DAG);
46957
46958 // Zero extend the condition if needed.
46959 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
46960 FalseC->getValueType(0), Cond);
46961 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46962 SDValue(FalseC, 0));
46963 return Cond;
46964 }
46965
46966 // Optimize cases that will turn into an LEA instruction. This requires
46967 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
46968 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
46969 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
46970       assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
46971              "Implicit constant truncation");
46972
46973 bool isFastMultiplier = false;
46974 if (Diff.ult(10)) {
46975 switch (Diff.getZExtValue()) {
46976 default: break;
46977 case 1: // result = add base, cond
46978 case 2: // result = lea base( , cond*2)
46979 case 3: // result = lea base(cond, cond*2)
46980 case 4: // result = lea base( , cond*4)
46981 case 5: // result = lea base(cond, cond*4)
46982 case 8: // result = lea base( , cond*8)
46983 case 9: // result = lea base(cond, cond*8)
46984 isFastMultiplier = true;
46985 break;
46986 }
46987 }
46988
46989 if (isFastMultiplier) {
46990 Cond = getSETCC(CC, Cond, DL, DAG);
46991 // Zero extend the condition if needed.
46992 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46993 Cond);
46994 // Scale the condition by the difference.
46995 if (Diff != 1)
46996 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46997 DAG.getConstant(Diff, DL, Cond.getValueType()));
46998
46999 // Add the base if non-zero.
47000 if (FalseC->getAPIntValue() != 0)
47001 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47002 SDValue(FalseC, 0));
47003 return Cond;
47004 }
47005 }
47006 }
47007 }
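// Illustrative aside (not from the original file): for the LEA path above, e.g.
// "cond ? 5 : 2" has Diff = 3, so the cmov can become
//   setcc + zext            ; cond in {0,1}
//   lea (%rax,%rax,2), %rax ; 3*cond
//   add $2, %rax            ; plus the false value
// giving 2 when cond == 0 and 5 when cond == 1, with no conditional move.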
47008
47009 // Handle these cases:
47010 // (select (x != c), e, c) -> (select (x != c), e, x)
47011 // (select (x == c), c, e) -> (select (x == c), x, e)
47012 // where the c is an integer constant, and the "select" is the combination
47013 // of CMOV and CMP.
47014 //
47015 // The rationale for this change is that the conditional-move from a constant
47016 // needs two instructions, however, conditional-move from a register needs
47017 // only one instruction.
47018 //
47019 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47020 // some instruction-combining opportunities. This opt needs to be
47021 // postponed as late as possible.
47022 //
47023 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47024 // the DCI.xxxx conditions are provided to postpone the optimization as
47025 // late as possible.
47026
47027 ConstantSDNode *CmpAgainst = nullptr;
47028 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47029 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47030 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47031
47032 if (CC == X86::COND_NE &&
47033 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47034 CC = X86::GetOppositeBranchCondition(CC);
47035 std::swap(TrueOp, FalseOp);
47036 }
47037
47038 if (CC == X86::COND_E &&
47039 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47040 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47041 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47042 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47043 }
47044 }
47045 }
47046
47047 // Fold and/or of setcc's to double CMOV:
47048 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47049 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47050 //
47051 // This combine lets us generate:
47052 // cmovcc1 (jcc1 if we don't have CMOV)
47053 // cmovcc2 (same)
47054 // instead of:
47055 // setcc1
47056 // setcc2
47057 // and/or
47058 // cmovne (jne if we don't have CMOV)
47059 // When we can't use the CMOV instruction, it might increase branch
47060 // mispredicts.
47061 // When we can use CMOV, or when there is no mispredict, this improves
47062 // throughput and reduces register pressure.
47063 //
47064 if (CC == X86::COND_NE) {
47065 SDValue Flags;
47066 X86::CondCode CC0, CC1;
47067 bool isAndSetCC;
47068 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47069 if (isAndSetCC) {
47070 std::swap(FalseOp, TrueOp);
47071 CC0 = X86::GetOppositeBranchCondition(CC0);
47072 CC1 = X86::GetOppositeBranchCondition(CC1);
47073 }
47074
47075 SDValue LOps[] = {FalseOp, TrueOp,
47076 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47077 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47078 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47079 Flags};
47080 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47081 return CMOV;
47082 }
47083 }
47084
47085 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47086 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47087 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47088 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47089 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47090 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47091 SDValue Add = TrueOp;
47092 SDValue Const = FalseOp;
47093 // Canonicalize the condition code for easier matching and output.
47094 if (CC == X86::COND_E)
47095 std::swap(Add, Const);
47096
47097 // We might have replaced the constant in the cmov with the LHS of the
47098 // compare. If so change it to the RHS of the compare.
47099 if (Const == Cond.getOperand(0))
47100 Const = Cond.getOperand(1);
47101
47102 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47103 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47104 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47105 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47106 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47107 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47108 EVT VT = N->getValueType(0);
47109 // This should constant fold.
47110 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47111 SDValue CMov =
47112 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47113 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47114 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47115 }
47116 }
47117
47118 return SDValue();
47119}
47120
47121/// Different mul shrinking modes.
47122enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47123
47124static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47125 EVT VT = N->getOperand(0).getValueType();
47126 if (VT.getScalarSizeInBits() != 32)
47127 return false;
47128
47129   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47130 unsigned SignBits[2] = {1, 1};
47131 bool IsPositive[2] = {false, false};
47132 for (unsigned i = 0; i < 2; i++) {
47133 SDValue Opd = N->getOperand(i);
47134
47135 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47136 IsPositive[i] = DAG.SignBitIsZero(Opd);
47137 }
47138
47139 bool AllPositive = IsPositive[0] && IsPositive[1];
47140 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47141 // When ranges are from -128 ~ 127, use MULS8 mode.
47142 if (MinSignBits >= 25)
47143 Mode = ShrinkMode::MULS8;
47144 // When ranges are from 0 ~ 255, use MULU8 mode.
47145 else if (AllPositive && MinSignBits >= 24)
47146 Mode = ShrinkMode::MULU8;
47147 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47148 else if (MinSignBits >= 17)
47149 Mode = ShrinkMode::MULS16;
47150 // When ranges are from 0 ~ 65535, use MULU16 mode.
47151 else if (AllPositive && MinSignBits >= 16)
47152 Mode = ShrinkMode::MULU16;
47153 else
47154 return false;
47155 return true;
47156}
47157
47158/// When the operands of vector mul are extended from smaller size values,
47159 /// like i8 and i16, the type of mul may be shrunk to generate more
47160/// efficient code. Two typical patterns are handled:
47161/// Pattern1:
47162/// %2 = sext/zext <N x i8> %1 to <N x i32>
47163/// %4 = sext/zext <N x i8> %3 to <N x i32>
47164 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47165/// %5 = mul <N x i32> %2, %4
47166///
47167/// Pattern2:
47168/// %2 = zext/sext <N x i16> %1 to <N x i32>
47169/// %4 = zext/sext <N x i16> %3 to <N x i32>
47170/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47171/// %5 = mul <N x i32> %2, %4
47172///
47173/// There are four mul shrinking modes:
47174/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47175 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47176/// generate pmullw+sext32 for it (MULS8 mode).
47177/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47178/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47179/// generate pmullw+zext32 for it (MULU8 mode).
47180/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47181/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47182/// generate pmullw+pmulhw for it (MULS16 mode).
47183/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47184/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47185/// generate pmullw+pmulhuw for it (MULU16 mode).
47186static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
47187 const X86Subtarget &Subtarget) {
47188 // Check for legality
47189 // pmullw/pmulhw require SSE2; they are not available with plain SSE.
47190 if (!Subtarget.hasSSE2())
47191 return SDValue();
47192
47193 // Check for profitability
47194 // pmulld is supported since SSE41. It is better to use pmulld
47195 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47196 // the expansion.
47197 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47198 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47199 return SDValue();
47200
47201 ShrinkMode Mode;
47202 if (!canReduceVMulWidth(N, DAG, Mode))
47203 return SDValue();
47204
47205 SDLoc DL(N);
47206 SDValue N0 = N->getOperand(0);
47207 SDValue N1 = N->getOperand(1);
47208 EVT VT = N->getOperand(0).getValueType();
47209 unsigned NumElts = VT.getVectorNumElements();
47210 if ((NumElts % 2) != 0)
47211 return SDValue();
47212
47213 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47214
47215 // Shrink the operands of mul.
47216 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47217 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47218
47219 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47220 // lower part is needed.
47221 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
47222 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
47223 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
47224 : ISD::SIGN_EXTEND,
47225 DL, VT, MulLo);
47226
47227 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
47228 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
47229 // the higher part is also needed.
47230 SDValue MulHi =
47231 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
47232 ReducedVT, NewN0, NewN1);
47233
47234 // Repack the lower part and higher part result of mul into a wider
47235 // result.
47236 // Generate shuffle functioning as punpcklwd.
47237 SmallVector<int, 16> ShuffleMask(NumElts);
47238 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47239 ShuffleMask[2 * i] = i;
47240 ShuffleMask[2 * i + 1] = i + NumElts;
47241 }
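  // For example, with NumElts == 8 this builds the interleave mask
  // <0,8,1,9,2,10,3,11>, pairing each low 16-bit product with its high half so
  // that the bitcast to v4i32 reassembles the full 32-bit products.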
47242 SDValue ResLo =
47243 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47244 ResLo = DAG.getBitcast(ResVT, ResLo);
47245 // Generate shuffle functioning as punpckhwd.
47246 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47247 ShuffleMask[2 * i] = i + NumElts / 2;
47248 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
47249 }
47250 SDValue ResHi =
47251 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47252 ResHi = DAG.getBitcast(ResVT, ResHi);
47253 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
47254}
47255
47256static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
47257 EVT VT, const SDLoc &DL) {
47258
47259 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
47260 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47261 DAG.getConstant(Mult, DL, VT));
47262 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
47263 DAG.getConstant(Shift, DL, MVT::i8));
47264 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47265 N->getOperand(0));
47266 return Result;
47267 };
47268
47269 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
47270 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47271 DAG.getConstant(Mul1, DL, VT));
47272 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
47273 DAG.getConstant(Mul2, DL, VT));
47274 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47275 N->getOperand(0));
47276 return Result;
47277 };
47278
47279 switch (MulAmt) {
47280 default:
47281 break;
47282 case 11:
47283 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47284 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47285 case 21:
47286 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47287 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47288 case 41:
47289 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47290 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47291 case 22:
47292 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47293 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47294 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47295 case 19:
47296 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47297 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47298 case 37:
47299 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47300 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47301 case 73:
47302 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47303 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47304 case 13:
47305 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47306 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47307 case 23:
47308 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47309 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47310 case 26:
47311 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47312 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47313 case 28:
47314 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47315 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47316 case 29:
47317 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47318 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47319 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47320 }
47321
47322  // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
47323  // followed by a single LEA.
47324  // First check if this is a sum of two powers of 2 because that's easy.
47325  // Then count the trailing zeros to get the smaller scale shift.
47326 // TODO: We can do this even without LEA at a cost of two shifts and an add.
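  // For example (illustrative value), MulAmt == 34 == 32 + 2: ShiftAmt == 5 and
  // ScaleShift == 1, so we emit (add (shl x, 5), (shl x, 1)), where the second
  // shift and the add can be folded into a single scaled-index LEA.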
47327 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47328 unsigned ScaleShift = countTrailingZeros(MulAmt);
47329 if (ScaleShift >= 1 && ScaleShift < 4) {
47330 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47331 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47332 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47333 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47334 DAG.getConstant(ScaleShift, DL, MVT::i8));
47335 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47336 }
47337 }
47338
47339 return SDValue();
47340}
47341
47342// If the upper 17 bits of either element are zero and the upper 17 bits of
47343// the other element are all zero or sign bits, then we can use PMADDWD, which
47344// is always at least as quick as PMULLD, except on KNL.
47345static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
47346 const X86Subtarget &Subtarget) {
47347 if (!Subtarget.hasSSE2())
47348 return SDValue();
47349
47350 if (Subtarget.isPMADDWDSlow())
47351 return SDValue();
47352
47353 EVT VT = N->getValueType(0);
47354
47355 // Only support vXi32 vectors.
47356 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47357 return SDValue();
47358
47359 // Make sure the type is legal or can split/widen to a legal type.
47361 unsigned NumElts = VT.getVectorNumElements();
47362 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47363 return SDValue();
47364
47365 // With AVX512 but without BWI, we would need to split v32i16.
47366 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47367 return SDValue();
47368
47369 SDValue N0 = N->getOperand(0);
47370 SDValue N1 = N->getOperand(1);
47371
47372  // If we are zero/sign extending in two steps without SSE4.1, it's better to
47373 // reduce the vmul width instead.
47374 if (!Subtarget.hasSSE41() &&
47375 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47376 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47377 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47378 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47379 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47380 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47381 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47382 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47383 return SDValue();
47384
47385  // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47386 // the vmul width instead.
47387 if (!Subtarget.hasSSE41() &&
47388 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47389 N0.getOperand(0).getValueSizeInBits() > 128) &&
47390 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47391 N1.getOperand(0).getValueSizeInBits() > 128))
47392 return SDValue();
47393
47394 // Sign bits must extend down to the lowest i16.
47395 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47396 DAG.ComputeMaxSignificantBits(N0) > 16)
47397 return SDValue();
47398
47399 // At least one of the elements must be zero in the upper 17 bits, or can be
47400 // safely made zero without altering the final result.
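  // For example, a sign-extended vXi16 operand can be rewritten as a zero
  // extend: PMADDWD only reads the low 16 bits of each 32-bit lane as a signed
  // value, and the zeroed upper 16-bit lanes then contribute nothing to the sum.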
47401 auto GetZeroableOp = [&](SDValue Op) {
47402 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47403 if (DAG.MaskedValueIsZero(Op, Mask17))
47404 return Op;
47405 // Mask off upper 16-bits of sign-extended constants.
47406 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47407 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
47408 DAG.getConstant(0xFFFF, SDLoc(N), VT));
47409 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47410 SDValue Src = Op.getOperand(0);
47411 // Convert sext(vXi16) to zext(vXi16).
47412 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47413 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47414 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47415 // which will expand the extension.
47416 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47417 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47418 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
47419 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47420 }
47421 }
47422    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
47423 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47424 N->isOnlyUserOf(Op.getNode())) {
47425 SDValue Src = Op.getOperand(0);
47426 if (Src.getScalarValueSizeInBits() == 16)
47427 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
47428 }
47429 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47430 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47431 N->isOnlyUserOf(Op.getNode())) {
47432 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
47433 Op.getOperand(1));
47434 }
47435 return SDValue();
47436 };
47437 SDValue ZeroN0 = GetZeroableOp(N0);
47438 SDValue ZeroN1 = GetZeroableOp(N1);
47439 if (!ZeroN0 && !ZeroN1)
47440 return SDValue();
47441 N0 = ZeroN0 ? ZeroN0 : N0;
47442 N1 = ZeroN1 ? ZeroN1 : N1;
47443
47444 // Use SplitOpsAndApply to handle AVX splitting.
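  // e.g. a 256-bit v8i32 multiply on AVX1, where 256-bit integer ops are not
  // available, is applied as two 128-bit VPMADDWDs on the split halves.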
47445 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47446 ArrayRef<SDValue> Ops) {
47447 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47448 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47449 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47450 DAG.getBitcast(OpVT, Ops[0]),
47451 DAG.getBitcast(OpVT, Ops[1]));
47452 };
47453 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
47454 PMADDWDBuilder);
47455}
47456
47457static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
47458 const X86Subtarget &Subtarget) {
47459 if (!Subtarget.hasSSE2())
47460 return SDValue();
47461
47462 EVT VT = N->getValueType(0);
47463
47464 // Only support vXi64 vectors.
47465 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47466 VT.getVectorNumElements() < 2 ||
47467 !isPowerOf2_32(VT.getVectorNumElements()))
47468 return SDValue();
47469
47470 SDValue N0 = N->getOperand(0);
47471 SDValue N1 = N->getOperand(1);
47472
47473  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47474  // 32-bits. We can lower with this if the sign bits stretch that far.
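  // For example, (mul (sext v2i32 X to v2i64), (sext v2i32 Y to v2i64)) has
  // more than 32 sign bits in each operand, so PMULDQ on the 32-bit halves
  // produces the same 64-bit products.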
47475 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47476 DAG.ComputeNumSignBits(N1) > 32) {
47477 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47478 ArrayRef<SDValue> Ops) {
47479 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47480 };
47481 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47482 PMULDQBuilder, /*CheckBWI*/false);
47483 }
47484
47485 // If the upper bits are zero we can use a single pmuludq.
47486 APInt Mask = APInt::getHighBitsSet(64, 32);
47487 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47488 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47489 ArrayRef<SDValue> Ops) {
47490 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47491 };
47492 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47493 PMULUDQBuilder, /*CheckBWI*/false);
47494 }
47495
47496 return SDValue();
47497}
47498
47499static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47500 TargetLowering::DAGCombinerInfo &DCI,
47501 const X86Subtarget &Subtarget) {
47502 EVT VT = N->getValueType(0);
47503
47504 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
47505 return V;
47506
47507 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
47508 return V;
47509
47510 if (DCI.isBeforeLegalize() && VT.isVector())
47511 return reduceVMULWidth(N, DAG, Subtarget);
47512
47513 // Optimize a single multiply with constant into two operations in order to
47514 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47515 if (!MulConstantOptimization)
47516 return SDValue();
47517
47518 // An imul is usually smaller than the alternative sequence.
47519 if (DAG.getMachineFunction().getFunction().hasMinSize())
47520 return SDValue();
47521
47522 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47523 return SDValue();
47524
47525 if (VT != MVT::i64 && VT != MVT::i32)
47526 return SDValue();
47527
47528 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
47529 if (!C)
47530 return SDValue();
47531 if (isPowerOf2_64(C->getZExtValue()))
47532 return SDValue();
47533
47534 int64_t SignMulAmt = C->getSExtValue();
47535  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47536 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47537
47538 SDLoc DL(N);
47539 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47540 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47541 DAG.getConstant(AbsMulAmt, DL, VT));
47542 if (SignMulAmt < 0)
47543 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
47544 NewMul);
47545
47546 return NewMul;
47547 }
47548
47549 uint64_t MulAmt1 = 0;
47550 uint64_t MulAmt2 = 0;
47551 if ((AbsMulAmt % 9) == 0) {
47552 MulAmt1 = 9;
47553 MulAmt2 = AbsMulAmt / 9;
47554 } else if ((AbsMulAmt % 5) == 0) {
47555 MulAmt1 = 5;
47556 MulAmt2 = AbsMulAmt / 5;
47557 } else if ((AbsMulAmt % 3) == 0) {
47558 MulAmt1 = 3;
47559 MulAmt2 = AbsMulAmt / 3;
47560 }
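  // For example (illustrative amounts), AbsMulAmt == 45 decomposes as 9 * 5 and
  // becomes two LEAs, while AbsMulAmt == 40 decomposes as 5 * 8 and becomes an
  // LEA plus a shift.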
47561
47562 SDValue NewMul;
47563 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47564 if (MulAmt2 &&
47565 (isPowerOf2_64(MulAmt2) ||
47566 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47567
47568 if (isPowerOf2_64(MulAmt2) &&
47569 !(SignMulAmt >= 0 && N->hasOneUse() &&
47570 N->use_begin()->getOpcode() == ISD::ADD))
47571      // If the second multiplier is pow2, issue it first. We want the multiply by
47572 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
47573 // is an add. Only do this for positive multiply amounts since the
47574 // negate would prevent it from being used as an address mode anyway.
47575 std::swap(MulAmt1, MulAmt2);
47576
47577 if (isPowerOf2_64(MulAmt1))
47578 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47579 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47580 else
47581 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47582 DAG.getConstant(MulAmt1, DL, VT));
47583
47584 if (isPowerOf2_64(MulAmt2))
47585 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47586 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47587 else
47588 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47589 DAG.getConstant(MulAmt2, DL, VT));
47590
47591 // Negate the result.
47592 if (SignMulAmt < 0)
47593 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
47594 NewMul);
47595 } else if (!Subtarget.slowLEA())
47596 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47597
47598 if (!NewMul) {
47599    assert(C->getZExtValue() != 0 &&
47600           C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
47601           "Both cases that could cause potential overflows should have "
47602           "already been handled.");
47603 if (isPowerOf2_64(AbsMulAmt - 1)) {
47604 // (mul x, 2^N + 1) => (add (shl x, N), x)
47605 NewMul = DAG.getNode(
47606 ISD::ADD, DL, VT, N->getOperand(0),
47607 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47608 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
47609 MVT::i8)));
47610 // To negate, subtract the number from zero
47611 if (SignMulAmt < 0)
47612 NewMul = DAG.getNode(ISD::SUB, DL, VT,
47613 DAG.getConstant(0, DL, VT), NewMul);
47614 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47615 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47616 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47617 DAG.getConstant(Log2_64(AbsMulAmt + 1),
47618 DL, MVT::i8));
47619 // To negate, reverse the operands of the subtract.
47620 if (SignMulAmt < 0)
47621 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47622 else
47623 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47624 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
47625 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47626 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47627 DAG.getConstant(Log2_64(AbsMulAmt - 2),
47628 DL, MVT::i8));
47629 NewMul = DAG.getNode(
47630 ISD::ADD, DL, VT, NewMul,
47631 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47632 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
47633 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47634 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47635 DAG.getConstant(Log2_64(AbsMulAmt + 2),
47636 DL, MVT::i8));
47637 NewMul = DAG.getNode(
47638 ISD::SUB, DL, VT, NewMul,
47639 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47640 }
47641 }
47642
47643 return NewMul;
47644}
47645
47646// Try to form a MULHU or MULHS node by looking for
47647// (srl (mul ext, ext), 16)
47648// TODO: This is X86 specific because we want to be able to handle wide types
47649// before type legalization. But we can only do it if the vector will be
47650// legalized via widening/splitting. Type legalization can't handle promotion
47651// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47652// combiner.
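// For example, (srl (mul (zext v8i16 X to v8i32), (zext v8i16 Y to v8i32)),
// splat 16) becomes (zext (mulhu X, Y) to v8i32).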
47653static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47654 const X86Subtarget &Subtarget) {
47655  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47656         "SRL or SRA node is required here!");
47657 SDLoc DL(N);
47658
47659 if (!Subtarget.hasSSE2())
47660 return SDValue();
47661
47662 // The operation feeding into the shift must be a multiply.
47663 SDValue ShiftOperand = N->getOperand(0);
47664 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47665 return SDValue();
47666
47667 // Input type should be at least vXi32.
47668 EVT VT = N->getValueType(0);
47669 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47670 return SDValue();
47671
47672 // Need a shift by 16.
47673 APInt ShiftAmt;
47674 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47675 ShiftAmt != 16)
47676 return SDValue();
47677
47678 SDValue LHS = ShiftOperand.getOperand(0);
47679 SDValue RHS = ShiftOperand.getOperand(1);
47680
47681 unsigned ExtOpc = LHS.getOpcode();
47682 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47683 RHS.getOpcode() != ExtOpc)
47684 return SDValue();
47685
47686 // Peek through the extends.
47687 LHS = LHS.getOperand(0);
47688 RHS = RHS.getOperand(0);
47689
47690 // Ensure the input types match.
47691 EVT MulVT = LHS.getValueType();
47692 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47693 return SDValue();
47694
47695 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47696 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47697
47698 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47699 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47700}
47701
47702static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47703 SDValue N0 = N->getOperand(0);
47704 SDValue N1 = N->getOperand(1);
47705 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47706 EVT VT = N0.getValueType();
47707
47708 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47709 // since the result of setcc_c is all zero's or all ones.
47710 if (VT.isInteger() && !VT.isVector() &&
47711 N1C && N0.getOpcode() == ISD::AND &&
47712 N0.getOperand(1).getOpcode() == ISD::Constant) {
47713 SDValue N00 = N0.getOperand(0);
47714 APInt Mask = N0.getConstantOperandAPInt(1);
47715 Mask <<= N1C->getAPIntValue();
47716 bool MaskOK = false;
47717 // We can handle cases concerning bit-widening nodes containing setcc_c if
47718 // we carefully interrogate the mask to make sure we are semantics
47719 // preserving.
47720 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47721 // of the underlying setcc_c operation if the setcc_c was zero extended.
47722 // Consider the following example:
47723 // zext(setcc_c) -> i32 0x0000FFFF
47724 // c1 -> i32 0x0000FFFF
47725 // c2 -> i32 0x00000001
47726 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47727 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47728 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47729 MaskOK = true;
47730 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47731 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47732 MaskOK = true;
47733 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47734 N00.getOpcode() == ISD::ANY_EXTEND) &&
47735 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47736 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47737 }
47738 if (MaskOK && Mask != 0) {
47739 SDLoc DL(N);
47740 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47741 }
47742 }
47743
47744 return SDValue();
47745}
47746
47747static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47748 const X86Subtarget &Subtarget) {
47749 SDValue N0 = N->getOperand(0);
47750 SDValue N1 = N->getOperand(1);
47751 EVT VT = N0.getValueType();
47752 unsigned Size = VT.getSizeInBits();
47753
47754 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47755 return V;
47756
47757  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
47758  // into (shl (sext_in_reg a), [56,48,32,24,16] - SarConst) or
47759  // into (sra (sext_in_reg a), SarConst - [56,48,32,24,16])
47760  // depending on the sign of (SarConst - [56,48,32,24,16])
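  // For example (i64, illustrative constants): (ashr (shl X, 56), 60) becomes
  // (sra (sext_in_reg X, i8), 4), while (ashr (shl X, 56), 52) becomes
  // (shl (sext_in_reg X, i8), 4).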
47761
47762  // sexts in X86 are MOVs. The MOVs have the same code size as the SHIFTs
47763  // above (only a SHIFT by 1 has a shorter encoding).
47764  // However, the MOVs have two advantages over a SHIFT:
47765  // 1. MOVs can write to a register that differs from the source.
47766  // 2. MOVs accept memory operands.
47767
47768 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47769 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47770 N0.getOperand(1).getOpcode() != ISD::Constant)
47771 return SDValue();
47772
47773 SDValue N00 = N0.getOperand(0);
47774 SDValue N01 = N0.getOperand(1);
47775 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
47776 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
47777 EVT CVT = N1.getValueType();
47778
47779 if (SarConst.isNegative())
47780 return SDValue();
47781
47782 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
47783 unsigned ShiftSize = SVT.getSizeInBits();
47784    // Skip types without a corresponding sext/zext and ShlConst values that
47785    // are not one of [56,48,32,24,16].
47786 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
47787 continue;
47788 SDLoc DL(N);
47789 SDValue NN =
47790 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
47791 SarConst = SarConst - (Size - ShiftSize);
47792 if (SarConst == 0)
47793 return NN;
47794 if (SarConst.isNegative())
47795 return DAG.getNode(ISD::SHL, DL, VT, NN,
47796 DAG.getConstant(-SarConst, DL, CVT));
47797 return DAG.getNode(ISD::SRA, DL, VT, NN,
47798 DAG.getConstant(SarConst, DL, CVT));
47799 }
47800 return SDValue();
47801}
47802
47803static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
47804 TargetLowering::DAGCombinerInfo &DCI,
47805 const X86Subtarget &Subtarget) {
47806 SDValue N0 = N->getOperand(0);
47807 SDValue N1 = N->getOperand(1);
47808 EVT VT = N0.getValueType();
47809
47810 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47811 return V;
47812
47813 // Only do this on the last DAG combine as it can interfere with other
47814 // combines.
47815 if (!DCI.isAfterLegalizeDAG())
47816 return SDValue();
47817
47818 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
47819 // TODO: This is a generic DAG combine that became an x86-only combine to
47820 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
47821 // and-not ('andn').
47822 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
47823 return SDValue();
47824
47825 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
47826 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
47827 if (!ShiftC || !AndC)
47828 return SDValue();
47829
47830 // If we can shrink the constant mask below 8-bits or 32-bits, then this
47831 // transform should reduce code size. It may also enable secondary transforms
47832 // from improved known-bits analysis or instruction selection.
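  // For example (illustrative mask), (srl (and X, 0x7F00), 8) becomes
  // (and (srl X, 8), 0x7F); the new mask fits in a sign-extended 8-bit
  // immediate, so the AND gets a shorter encoding.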
47833 APInt MaskVal = AndC->getAPIntValue();
47834
47835 // If this can be matched by a zero extend, don't optimize.
47836 if (MaskVal.isMask()) {
47837 unsigned TO = MaskVal.countTrailingOnes();
47838 if (TO >= 8 && isPowerOf2_32(TO))
47839 return SDValue();
47840 }
47841
47842 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
47843 unsigned OldMaskSize = MaskVal.getMinSignedBits();
47844 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
47845 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
47846 (OldMaskSize > 32 && NewMaskSize <= 32)) {
47847 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
47848 SDLoc DL(N);
47849 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
47850 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
47851 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
47852 }
47853 return SDValue();
47854}
47855
47856static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
47857 const X86Subtarget &Subtarget) {
47858 unsigned Opcode = N->getOpcode();
47859  assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
47860
47861 SDLoc DL(N);
47862 EVT VT = N->getValueType(0);
47863 SDValue N0 = N->getOperand(0);
47864 SDValue N1 = N->getOperand(1);
47865 EVT SrcVT = N0.getValueType();
47866
47867 SDValue BC0 =
47868 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
47869 SDValue BC1 =
47870 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
47871
47872 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
47873 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
47874 // truncation trees that help us avoid lane crossing shuffles.
47875 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
47876 // TODO: We don't handle vXf64 shuffles yet.
47877 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47878 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
47879 SmallVector<SDValue> ShuffleOps;
47880 SmallVector<int> ShuffleMask, ScaledMask;
47881 SDValue Vec = peekThroughBitcasts(BCSrc);
47882 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
47883 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
47884 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
47885 // shuffle to a v4X64 width - we can probably relax this in the future.
47886 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
47887 ShuffleOps[0].getValueType().is256BitVector() &&
47888 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
47889 SDValue Lo, Hi;
47890 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47891 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
47892 Lo = DAG.getBitcast(SrcVT, Lo);
47893 Hi = DAG.getBitcast(SrcVT, Hi);
47894 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
47895 Res = DAG.getBitcast(ShufVT, Res);
47896 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
47897 return DAG.getBitcast(VT, Res);
47898 }
47899 }
47900 }
47901 }
47902
47903 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
47904 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47905 // If either/both ops are a shuffle that can scale to v2x64,
47906 // then see if we can perform this as a v4x32 post shuffle.
47907 SmallVector<SDValue> Ops0, Ops1;
47908 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
47909 bool IsShuf0 =
47910 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47911 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47912 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47913 bool IsShuf1 =
47914 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47915 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
47916 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47917 if (IsShuf0 || IsShuf1) {
47918 if (!IsShuf0) {
47919 Ops0.assign({BC0});
47920 ScaledMask0.assign({0, 1});
47921 }
47922 if (!IsShuf1) {
47923 Ops1.assign({BC1});
47924 ScaledMask1.assign({0, 1});
47925 }
47926
47927 SDValue LHS, RHS;
47928 int PostShuffle[4] = {-1, -1, -1, -1};
47929 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47930 if (M < 0)
47931 return true;
47932 Idx = M % 2;
47933 SDValue Src = Ops[M / 2];
47934 if (!LHS || LHS == Src) {
47935 LHS = Src;
47936 return true;
47937 }
47938 if (!RHS || RHS == Src) {
47939 Idx += 2;
47940 RHS = Src;
47941 return true;
47942 }
47943 return false;
47944 };
47945 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47946 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47947 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47948 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47949 LHS = DAG.getBitcast(SrcVT, LHS);
47950 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47951 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47952 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47953 Res = DAG.getBitcast(ShufVT, Res);
47954 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47955 return DAG.getBitcast(VT, Res);
47956 }
47957 }
47958 }
47959
47960 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47961 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47962 SmallVector<int> Mask0, Mask1;
47963 SmallVector<SDValue> Ops0, Ops1;
47964 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47965 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47966 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47967 !Ops0.empty() && !Ops1.empty() &&
47968 all_of(Ops0,
47969 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47970 all_of(Ops1,
47971 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47972 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47973 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47974 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47975 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47976 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47977 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47978 if ((Op00 == Op11) && (Op01 == Op10)) {
47979 std::swap(Op10, Op11);
47980 ShuffleVectorSDNode::commuteMask(ScaledMask1);
47981 }
47982 if ((Op00 == Op10) && (Op01 == Op11)) {
47983 const int Map[4] = {0, 2, 1, 3};
47984 SmallVector<int, 4> ShuffleMask(
47985 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47986 Map[ScaledMask1[1]]});
47987 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47988 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47989 DAG.getBitcast(SrcVT, Op01));
47990 Res = DAG.getBitcast(ShufVT, Res);
47991 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47992 return DAG.getBitcast(VT, Res);
47993 }
47994 }
47995 }
47996
47997 return SDValue();
47998}
47999
48000static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48001 TargetLowering::DAGCombinerInfo &DCI,
48002 const X86Subtarget &Subtarget) {
48003 unsigned Opcode = N->getOpcode();
48004  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48005         "Unexpected pack opcode");
48006
48007 EVT VT = N->getValueType(0);
48008 SDValue N0 = N->getOperand(0);
48009 SDValue N1 = N->getOperand(1);
48010 unsigned NumDstElts = VT.getVectorNumElements();
48011 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48012 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48013  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48014         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48015         "Unexpected PACKSS/PACKUS input type");
48016
48017 bool IsSigned = (X86ISD::PACKSS == Opcode);
48018
48019 // Constant Folding.
48020 APInt UndefElts0, UndefElts1;
48021 SmallVector<APInt, 32> EltBits0, EltBits1;
48022 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48023 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48024 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48025 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48026 unsigned NumLanes = VT.getSizeInBits() / 128;
48027 unsigned NumSrcElts = NumDstElts / 2;
48028 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48029 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48030
48031 APInt Undefs(NumDstElts, 0);
48032 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48033 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48034 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48035 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48036 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48037 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48038
48039 if (UndefElts[SrcIdx]) {
48040 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48041 continue;
48042 }
48043
48044 APInt &Val = EltBits[SrcIdx];
48045 if (IsSigned) {
48046 // PACKSS: Truncate signed value with signed saturation.
48047 // Source values less than dst minint are saturated to minint.
48048 // Source values greater than dst maxint are saturated to maxint.
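          // e.g. for an i16 -> i8 PACKSS, 300 saturates to 127, -300 saturates
          // to -128, and 100 is truncated unchanged.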
48049 if (Val.isSignedIntN(DstBitsPerElt))
48050 Val = Val.trunc(DstBitsPerElt);
48051 else if (Val.isNegative())
48052 Val = APInt::getSignedMinValue(DstBitsPerElt);
48053 else
48054 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48055 } else {
48056 // PACKUS: Truncate signed value with unsigned saturation.
48057 // Source values less than zero are saturated to zero.
48058 // Source values greater than dst maxuint are saturated to maxuint.
48059 if (Val.isIntN(DstBitsPerElt))
48060 Val = Val.trunc(DstBitsPerElt);
48061 else if (Val.isNegative())
48062 Val = APInt::getZero(DstBitsPerElt);
48063 else
48064 Val = APInt::getAllOnes(DstBitsPerElt);
48065 }
48066 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48067 }
48068 }
48069
48070 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48071 }
48072
48073 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48074 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48075 return V;
48076
48077 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48078 // truncate to create a larger truncate.
48079 if (Subtarget.hasAVX512() &&
48080 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48081 N0.getOperand(0).getValueType() == MVT::v8i32) {
48082 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48083 (!IsSigned &&
48084 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48085 if (Subtarget.hasVLX())
48086 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48087
48088 // Widen input to v16i32 so we can truncate that.
48089 SDLoc dl(N);
48090 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48091 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48092 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48093 }
48094 }
48095
48096 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48097 if (VT.is128BitVector()) {
48098 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48099 SDValue Src0, Src1;
48100 if (N0.getOpcode() == ExtOpc &&
48101 N0.getOperand(0).getValueType().is64BitVector() &&
48102 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48103 Src0 = N0.getOperand(0);
48104 }
48105 if (N1.getOpcode() == ExtOpc &&
48106 N1.getOperand(0).getValueType().is64BitVector() &&
48107 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48108 Src1 = N1.getOperand(0);
48109 }
48110 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48111      assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48112 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48113 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48114 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48115 }
48116
48117 // Try again with pack(*_extend_vector_inreg, undef).
48118 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48119 : ISD::ZERO_EXTEND_VECTOR_INREG;
48120 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48121 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48122 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48123 DAG);
48124 }
48125
48126 // Attempt to combine as shuffle.
48127 SDValue Op(N, 0);
48128 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48129 return Res;
48130
48131 return SDValue();
48132}
48133
48134static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48135 TargetLowering::DAGCombinerInfo &DCI,
48136 const X86Subtarget &Subtarget) {
48137  assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48138          X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48139         "Unexpected horizontal add/sub opcode");
48140
48141 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48142 MVT VT = N->getSimpleValueType(0);
48143 SDValue LHS = N->getOperand(0);
48144 SDValue RHS = N->getOperand(1);
48145
48146    // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48147 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48148 LHS.getOpcode() == RHS.getOpcode() &&
48149 LHS.getValueType() == RHS.getValueType() &&
48150 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48151 SDValue LHS0 = LHS.getOperand(0);
48152 SDValue LHS1 = LHS.getOperand(1);
48153 SDValue RHS0 = RHS.getOperand(0);
48154 SDValue RHS1 = RHS.getOperand(1);
48155 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48156 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48157 SDLoc DL(N);
48158 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48159 LHS0.isUndef() ? LHS1 : LHS0,
48160 RHS0.isUndef() ? RHS1 : RHS0);
48161 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48162 Res = DAG.getBitcast(ShufVT, Res);
48163 SDValue NewLHS =
48164 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48165 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48166 SDValue NewRHS =
48167 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48168 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48169 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48170 DAG.getBitcast(VT, NewRHS));
48171 }
48172 }
48173 }
48174
48175 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48176 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48177 return V;
48178
48179 return SDValue();
48180}
48181
48182static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48183 TargetLowering::DAGCombinerInfo &DCI,
48184 const X86Subtarget &Subtarget) {
48185  assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48186          X86ISD::VSRL == N->getOpcode()) &&
48187         "Unexpected shift opcode");
48188 EVT VT = N->getValueType(0);
48189 SDValue N0 = N->getOperand(0);
48190 SDValue N1 = N->getOperand(1);
48191
48192 // Shift zero -> zero.
48193 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48194 return DAG.getConstant(0, SDLoc(N), VT);
48195
48196 // Detect constant shift amounts.
48197 APInt UndefElts;
48198 SmallVector<APInt, 32> EltBits;
48199 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
48200 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48201 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48202 EltBits[0].getZExtValue(), DAG);
48203 }
48204
48205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48206 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48207 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48208 return SDValue(N, 0);
48209
48210 return SDValue();
48211}
48212
48213static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48214 TargetLowering::DAGCombinerInfo &DCI,
48215 const X86Subtarget &Subtarget) {
48216 unsigned Opcode = N->getOpcode();
48217  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48218          X86ISD::VSRLI == Opcode) &&
48219         "Unexpected shift opcode");
48220 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48221 EVT VT = N->getValueType(0);
48222 SDValue N0 = N->getOperand(0);
48223 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48224  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48225         "Unexpected value type");
48226  assert(N->getOperand(1).getValueType() == MVT::i8 &&
48227         "Unexpected shift amount type");
48228
48229 // (shift undef, X) -> 0
48230 if (N0.isUndef())
48231 return DAG.getConstant(0, SDLoc(N), VT);
48232
48233 // Out of range logical bit shifts are guaranteed to be zero.
48234 // Out of range arithmetic bit shifts splat the sign bit.
48235 unsigned ShiftVal = N->getConstantOperandVal(1);
48236 if (ShiftVal >= NumBitsPerElt) {
48237 if (LogicalShift)
48238 return DAG.getConstant(0, SDLoc(N), VT);
48239 ShiftVal = NumBitsPerElt - 1;
48240 }
48241
48242 // (shift X, 0) -> X
48243 if (!ShiftVal)
48244 return N0;
48245
48246 // (shift 0, C) -> 0
48247 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48248 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48249 // result are all zeros, not undef.
48250 return DAG.getConstant(0, SDLoc(N), VT);
48251
48252 // (VSRAI -1, C) -> -1
48253 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48254 // N0 is all ones or undef. We guarantee that the bits shifted into the
48255 // result are all ones, not undef.
48256 return DAG.getConstant(-1, SDLoc(N), VT);
48257
48258 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48259 unsigned NewShiftVal = Amt0 + Amt1;
48260 if (NewShiftVal >= NumBitsPerElt) {
48261 // Out of range logical bit shifts are guaranteed to be zero.
48262 // Out of range arithmetic bit shifts splat the sign bit.
48263 if (LogicalShift)
48264 return DAG.getConstant(0, SDLoc(N), VT);
48265 NewShiftVal = NumBitsPerElt - 1;
48266 }
48267 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48268 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48269 };
48270
48271 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48272 if (Opcode == N0.getOpcode())
48273 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48274
48275 // (shl (add X, X), C) -> (shl X, (C + 1))
48276 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48277 N0.getOperand(0) == N0.getOperand(1))
48278 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48279
48280 // We can decode 'whole byte' logical bit shifts as shuffles.
48281 if (LogicalShift && (ShiftVal % 8) == 0) {
48282 SDValue Op(N, 0);
48283 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48284 return Res;
48285 }
48286
48287 // Constant Folding.
48288 APInt UndefElts;
48289 SmallVector<APInt, 32> EltBits;
48290 if (N->isOnlyUserOf(N0.getNode()) &&
48291 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
48292    assert(EltBits.size() == VT.getVectorNumElements() &&
48293           "Unexpected shift value type");
48294 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48295 // created an undef input due to no input bits being demanded, but user
48296 // still expects 0 in other bits.
48297 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48298 APInt &Elt = EltBits[i];
48299 if (UndefElts[i])
48300 Elt = 0;
48301 else if (X86ISD::VSHLI == Opcode)
48302 Elt <<= ShiftVal;
48303 else if (X86ISD::VSRAI == Opcode)
48304 Elt.ashrInPlace(ShiftVal);
48305 else
48306 Elt.lshrInPlace(ShiftVal);
48307 }
48308 // Reset undef elements since they were zeroed above.
48309 UndefElts = 0;
48310 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48311 }
48312
48313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48314 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48315 DCI))
48316 return SDValue(N, 0);
48317
48318 return SDValue();
48319}
48320
48321static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48322 TargetLowering::DAGCombinerInfo &DCI,
48323 const X86Subtarget &Subtarget) {
48324 EVT VT = N->getValueType(0);
48325 unsigned Opcode = N->getOpcode();
48326  assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48327          (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48328          Opcode == ISD::INSERT_VECTOR_ELT) &&
48329         "Unexpected vector insertion");
48330
48331 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48332 if (Opcode == ISD::INSERT_VECTOR_ELT && N->getOperand(0).isUndef() &&
48333 isNullConstant(N->getOperand(2)))
48334 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, N->getOperand(1));
48335
48336 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48337 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48339 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48340 APInt::getAllOnes(NumBitsPerElt), DCI))
48341 return SDValue(N, 0);
48342 }
48343
48344 // Attempt to combine insertion patterns to a shuffle.
48345 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48346 SDValue Op(N, 0);
48347 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48348 return Res;
48349 }
48350
48351 return SDValue();
48352}
48353
48354/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48355/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48356/// OR -> CMPNEQSS.
48357static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48358 TargetLowering::DAGCombinerInfo &DCI,
48359 const X86Subtarget &Subtarget) {
48360 unsigned opcode;
48361
48362 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48363 // we're requiring SSE2 for both.
48364 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48365 SDValue N0 = N->getOperand(0);
48366 SDValue N1 = N->getOperand(1);
48367 SDValue CMP0 = N0.getOperand(1);
48368 SDValue CMP1 = N1.getOperand(1);
48369 SDLoc DL(N);
48370
48371 // The SETCCs should both refer to the same CMP.
48372 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48373 return SDValue();
48374
48375 SDValue CMP00 = CMP0->getOperand(0);
48376 SDValue CMP01 = CMP0->getOperand(1);
48377 EVT VT = CMP00.getValueType();
48378
48379 if (VT == MVT::f32 || VT == MVT::f64 ||
48380 (VT == MVT::f16 && Subtarget.hasFP16())) {
48381 bool ExpectingFlags = false;
48382 // Check for any users that want flags:
48383 for (const SDNode *U : N->uses()) {
48384 if (ExpectingFlags)
48385 break;
48386
48387 switch (U->getOpcode()) {
48388 default:
48389 case ISD::BR_CC:
48390 case ISD::BRCOND:
48391 case ISD::SELECT:
48392 ExpectingFlags = true;
48393 break;
48394 case ISD::CopyToReg:
48395 case ISD::SIGN_EXTEND:
48396 case ISD::ZERO_EXTEND:
48397 case ISD::ANY_EXTEND:
48398 break;
48399 }
48400 }
48401
48402 if (!ExpectingFlags) {
48403 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48404 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48405
48406 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48407 X86::CondCode tmp = cc0;
48408 cc0 = cc1;
48409 cc1 = tmp;
48410 }
48411
48412 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48413 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48414 // FIXME: need symbolic constants for these magic numbers.
48415 // See X86ATTInstPrinter.cpp:printSSECC().
48416 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48417 if (Subtarget.hasAVX512()) {
48418 SDValue FSetCC =
48419 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48420 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48421 // Need to fill with zeros to ensure the bitcast will produce zeroes
48422 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48423 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48424 DAG.getConstant(0, DL, MVT::v16i1),
48425 FSetCC, DAG.getIntPtrConstant(0, DL));
48426 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48427 N->getSimpleValueType(0));
48428 }
48429 SDValue OnesOrZeroesF =
48430 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48431 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48432
48433 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48434 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48435
48436 if (is64BitFP && !Subtarget.is64Bit()) {
48437 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48438 // 64-bit integer, since that's not a legal type. Since
48439 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48440 // bits, but can do this little dance to extract the lowest 32 bits
48441 // and work with those going forward.
48442 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48443 OnesOrZeroesF);
48444 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48445 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48446 Vector32, DAG.getIntPtrConstant(0, DL));
48447 IntVT = MVT::i32;
48448 }
48449
48450 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48451 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48452 DAG.getConstant(1, DL, IntVT));
48453 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48454 ANDed);
48455 return OneBitOfTruth;
48456 }
48457 }
48458 }
48459 }
48460 return SDValue();
48461}
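// Illustrative sketch (editorial annotation, not part of this file; assumes
// SSE2 and <immintrin.h>): the combine above replaces flag-based scalar FP
// equality with a CMPEQSS-style mask, roughly equivalent to:
//   __m128i M = _mm_castps_si128(_mm_cmpeq_ss(A, B)); // lane 0: all-ones if A[0] == B[0], else 0
//   int Eq   = _mm_cvtsi128_si32(M) & 1;              // one bit of truth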
48462
48463/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48464static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48465 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48466
48467 MVT VT = N->getSimpleValueType(0);
48468 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48469 return SDValue();
48470
48471 SDValue X, Y;
48472 SDValue N0 = N->getOperand(0);
48473 SDValue N1 = N->getOperand(1);
48474
48475 if (SDValue Not = IsNOT(N0, DAG)) {
48476 X = Not;
48477 Y = N1;
48478 } else if (SDValue Not = IsNOT(N1, DAG)) {
48479 X = Not;
48480 Y = N0;
48481 } else
48482 return SDValue();
48483
48484 X = DAG.getBitcast(VT, X);
48485 Y = DAG.getBitcast(VT, Y);
48486 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48487}
48488
48489/// Try to fold:
48490/// and (vector_shuffle<Z,...,Z>
48491/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48492/// ->
48493/// andnp (vector_shuffle<Z,...,Z>
48494/// (insert_vector_elt undef, X, Z), undef), Y
48495static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48496 const X86Subtarget &Subtarget) {
48497 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48498
48499 EVT VT = N->getValueType(0);
48500 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
48501 // value and require extra moves.
48502 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48503 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48504 return SDValue();
48505
48506 auto GetNot = [&DAG](SDValue V) {
48507 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48508 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48509 // end-users are ISD::AND including cases
48510 // (and(extract_vector_element(SVN), Y)).
48511 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48512 !SVN->getOperand(1).isUndef()) {
48513 return SDValue();
48514 }
48515 SDValue IVEN = SVN->getOperand(0);
48516 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48517 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48518 return SDValue();
48519 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48520 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48521 return SDValue();
48522 SDValue Src = IVEN.getOperand(1);
48523 if (SDValue Not = IsNOT(Src, DAG)) {
48524 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48525 SDValue NotIVEN =
48526 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48527 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48528 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48529 SVN->getOperand(1), SVN->getMask());
48530 }
48531 return SDValue();
48532 };
48533
48534 SDValue X, Y;
48535 SDValue N0 = N->getOperand(0);
48536 SDValue N1 = N->getOperand(1);
48537
48538 if (SDValue Not = GetNot(N0)) {
48539 X = Not;
48540 Y = N1;
48541 } else if (SDValue Not = GetNot(N1)) {
48542 X = Not;
48543 Y = N0;
48544 } else
48545 return SDValue();
48546
48547 X = DAG.getBitcast(VT, X);
48548 Y = DAG.getBitcast(VT, Y);
48549 SDLoc DL(N);
48550 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48551 // AVX2.
48552 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
48553 SDValue LoX, HiX;
48554 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48555 SDValue LoY, HiY;
48556 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48557 EVT SplitVT = LoX.getValueType();
48558 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48559 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48560 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48561 }
48562 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48563}
48564
48565// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48566// logical operations, like in the example below.
48567// or (and (truncate x, truncate y)),
48568// (xor (truncate z, build_vector (constants)))
48569// Given a target type \p VT, we generate
48570// or (and x, y), (xor z, zext(build_vector (constants)))
48571// given x, y and z are of type \p VT. We can do so, if operands are either
48572// truncates from VT types, the second operand is a vector of constants or can
48573// be recursively promoted.
48574static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
48575 unsigned Depth) {
48576 // Limit recursion to avoid excessive compile times.
48577 if (Depth >= SelectionDAG::MaxRecursionDepth)
48578 return SDValue();
48579
48580 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
48581 N->getOpcode() != ISD::OR)
48582 return SDValue();
48583
48584 SDValue N0 = N->getOperand(0);
48585 SDValue N1 = N->getOperand(1);
48586 SDLoc DL(N);
48587
48588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48589 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
48590 return SDValue();
48591
48592 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
48593 N0 = NN0;
48594 else {
48595 // The Left side has to be a trunc.
48596 if (N0.getOpcode() != ISD::TRUNCATE)
48597 return SDValue();
48598
48599 // The type of the truncated inputs.
48600 if (N0.getOperand(0).getValueType() != VT)
48601 return SDValue();
48602
48603 N0 = N0.getOperand(0);
48604 }
48605
48606 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
48607 N1 = NN1;
48608 else {
48609 // The right side has to be a 'trunc' or a constant vector.
48610 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48611 N1.getOperand(0).getValueType() == VT;
48612 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
48613 return SDValue();
48614
48615 if (RHSTrunc)
48616 N1 = N1.getOperand(0);
48617 else
48618 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
48619 }
48620
48621 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
48622}
48623
48624// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48625// register. In most cases we actually compare or select YMM-sized registers
48626// and mixing the two types creates horrible code. This method optimizes
48627// some of the transition sequences.
48628// Even with AVX-512 this is still useful for removing casts around logical
48629// operations on vXi1 mask types.
48630static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
48631 const X86Subtarget &Subtarget) {
48632 EVT VT = N->getValueType(0);
48633 assert(VT.isVector() && "Expected vector type");
48634
48635 SDLoc DL(N);
48636 assert((N->getOpcode() == ISD::ANY_EXTEND ||
48637         N->getOpcode() == ISD::ZERO_EXTEND ||
48638         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48639
48640 SDValue Narrow = N->getOperand(0);
48641 EVT NarrowVT = Narrow.getValueType();
48642
48643 // Generate the wide operation.
48644 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
48645 if (!Op)
48646 return SDValue();
48647 switch (N->getOpcode()) {
48648 default: llvm_unreachable("Unexpected opcode");
48649 case ISD::ANY_EXTEND:
48650 return Op;
48651 case ISD::ZERO_EXTEND:
48652 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48653 case ISD::SIGN_EXTEND:
48654 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48655 Op, DAG.getValueType(NarrowVT));
48656 }
48657}
48658
48659static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48660 unsigned FPOpcode;
48661 switch (Opcode) {
48662 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48663 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48664 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48665 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48666 }
48667 return FPOpcode;
48668}
48669
48670/// If both input operands of a logic op are being cast from floating-point
48671/// types or FP compares, try to convert this into a floating-point logic node
48672/// to avoid unnecessary moves from SSE to integer registers.
48673static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48674 TargetLowering::DAGCombinerInfo &DCI,
48675 const X86Subtarget &Subtarget) {
48676 EVT VT = N->getValueType(0);
48677 SDValue N0 = N->getOperand(0);
48678 SDValue N1 = N->getOperand(1);
48679 SDLoc DL(N);
48680
48681 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48682 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48683 return SDValue();
48684
48685 SDValue N00 = N0.getOperand(0);
48686 SDValue N10 = N1.getOperand(0);
48687 EVT N00Type = N00.getValueType();
48688 EVT N10Type = N10.getValueType();
48689
48690 // Ensure that both types are the same and are legal scalar fp types.
48691 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48692 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48693 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48694 return SDValue();
48695
48696 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48697 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48698 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48699 return DAG.getBitcast(VT, FPLogic);
48700 }
48701
48702 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48703 !N1.hasOneUse())
48704 return SDValue();
48705
48706 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48707 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48708
48709 // The vector ISA for FP predicates is incomplete before AVX, so converting
48710 // COMIS* to CMPS* may not be a win before AVX.
48711 if (!Subtarget.hasAVX() &&
48712 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48713 return SDValue();
48714
48715 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48716 // and vector logic:
48717 // logic (setcc N00, N01), (setcc N10, N11) -->
48718 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
48719 unsigned NumElts = 128 / N00Type.getSizeInBits();
48720 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48721 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48722 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48723 SDValue N01 = N0.getOperand(1);
48724 SDValue N11 = N1.getOperand(1);
48725 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48726 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48727 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48728 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48729 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48730 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48731 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48732 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48733}
48734
48735// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48736// to reduce XMM->GPR traffic.
48737static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48738 unsigned Opc = N->getOpcode();
48739 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48740        "Unexpected bit opcode");
48741
48742 SDValue N0 = N->getOperand(0);
48743 SDValue N1 = N->getOperand(1);
48744
48745 // Both operands must be single use MOVMSK.
48746 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48747 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48748 return SDValue();
48749
48750 SDValue Vec0 = N0.getOperand(0);
48751 SDValue Vec1 = N1.getOperand(0);
48752 EVT VecVT0 = Vec0.getValueType();
48753 EVT VecVT1 = Vec1.getValueType();
48754
48755 // Both MOVMSK operands must be from vectors of the same size and same element
48756 // size, but it's OK for an fp/int difference.
48757 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48758 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48759 return SDValue();
48760
48761 SDLoc DL(N);
48762 unsigned VecOpc =
48763 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48764 SDValue Result =
48765 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48766 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48767}
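// Illustrative note (editorial annotation, assuming SSE and <immintrin.h>):
// MOVMSK collects only sign bits, and sign bits distribute over bitwise ops,
// so for example
//   _mm_movemask_ps(A) & _mm_movemask_ps(B) == _mm_movemask_ps(_mm_and_ps(A, B))
// which is why a single MOVMSK after the vector bit op suffices.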
48768
48769// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48770// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48771// handles in InstCombine.
48772static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48773 unsigned Opc = N->getOpcode();
48774 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48775        "Unexpected bit opcode");
48776
48777 SDValue N0 = N->getOperand(0);
48778 SDValue N1 = N->getOperand(1);
48779 EVT VT = N->getValueType(0);
48780
48781 // Both operands must be single use.
48782 if (!N0.hasOneUse() || !N1.hasOneUse())
48783 return SDValue();
48784
48785 // Search for matching shifts.
48786 SDValue BC0 = peekThroughOneUseBitcasts(N0);
48787 SDValue BC1 = peekThroughOneUseBitcasts(N1);
48788
48789 unsigned BCOpc = BC0.getOpcode();
48790 EVT BCVT = BC0.getValueType();
48791 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
48792 return SDValue();
48793
48794 switch (BCOpc) {
48795 case X86ISD::VSHLI:
48796 case X86ISD::VSRLI:
48797 case X86ISD::VSRAI: {
48798 if (BC0.getOperand(1) != BC1.getOperand(1))
48799 return SDValue();
48800
48801 SDLoc DL(N);
48802 SDValue BitOp =
48803 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
48804 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
48805 return DAG.getBitcast(VT, Shift);
48806 }
48807 }
48808
48809 return SDValue();
48810}
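// Illustrative note (editorial annotation): with equal shift amounts the bit
// op distributes over each of the shift kinds handled above, e.g.
// (X >> S) ^ (Y >> S) == (X ^ Y) >> S, so only one shift remains after the fold.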
48811
48812/// If this is a zero/all-bits result that is bitwise-anded with a low bits
48813/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
48814/// with a shift-right to eliminate loading the vector constant mask value.
48815static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
48816 const X86Subtarget &Subtarget) {
48817 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
48818 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
48819 EVT VT = Op0.getValueType();
48820 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
48821 return SDValue();
48822
48823 // Try to convert an "is positive" signbit masking operation into arithmetic
48824 // shift and "andn". This saves a materialization of a -1 vector constant.
48825 // The "is negative" variant should be handled more generally because it only
48826 // requires "and" rather than "andn":
48827 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48828 //
48829 // This is limited to the original type to avoid producing even more bitcasts.
48830 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48831 // will be profitable.
48832 if (N->getValueType(0) == VT &&
48833 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
48834 SDValue X, Y;
48835 if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
48836 isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
48837 X = Op1.getOperand(0);
48838 Y = Op0;
48839 } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
48840 isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
48841 X = Op0.getOperand(0);
48842 Y = Op1;
48843 }
48844 if (X && Y) {
48845 SDLoc DL(N);
48846 SDValue Sra =
48847 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48848 VT.getScalarSizeInBits() - 1, DAG);
48849 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48850 }
48851 }
48852
48853 APInt SplatVal;
48854 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
48855 !SplatVal.isMask())
48856 return SDValue();
48857
48858 // Don't prevent creation of ANDN.
48859 if (isBitwiseNot(Op0))
48860 return SDValue();
48861
48862 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
48863 return SDValue();
48864
48865 unsigned EltBitWidth = VT.getScalarSizeInBits();
48866 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48867 return SDValue();
48868
48869 SDLoc DL(N);
48870 unsigned ShiftVal = SplatVal.countTrailingOnes();
48871 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48872 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48873 return DAG.getBitcast(N->getValueType(0), Shift);
48874}
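// Illustrative note (editorial annotation): both rewrites rely on every
// element being all-ones or all-zero. For such a 32-bit element X and a mask
// of the K low bits (0 < K < 32), X & ((1u << K) - 1u) equals
// (uint32_t)X >> (32 - K), so the constant mask load becomes a VSRLI; and
// (pcmpgt X, -1) is the bitwise NOT of (vsrai X, 31), which is what makes the
// ANDNP form above equivalent.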
48875
48876// Get the index node from the lowered DAG of a GEP IR instruction with one
48877// indexing dimension.
48878static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48879 if (Ld->isIndexed())
48880 return SDValue();
48881
48882 SDValue Base = Ld->getBasePtr();
48883
48884 if (Base.getOpcode() != ISD::ADD)
48885 return SDValue();
48886
48887 SDValue ShiftedIndex = Base.getOperand(0);
48888
48889 if (ShiftedIndex.getOpcode() != ISD::SHL)
48890 return SDValue();
48891
48892 return ShiftedIndex.getOperand(0);
48893
48894}
48895
48896static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48897 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
48898 switch (VT.getSizeInBits()) {
48899 default: return false;
48900 case 64: return Subtarget.is64Bit() ? true : false;
48901 case 32: return true;
48902 }
48903 }
48904 return false;
48905}
48906
48907// This function recognizes cases where the X86 BZHI instruction can replace an
48908// 'and-load' sequence.
48909// In case of loading integer value from an array of constants which is defined
48910// as follows:
48911//
48912// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48913//
48914// then applying a bitwise and on the result with another input.
48915// It's equivalent to performing bzhi (zero high bits) on the input, with the
48916// same index of the load.
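// Illustrative note (editorial annotation, assuming BMI2 and <immintrin.h>):
// with the array defined as above, "array[idx] & x" is the same as zeroing
// the bits of x at positions >= idx, e.g. for idx < 32:
//   unsigned Ref = x & ((1u << idx) - 1u); // the and-load form
//   unsigned Opt = _bzhi_u32(x, idx);      // a single BZHI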
48917static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48918 const X86Subtarget &Subtarget) {
48919 MVT VT = Node->getSimpleValueType(0);
48920 SDLoc dl(Node);
48921
48922 // Check if subtarget has BZHI instruction for the node's type
48923 if (!hasBZHI(Subtarget, VT))
48924 return SDValue();
48925
48926 // Try matching the pattern for both operands.
48927 for (unsigned i = 0; i < 2; i++) {
48928 SDValue N = Node->getOperand(i);
48929 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48930
48931 // continue if the operand is not a load instruction
48932 if (!Ld)
48933 return SDValue();
48934
48935 const Value *MemOp = Ld->getMemOperand()->getValue();
48936
48937 if (!MemOp)
48938 return SDValue();
48939
48940 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48941 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48942 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48943
48944 Constant *Init = GV->getInitializer();
48945 Type *Ty = Init->getType();
48946 if (!isa<ConstantDataArray>(Init) ||
48947 !Ty->getArrayElementType()->isIntegerTy() ||
48948 Ty->getArrayElementType()->getScalarSizeInBits() !=
48949 VT.getSizeInBits() ||
48950 Ty->getArrayNumElements() >
48951 Ty->getArrayElementType()->getScalarSizeInBits())
48952 continue;
48953
48954 // Check if the array's constant elements are suitable to our case.
48955 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48956 bool ConstantsMatch = true;
48957 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48958 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48959 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48960 ConstantsMatch = false;
48961 break;
48962 }
48963 }
48964 if (!ConstantsMatch)
48965 continue;
48966
48967 // Do the transformation (For 32-bit type):
48968 // -> (and (load arr[idx]), inp)
48969 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
48970 // that will be replaced with one bzhi instruction.
48971 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48972 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48973
48974 // Get the Node which indexes into the array.
48975 SDValue Index = getIndexFromUnindexedLoad(Ld);
48976 if (!Index)
48977 return SDValue();
48978 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48979
48980 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48981 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48982
48983 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48984 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48985
48986 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48987 }
48988 }
48989 }
48990 }
48991 return SDValue();
48992}
48993
48994// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
48995// Where C is a mask containing the same number of bits as the setcc and
48996// where the setcc will freely 0 upper bits of k-register. We can replace the
48997// undef in the concat with 0s and remove the AND. This mainly helps with
48998// v2i1/v4i1 setcc being casted to scalar.
48999static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49000 const X86Subtarget &Subtarget) {
49001 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49002
49003 EVT VT = N->getValueType(0);
49004
49005 // Make sure this is an AND with constant. We will check the value of the
49006 // constant later.
49007 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49008 if (!C1)
49009 return SDValue();
49010
49011 // This is implied by the ConstantSDNode.
49012 assert(!VT.isVector() && "Expected scalar VT!");
49013
49014 SDValue Src = N->getOperand(0);
49015 if (!Src.hasOneUse())
49016 return SDValue();
49017
49018 // (Optionally) peek through any_extend().
49019 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49020 if (!Src.getOperand(0).hasOneUse())
49021 return SDValue();
49022 Src = Src.getOperand(0);
49023 }
49024
49025 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49026 return SDValue();
49027
49028 Src = Src.getOperand(0);
49029 EVT SrcVT = Src.getValueType();
49030
49031 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49032 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49033 !TLI.isTypeLegal(SrcVT))
49034 return SDValue();
49035
49036 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49037 return SDValue();
49038
49039 // We only care about the first subvector of the concat, we expect the
49040 // other subvectors to be ignored due to the AND if we make the change.
49041 SDValue SubVec = Src.getOperand(0);
49042 EVT SubVecVT = SubVec.getValueType();
49043
49044 // The RHS of the AND should be a mask with as many bits as SubVec.
49045 if (!TLI.isTypeLegal(SubVecVT) ||
49046 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49047 return SDValue();
49048
49049 // First subvector should be a setcc with a legal result type or a
49050 // AND containing at least one setcc with a legal result type.
49051 auto IsLegalSetCC = [&](SDValue V) {
49052 if (V.getOpcode() != ISD::SETCC)
49053 return false;
49054 EVT SetccVT = V.getOperand(0).getValueType();
49055 if (!TLI.isTypeLegal(SetccVT) ||
49056 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49057 return false;
49058 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49059 return false;
49060 return true;
49061 };
49062 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49063 (IsLegalSetCC(SubVec.getOperand(0)) ||
49064 IsLegalSetCC(SubVec.getOperand(1))))))
49065 return SDValue();
49066
49067 // We passed all the checks. Rebuild the concat_vectors with zeroes
49068 // and cast it back to VT.
49069 SDLoc dl(N);
49070 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49071 DAG.getConstant(0, dl, SubVecVT));
49072 Ops[0] = SubVec;
49073 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49074 Ops);
49075 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49076 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49077}
49078
49079static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49080 TargetLowering::DAGCombinerInfo &DCI,
49081 const X86Subtarget &Subtarget) {
49082 SDValue N0 = N->getOperand(0);
49083 SDValue N1 = N->getOperand(1);
49084 EVT VT = N->getValueType(0);
49085 SDLoc dl(N);
49086 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49087
49088 // If this is SSE1 only convert to FAND to avoid scalarization.
49089 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49090 return DAG.getBitcast(MVT::v4i32,
49091 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49092 DAG.getBitcast(MVT::v4f32, N0),
49093 DAG.getBitcast(MVT::v4f32, N1)));
49094 }
49095
49096 // Use a 32-bit and+zext if upper bits known zero.
49097 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49098 APInt HiMask = APInt::getHighBitsSet(64, 32);
49099 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49100 DAG.MaskedValueIsZero(N0, HiMask)) {
49101 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49102 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49103 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49104 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49105 }
49106 }
49107
49108 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49109 // TODO: Support multiple SrcOps.
49110 if (VT == MVT::i1) {
49111 SmallVector<SDValue, 2> SrcOps;
49112 SmallVector<APInt, 2> SrcPartials;
49113 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49114 SrcOps.size() == 1) {
49115 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49116 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49117 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49118 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49119 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49120 if (Mask) {
49121 assert(SrcPartials[0].getBitWidth() == NumElts &&
49122        "Unexpected partial reduction mask");
49123 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49124 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49125 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49126 }
49127 }
49128 }
49129
49130 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49131 return V;
49132
49133 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49134 return R;
49135
49136 if (SDValue R = combineBitOpWithShift(N, DAG))
49137 return R;
49138
49139 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49140 return FPLogic;
49141
49142 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49143 return R;
49144
49145 if (DCI.isBeforeLegalizeOps())
49146 return SDValue();
49147
49148 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49149 return R;
49150
49151 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49152 return R;
49153
49154 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49155 return ShiftRight;
49156
49157 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49158 return R;
49159
49160 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49161 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49162 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49163 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49164 unsigned Opc0 = N0.getOpcode();
49165 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49166 getTargetConstantFromNode(N0.getOperand(1)) &&
49167 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49168 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49169 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49170 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49171 }
49172 }
49173
49174 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49175 // avoids slow variable shift (moving shift amount to ECX etc.)
49176 if (isOneConstant(N1) && N0->hasOneUse()) {
49177 SDValue Src = N0;
49178 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49179 Src.getOpcode() == ISD::TRUNCATE) &&
49180 Src.getOperand(0)->hasOneUse())
49181 Src = Src.getOperand(0);
49182 bool ContainsNOT = false;
49183 X86::CondCode X86CC = X86::COND_B;
49184 // Peek through AND(NOT(SRL(X,Y)),1).
49185 if (isBitwiseNot(Src)) {
49186 Src = Src.getOperand(0);
49187 X86CC = X86::COND_AE;
49188 ContainsNOT = true;
49189 }
49190 if (Src.getOpcode() == ISD::SRL &&
49191 !isa<ConstantSDNode>(Src.getOperand(1))) {
49192 SDValue BitNo = Src.getOperand(1);
49193 Src = Src.getOperand(0);
49194 // Peek through AND(SRL(NOT(X),Y),1).
49195 if (isBitwiseNot(Src)) {
49196 Src = Src.getOperand(0);
49197 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49198 ContainsNOT = true;
49199 }
49200 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49201 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49202 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49203 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49204 }
49205 }
49206
49207 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49208 // Attempt to recursively combine a bitmask AND with shuffles.
49209 SDValue Op(N, 0);
49210 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49211 return Res;
49212
49213 // If either operand is a constant mask, then only the elements that aren't
49214 // zero are actually demanded by the other operand.
49215 auto GetDemandedMasks = [&](SDValue Op) {
49216 APInt UndefElts;
49217 SmallVector<APInt> EltBits;
49218 int NumElts = VT.getVectorNumElements();
49219 int EltSizeInBits = VT.getScalarSizeInBits();
49220 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49221 APInt DemandedElts = APInt::getAllOnes(NumElts);
49222 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49223 EltBits)) {
49224 DemandedBits.clearAllBits();
49225 DemandedElts.clearAllBits();
49226 for (int I = 0; I != NumElts; ++I) {
49227 if (UndefElts[I]) {
49228 // We can't assume an undef src element gives an undef dst - the
49229 // other src might be zero.
49230 DemandedBits.setAllBits();
49231 DemandedElts.setBit(I);
49232 } else if (!EltBits[I].isZero()) {
49233 DemandedBits |= EltBits[I];
49234 DemandedElts.setBit(I);
49235 }
49236 }
49237 }
49238 return std::make_pair(DemandedBits, DemandedElts);
49239 };
49240 APInt Bits0, Elts0;
49241 APInt Bits1, Elts1;
49242 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49243 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49244
49245 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49246 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49247 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49248 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49249 if (N->getOpcode() != ISD::DELETED_NODE)
49250 DCI.AddToWorklist(N);
49251 return SDValue(N, 0);
49252 }
49253
49254 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49255 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49256 if (NewN0 || NewN1)
49257 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49258 NewN1 ? NewN1 : N1);
49259 }
49260
49261 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49262 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49263 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49264 isa<ConstantSDNode>(N0.getOperand(1))) {
49265 SDValue BitMask = N1;
49266 SDValue SrcVec = N0.getOperand(0);
49267 EVT SrcVecVT = SrcVec.getValueType();
49268
49269 // Check that the constant bitmask masks whole bytes.
49270 APInt UndefElts;
49271 SmallVector<APInt, 64> EltBits;
49272 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49273 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49274 llvm::all_of(EltBits, [](const APInt &M) {
49275 return M.isZero() || M.isAllOnes();
49276 })) {
49277 unsigned NumElts = SrcVecVT.getVectorNumElements();
49278 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49279 unsigned Idx = N0.getConstantOperandVal(1);
49280
49281 // Create a root shuffle mask from the byte mask and the extracted index.
49282 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49283 for (unsigned i = 0; i != Scale; ++i) {
49284 if (UndefElts[i])
49285 continue;
49286 int VecIdx = Scale * Idx + i;
49287 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49288 }
49289
49290 if (SDValue Shuffle = combineX86ShufflesRecursively(
49291 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49292 X86::MaxShuffleCombineDepth,
49293 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49294 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49295 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49296 N0.getOperand(1));
49297 }
49298 }
49299
49300 return SDValue();
49301}
49302
49303// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
49304static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49305 const X86Subtarget &Subtarget) {
49306 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49307
49308 MVT VT = N->getSimpleValueType(0);
49309 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49310 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49311 return SDValue();
49312
49313 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49314 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49315 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49316 return SDValue();
49317
49318 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49319 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49320 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49321 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49322 return SDValue();
49323
49324 // Attempt to extract constant byte masks.
49325 APInt UndefElts0, UndefElts1;
49326 SmallVector<APInt, 32> EltBits0, EltBits1;
49327 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49328 false, false))
49329 return SDValue();
49330 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49331 false, false))
49332 return SDValue();
49333
49334 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49335 // TODO - add UNDEF elts support.
49336 if (UndefElts0[i] || UndefElts1[i])
49337 return SDValue();
49338 if (EltBits0[i] != ~EltBits1[i])
49339 return SDValue();
49340 }
49341
49342 SDLoc DL(N);
49343
49344 if (useVPTERNLOG(Subtarget, VT)) {
49345 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49346 // VPTERNLOG is only available as vXi32/64-bit types.
49347 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
49348 MVT OpVT =
49349 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49350 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49351 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49352 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49353 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49354 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49355 DAG, Subtarget);
49356 return DAG.getBitcast(VT, Res);
49357 }
49358
49359 SDValue X = N->getOperand(0);
49360 SDValue Y =
49361 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49362 DAG.getBitcast(VT, N1.getOperand(0)));
49363 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49364}
49365
49366// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49367static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49368 if (N->getOpcode() != ISD::OR)
49369 return false;
49370
49371 SDValue N0 = N->getOperand(0);
49372 SDValue N1 = N->getOperand(1);
49373
49374 // Canonicalize AND to LHS.
49375 if (N1.getOpcode() == ISD::AND)
49376 std::swap(N0, N1);
49377
49378 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49379 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49380 return false;
49381
49382 Mask = N1.getOperand(0);
49383 X = N1.getOperand(1);
49384
49385 // Check to see if the mask appeared in both the AND and ANDNP.
49386 if (N0.getOperand(0) == Mask)
49387 Y = N0.getOperand(1);
49388 else if (N0.getOperand(1) == Mask)
49389 Y = N0.getOperand(0);
49390 else
49391 return false;
49392
49393 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
49394 // ANDNP combine allows other combines to happen that prevent matching.
49395 return true;
49396}
49397
49398// Try to fold:
49399// (or (and (m, y), (pandn m, x)))
49400// into:
49401// (vselect m, x, y)
49402// As a special case, try to fold:
49403// (or (and (m, (sub 0, x)), (pandn m, x)))
49404// into:
49405// (sub (xor X, M), M)
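// Illustrative note (editorial annotation): the special case relies on M
// being 0 or -1 per element, so (xor X, M) - M computes M ? -X : X, since
// (X ^ -1) - (-1) == ~X + 1 == -X and (X ^ 0) - 0 == X.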
49406static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49407 const X86Subtarget &Subtarget) {
49408 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49409
49410 EVT VT = N->getValueType(0);
49411 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49412 (VT.is256BitVector() && Subtarget.hasInt256())))
49413 return SDValue();
49414
49415 SDValue X, Y, Mask;
49416 if (!matchLogicBlend(N, X, Y, Mask))
49417 return SDValue();
49418
49419 // Validate that X, Y, and Mask are bitcasts, and see through them.
49420 Mask = peekThroughBitcasts(Mask);
49421 X = peekThroughBitcasts(X);
49422 Y = peekThroughBitcasts(Y);
49423
49424 EVT MaskVT = Mask.getValueType();
49425 unsigned EltBits = MaskVT.getScalarSizeInBits();
49426
49427 // TODO: Attempt to handle floating point cases as well?
49428 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49429 return SDValue();
49430
49431 SDLoc DL(N);
49432
49433 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49434 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49435 DAG, Subtarget))
49436 return Res;
49437
49438 // PBLENDVB is only available on SSE 4.1.
49439 if (!Subtarget.hasSSE41())
49440 return SDValue();
49441
49442 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49443 if (Subtarget.hasVLX())
49444 return SDValue();
49445
49446 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49447
49448 X = DAG.getBitcast(BlendVT, X);
49449 Y = DAG.getBitcast(BlendVT, Y);
49450 Mask = DAG.getBitcast(BlendVT, Mask);
49451 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49452 return DAG.getBitcast(VT, Mask);
49453}
49454
49455// Helper function for combineOrCmpEqZeroToCtlzSrl
49456// Transforms:
49457// seteq(cmp x, 0)
49458// into:
49459// srl(ctlz x), log2(bitsize(x))
49460// Input pattern is checked by caller.
49461static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49462 SDValue Cmp = Op.getOperand(1);
49463 EVT VT = Cmp.getOperand(0).getValueType();
49464 unsigned Log2b = Log2_32(VT.getSizeInBits());
49465 SDLoc dl(Op);
49466 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49467 // The result of the shift is true or false, and on X86, the 32-bit
49468 // encoding of shr and lzcnt is more desirable.
49469 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49470 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49471 DAG.getConstant(Log2b, dl, MVT::i8));
49472 return Scc;
49473}
49474
49475// Try to transform:
49476// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49477// into:
49478// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
49479// Will also attempt to match more generic cases, eg:
49480// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49481// Only applies if the target supports the FastLZCNT feature.
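// Illustrative note (editorial annotation, assuming LZCNT and <immintrin.h>):
// for a 32-bit value x, (x == 0) is equivalent to _lzcnt_u32(x) >> 5, since
// _lzcnt_u32(0) == 32 (bit 5 set) while any non-zero x has lzcnt <= 31.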
49482static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49483 TargetLowering::DAGCombinerInfo &DCI,
49484 const X86Subtarget &Subtarget) {
49485 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49486 return SDValue();
49487
49488 auto isORCandidate = [](SDValue N) {
49489 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49490 };
49491
49492 // Check the zero extend is extending to 32-bit or more. The code generated by
49493 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
49494 // instructions to clear the upper bits.
49495 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49496 !isORCandidate(N->getOperand(0)))
49497 return SDValue();
49498
49499 // Check the node matches: setcc(eq, cmp 0)
49500 auto isSetCCCandidate = [](SDValue N) {
49501 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49502 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49503 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49504 isNullConstant(N->getOperand(1).getOperand(1)) &&
49505 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49506 };
49507
49508 SDNode *OR = N->getOperand(0).getNode();
49509 SDValue LHS = OR->getOperand(0);
49510 SDValue RHS = OR->getOperand(1);
49511
49512 // Save nodes matching or(or, setcc(eq, cmp 0)).
49513 SmallVector<SDNode *, 2> ORNodes;
49514 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49515 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49516 ORNodes.push_back(OR);
49517 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49518 LHS = OR->getOperand(0);
49519 RHS = OR->getOperand(1);
49520 }
49521
49522 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49523 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49524 !isORCandidate(SDValue(OR, 0)))
49525 return SDValue();
49526
49527 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49528 // to
49529 // or(srl(ctlz),srl(ctlz)).
49530 // The dag combiner can then fold it into:
49531 // srl(or(ctlz, ctlz)).
49532 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49533 SDValue Ret, NewRHS;
49534 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49535 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49536
49537 if (!Ret)
49538 return SDValue();
49539
49540 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49541 while (ORNodes.size() > 0) {
49542 OR = ORNodes.pop_back_val();
49543 LHS = OR->getOperand(0);
49544 RHS = OR->getOperand(1);
49545 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49546 if (RHS->getOpcode() == ISD::OR)
49547 std::swap(LHS, RHS);
49548 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49549 if (!NewRHS)
49550 return SDValue();
49551 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49552 }
49553
49554 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49555}
49556
49557static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49558 SDValue And1_L, SDValue And1_R,
49559 const SDLoc &DL, SelectionDAG &DAG) {
49560 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49561 return SDValue();
49562 SDValue NotOp = And0_L->getOperand(0);
49563 if (NotOp == And1_R)
49564 std::swap(And1_R, And1_L);
49565 if (NotOp != And1_L)
49566 return SDValue();
49567
49568 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49569 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
49570 EVT VT = And1_L->getValueType(0);
49571 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49572 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49573 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49574 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49575 return Xor1;
49576}
49577
49578/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49579/// equivalent `((x ^ y) & m) ^ y` pattern.
49580/// This is typically a better representation for targets without a fused
49581/// "and-not" operation. This function is intended to be called from a
49582/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
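// Illustrative note (editorial annotation): the two forms agree bit by bit.
// Where a mask bit m is 1, ((x ^ y) & m) ^ y gives (x ^ y) ^ y == x; where m
// is 0, it gives 0 ^ y == y, matching (m & x) | (~m & y).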
49583static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49584 // Note that masked-merge variants using XOR or ADD expressions are
49585 // normalized to OR by InstCombine so we only check for OR.
49586 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49587 SDValue N0 = Node->getOperand(0);
49588 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49589 return SDValue();
49590 SDValue N1 = Node->getOperand(1);
49591 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49592 return SDValue();
49593
49594 SDLoc DL(Node);
49595 SDValue N00 = N0->getOperand(0);
49596 SDValue N01 = N0->getOperand(1);
49597 SDValue N10 = N1->getOperand(0);
49598 SDValue N11 = N1->getOperand(1);
49599 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49600 return Result;
49601 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49602 return Result;
49603 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49604 return Result;
49605 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49606 return Result;
49607 return SDValue();
49608}
49609
49610static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49611 TargetLowering::DAGCombinerInfo &DCI,
49612 const X86Subtarget &Subtarget) {
49613 SDValue N0 = N->getOperand(0);
49614 SDValue N1 = N->getOperand(1);
49615 EVT VT = N->getValueType(0);
49616 SDLoc dl(N);
49617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49618
49619 // If this is SSE1 only convert to FOR to avoid scalarization.
49620 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49621 return DAG.getBitcast(MVT::v4i32,
49622 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49623 DAG.getBitcast(MVT::v4f32, N0),
49624 DAG.getBitcast(MVT::v4f32, N1)));
49625 }
49626
49627 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49628 // TODO: Support multiple SrcOps.
49629 if (VT == MVT::i1) {
49630 SmallVector<SDValue, 2> SrcOps;
49631 SmallVector<APInt, 2> SrcPartials;
49632 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49633 SrcOps.size() == 1) {
49634 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49635 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49636 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49637 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49638 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49639 if (Mask) {
49640 assert(SrcPartials[0].getBitWidth() == NumElts &&
49641        "Unexpected partial reduction mask");
49642 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49643 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49644 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49645 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49646 }
49647 }
49648 }
49649
49650 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49651 return R;
49652
49653 if (SDValue R = combineBitOpWithShift(N, DAG))
49654 return R;
49655
49656 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49657 return FPLogic;
49658
49659 if (DCI.isBeforeLegalizeOps())
49660 return SDValue();
49661
49662 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49663 return R;
49664
49665 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49666 return R;
49667
49668 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49669 return R;
49670
49671 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
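  // [Editorial illustration, not part of the original file] With SetCC == 1:
  // (0 - 1) | C == -1 and (zext !1) * (C + 1) - 1 == 0 - 1 == -1; with
  // SetCC == 0: (0 - 0) | C == C and (zext !0) * (C + 1) - 1 == C. The
  // accepted constants below (1,2,3,4,7,8) give multipliers 2,3,4,5,8,9,
  // which a single LEA can materialize.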
49672 if ((VT == MVT::i32 || VT == MVT::i64) &&
49673 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49674 isNullConstant(N0.getOperand(0))) {
49675 SDValue Cond = N0.getOperand(1);
49676 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49677 Cond = Cond.getOperand(0);
49678
49679 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49680 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49681 uint64_t Val = CN->getZExtValue();
49682 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49683 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49684 CCode = X86::GetOppositeBranchCondition(CCode);
49685 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49686
49687 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49688 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49689 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49690 return R;
49691 }
49692 }
49693 }
49694 }
49695
49696 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49697 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49698 // iff the upper elements of the non-shifted arg are zero.
49699 // KUNPCK require 16+ bool vector elements.
49700 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49701 unsigned NumElts = VT.getVectorNumElements();
49702 unsigned HalfElts = NumElts / 2;
49703 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49704 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49705 N1.getConstantOperandAPInt(1) == HalfElts &&
49706 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49707 return DAG.getNode(
49708 ISD::CONCAT_VECTORS, dl, VT,
49709 extractSubVector(N0, 0, DAG, dl, HalfElts),
49710 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49711 }
49712 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49713 N0.getConstantOperandAPInt(1) == HalfElts &&
49714 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49715 return DAG.getNode(
49716 ISD::CONCAT_VECTORS, dl, VT,
49717 extractSubVector(N1, 0, DAG, dl, HalfElts),
49718 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49719 }
49720 }
49721
49722 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49723 // Attempt to recursively combine an OR of shuffles.
49724 SDValue Op(N, 0);
49725 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49726 return Res;
49727
49728 // If either operand is a constant mask, then only the elements that aren't
49729 // allones are actually demanded by the other operand.
49730 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49731 APInt UndefElts;
49732 SmallVector<APInt> EltBits;
49733 int NumElts = VT.getVectorNumElements();
49734 int EltSizeInBits = VT.getScalarSizeInBits();
49735 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49736 return false;
49737
49738 APInt DemandedElts = APInt::getZero(NumElts);
49739 for (int I = 0; I != NumElts; ++I)
49740 if (!EltBits[I].isAllOnes())
49741 DemandedElts.setBit(I);
49742
49743 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49744 };
49745 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49746 if (N->getOpcode() != ISD::DELETED_NODE)
49747 DCI.AddToWorklist(N);
49748 return SDValue(N, 0);
49749 }
49750 }
49751
49752 // We should fold "masked merge" patterns when `andn` is not available.
49753 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49754 if (SDValue R = foldMaskedMerge(N, DAG))
49755 return R;
49756
49757 return SDValue();
49758}
49759
49760/// Try to turn tests against the signbit in the form of:
49761/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
49762/// into:
49763/// SETGT(X, -1)
49764static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
49765 // This is only worth doing if the output type is i8 or i1.
49766 EVT ResultType = N->getValueType(0);
49767 if (ResultType != MVT::i8 && ResultType != MVT::i1)
49768 return SDValue();
49769
49770 SDValue N0 = N->getOperand(0);
49771 SDValue N1 = N->getOperand(1);
49772
49773 // We should be performing an xor against a truncated shift.
49774 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
49775 return SDValue();
49776
49777 // Make sure we are performing an xor against one.
49778 if (!isOneConstant(N1))
49779 return SDValue();
49780
49781 // SetCC on x86 zero extends so only act on this if it's a logical shift.
49782 SDValue Shift = N0.getOperand(0);
49783 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
49784 return SDValue();
49785
49786 // Make sure we are truncating from one of i16, i32 or i64.
49787 EVT ShiftTy = Shift.getValueType();
49788 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
49789 return SDValue();
49790
49791 // Make sure the shift amount extracts the sign bit.
49792 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
49793 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
49794 return SDValue();
49795
49796 // Create a greater-than comparison against -1.
49797  // N.B. Using SETGE against 0 works but we want a canonical-looking
49798  // comparison; using SETGT matches up with what TranslateX86CC expects.
49799 SDLoc DL(N);
49800 SDValue ShiftOp = Shift.getOperand(0);
49801 EVT ShiftOpTy = ShiftOp.getValueType();
49802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49803 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
49804 *DAG.getContext(), ResultType);
49805 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
49806 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
49807 if (SetCCResultType != ResultType)
49808 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
49809 return Cond;
49810}
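// [Editorial sketch, not part of the original file] The scalar identity behind
// the fold above, shown for a 32-bit source; names are hypothetical.
static bool signBitClearRef(int32_t X) {
  // xor(trunc(srl(X, 31)), 1): 1 exactly when the sign bit of X is clear.
  return ((static_cast<uint32_t>(X) >> 31) ^ 1u) != 0;
}
static bool signBitClearFolded(int32_t X) {
  // setgt(X, -1) is true exactly when the sign bit is clear.
  return X > -1;
}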
49811
49812/// Turn vector tests of the signbit in the form of:
49813/// xor (sra X, elt_size(X)-1), -1
49814/// into:
49815/// pcmpgt X, -1
49816///
49817/// This should be called before type legalization because the pattern may not
49818/// persist after that.
49819static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
49820 const X86Subtarget &Subtarget) {
49821 EVT VT = N->getValueType(0);
49822 if (!VT.isSimple())
49823 return SDValue();
49824
49825 switch (VT.getSimpleVT().SimpleTy) {
49826 default: return SDValue();
49827 case MVT::v16i8:
49828 case MVT::v8i16:
49829 case MVT::v4i32:
49830 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
49831 case MVT::v32i8:
49832 case MVT::v16i16:
49833 case MVT::v8i32:
49834 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
49835 }
49836
49837 // There must be a shift right algebraic before the xor, and the xor must be a
49838 // 'not' operation.
49839 SDValue Shift = N->getOperand(0);
49840 SDValue Ones = N->getOperand(1);
49841 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
49842 !ISD::isBuildVectorAllOnes(Ones.getNode()))
49843 return SDValue();
49844
49845 // The shift should be smearing the sign bit across each vector element.
49846 auto *ShiftAmt =
49847 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
49848 if (!ShiftAmt ||
49849 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
49850 return SDValue();
49851
49852 // Create a greater-than comparison against -1. We don't use the more obvious
49853 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
49854 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
49855}
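// [Editorial note, not part of the original file] Per-lane view of the fold
// above for 32-bit elements: xor(sra(x, 31), -1) smears the sign bit and then
// inverts it, yielding all-ones when x >= 0 and all-zeros when x < 0 -- the
// same lane values PCMPGT(x, -1) produces, so the xor+shift pair can be
// replaced by the single compare.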
49856
49857/// Detect patterns of truncation with unsigned saturation:
49858///
49859/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
49860/// Return the source value x to be truncated or SDValue() if the pattern was
49861/// not matched.
49862///
49863/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
49864/// where C1 >= 0 and C2 is unsigned max of destination type.
49865///
49866/// (truncate (smax (smin (x, C2), C1)) to dest_type)
49867/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
49868///
49869/// These two patterns are equivalent to:
49870/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
49871/// So return the smax(x, C1) value to be truncated or SDValue() if the
49872/// pattern was not matched.
49873static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
49874 const SDLoc &DL) {
49875 EVT InVT = In.getValueType();
49876
49877 // Saturation with truncation. We truncate from InVT to VT.
49878  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
49879         "Unexpected types for truncate operation");
49880
49881 // Match min/max and return limit value as a parameter.
49882 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
49883 if (V.getOpcode() == Opcode &&
49884 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
49885 return V.getOperand(0);
49886 return SDValue();
49887 };
49888
49889 APInt C1, C2;
49890 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
49891    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
49892    // the element size of the destination type.
49893 if (C2.isMask(VT.getScalarSizeInBits()))
49894 return UMin;
49895
49896 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
49897 if (MatchMinMax(SMin, ISD::SMAX, C1))
49898 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
49899 return SMin;
49900
49901 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
49902 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
49903 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
49904 C2.uge(C1)) {
49905 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
49906 }
49907
49908 return SDValue();
49909}
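// [Editorial sketch, not part of the original file] Scalar form of pattern 1
// above for an i32 -> i8 truncate; the helper name is hypothetical.
static uint8_t usatTruncRef(uint32_t X) {
  // (truncate (umin (x, UINT8_MAX))): clamp to the unsigned max, then narrow.
  return static_cast<uint8_t>(X < 255u ? X : 255u);
}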
49910
49911/// Detect patterns of truncation with signed saturation:
49912/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
49913/// signed_max_of_dest_type)) to dest_type)
49914/// or:
49915/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
49916/// signed_min_of_dest_type)) to dest_type).
49917/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
49918/// Return the source value to be truncated or SDValue() if the pattern was not
49919/// matched.
49920static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
49921 unsigned NumDstBits = VT.getScalarSizeInBits();
49922 unsigned NumSrcBits = In.getScalarValueSizeInBits();
49923  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
49924
49925 auto MatchMinMax = [](SDValue V, unsigned Opcode,
49926 const APInt &Limit) -> SDValue {
49927 APInt C;
49928 if (V.getOpcode() == Opcode &&
49929 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
49930 return V.getOperand(0);
49931 return SDValue();
49932 };
49933
49934 APInt SignedMax, SignedMin;
49935 if (MatchPackUS) {
49936 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
49937 SignedMin = APInt(NumSrcBits, 0);
49938 } else {
49939 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
49940 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
49941 }
49942
49943 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
49944 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
49945 return SMax;
49946
49947 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
49948 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
49949 return SMin;
49950
49951 return SDValue();
49952}
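// [Editorial sketch, not part of the original file] Scalar forms of the clamps
// matched above, for an i32 -> i8 truncate; the helper names are hypothetical.
static int8_t ssatTruncRef(int32_t X) {
  // smin(smax(x, INT8_MIN), INT8_MAX): signed saturation to [-128, 127].
  int32_t Clamped = X > -128 ? X : -128;
  Clamped = Clamped < 127 ? Clamped : 127;
  return static_cast<int8_t>(Clamped);
}
static uint8_t packusTruncRef(int32_t X) {
  // With MatchPackUS the clamp range is [0, 255] instead (PACKUS semantics).
  int32_t Clamped = X > 0 ? X : 0;
  Clamped = Clamped < 255 ? Clamped : 255;
  return static_cast<uint8_t>(Clamped);
}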
49953
49954static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
49955 SelectionDAG &DAG,
49956 const X86Subtarget &Subtarget) {
49957 if (!Subtarget.hasSSE2() || !VT.isVector())
49958 return SDValue();
49959
49960 EVT SVT = VT.getVectorElementType();
49961 EVT InVT = In.getValueType();
49962 EVT InSVT = InVT.getVectorElementType();
49963
49964 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
49965  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
49966 // and concatenate at the same time. Then we can use a final vpmovuswb to
49967 // clip to 0-255.
49968 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
49969 InVT == MVT::v16i32 && VT == MVT::v16i8) {
49970 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49971 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
49972 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
49973 DL, DAG, Subtarget);
49974      assert(Mid && "Failed to pack!");
49975 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
49976 }
49977 }
49978
49979 // vXi32 truncate instructions are available with AVX512F.
49980 // vXi16 truncate instructions are only available with AVX512BW.
49981 // For 256-bit or smaller vectors, we require VLX.
49982 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
49983  // If the result type is 256 bits or larger and we have disabled 512-bit
49984 // registers, we should go ahead and use the pack instructions if possible.
49985 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
49986 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
49987 (InVT.getSizeInBits() > 128) &&
49988 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
49989 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
49990
49991 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
49992 VT.getSizeInBits() >= 64 &&
49993 (SVT == MVT::i8 || SVT == MVT::i16) &&
49994 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
49995 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49996 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
49997      // Only do this when the result is at least 64 bits or we'd be leaving
49998 // dangling PACKSSDW nodes.
49999 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50000 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50001 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50002 DAG, Subtarget);
50003        assert(Mid && "Failed to pack!");
50004 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50005 Subtarget);
50006        assert(V && "Failed to pack!");
50007 return V;
50008 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50009 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50010 Subtarget);
50011 }
50012 if (SDValue SSatVal = detectSSatPattern(In, VT))
50013 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50014 Subtarget);
50015 }
50016
50017 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50018 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50019 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50020 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50021 unsigned TruncOpc = 0;
50022 SDValue SatVal;
50023 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50024 SatVal = SSatVal;
50025 TruncOpc = X86ISD::VTRUNCS;
50026 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50027 SatVal = USatVal;
50028 TruncOpc = X86ISD::VTRUNCUS;
50029 }
50030 if (SatVal) {
50031 unsigned ResElts = VT.getVectorNumElements();
50032 // If the input type is less than 512 bits and we don't have VLX, we need
50033 // to widen to 512 bits.
50034 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50035 unsigned NumConcats = 512 / InVT.getSizeInBits();
50036 ResElts *= NumConcats;
50037 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50038 ConcatOps[0] = SatVal;
50039 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50040 NumConcats * InVT.getVectorNumElements());
50041 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50042 }
50044      // Widen the result if it's narrower than 128 bits.
50044 if (ResElts * SVT.getSizeInBits() < 128)
50045 ResElts = 128 / SVT.getSizeInBits();
50046 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50047 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50048 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50049 DAG.getIntPtrConstant(0, DL));
50050 }
50051 }
50052
50053 return SDValue();
50054}
50055
50056/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50057 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
50058/// ISD::AVGCEILU (AVG) instruction.
50059static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50060 const X86Subtarget &Subtarget,
50061 const SDLoc &DL) {
50062 if (!VT.isVector())
50063 return SDValue();
50064 EVT InVT = In.getValueType();
50065 unsigned NumElems = VT.getVectorNumElements();
50066
50067 EVT ScalarVT = VT.getVectorElementType();
50068 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50069 return SDValue();
50070
50071  // InScalarVT is the intermediate type in the AVG pattern and it should be
50072  // wider than the original input type (i8/i16).
50073 EVT InScalarVT = InVT.getVectorElementType();
50074 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50075 return SDValue();
50076
50077 if (!Subtarget.hasSSE2())
50078 return SDValue();
50079
50080 // Detect the following pattern:
50081 //
50082 // %1 = zext <N x i8> %a to <N x i32>
50083 // %2 = zext <N x i8> %b to <N x i32>
50084 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50085 // %4 = add nuw nsw <N x i32> %3, %2
50086  //   %5 = lshr <N x i32> %4, <i32 1 x N>
50087 // %6 = trunc <N x i32> %5 to <N x i8>
50088 //
50089 // In AVX512, the last instruction can also be a trunc store.
50090 if (In.getOpcode() != ISD::SRL)
50091 return SDValue();
50092
50093 // A lambda checking the given SDValue is a constant vector and each element
50094 // is in the range [Min, Max].
50095 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50096 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50097 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50098 });
50099 };
50100
50101 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50102 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50103 return MaxActiveBits <= ScalarVT.getSizeInBits();
50104 };
50105
50106 // Check if each element of the vector is right-shifted by one.
50107 SDValue LHS = In.getOperand(0);
50108 SDValue RHS = In.getOperand(1);
50109 if (!IsConstVectorInRange(RHS, 1, 1))
50110 return SDValue();
50111 if (LHS.getOpcode() != ISD::ADD)
50112 return SDValue();
50113
50114 // Detect a pattern of a + b + 1 where the order doesn't matter.
50115 SDValue Operands[3];
50116 Operands[0] = LHS.getOperand(0);
50117 Operands[1] = LHS.getOperand(1);
50118
50119 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50120 ArrayRef<SDValue> Ops) {
50121 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50122 };
50123
50124 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50125 for (SDValue &Op : Ops)
50126 if (Op.getValueType() != VT)
50127 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50128 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50129 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50130 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50131 if (NumElemsPow2 != NumElems) {
50132 for (SDValue &Op : Ops) {
50133 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50134 for (unsigned i = 0; i != NumElems; ++i) {
50135 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50136 EltsOfOp[i] =
50137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50138 }
50139 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50140 }
50141 }
50142 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50143 if (NumElemsPow2 == NumElems)
50144 return Res;
50145 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50146 DAG.getIntPtrConstant(0, DL));
50147 };
50148
50149 // Take care of the case when one of the operands is a constant vector whose
50150 // element is in the range [1, 256].
50151 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50152 IsZExtLike(Operands[0])) {
50153    // The pattern is detected. Subtract one from the constant vector, then
50154    // demote it and emit the AVG instruction.
50155 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50156 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50157 return AVGSplitter({Operands[0], Operands[1]});
50158 }
50159
50160 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
50161  // Match the or case only if it's 'add-like' - can be replaced by an add.
50162 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50163 if (ISD::ADD == V.getOpcode()) {
50164 Op0 = V.getOperand(0);
50165 Op1 = V.getOperand(1);
50166 return true;
50167 }
50168 if (ISD::ZERO_EXTEND != V.getOpcode())
50169 return false;
50170 V = V.getOperand(0);
50171 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50172 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50173 return false;
50174 Op0 = V.getOperand(0);
50175 Op1 = V.getOperand(1);
50176 return true;
50177 };
50178
50179 SDValue Op0, Op1;
50180 if (FindAddLike(Operands[0], Op0, Op1))
50181 std::swap(Operands[0], Operands[1]);
50182 else if (!FindAddLike(Operands[1], Op0, Op1))
50183 return SDValue();
50184 Operands[2] = Op0;
50185 Operands[1] = Op1;
50186
50187 // Now we have three operands of two additions. Check that one of them is a
50188 // constant vector with ones, and the other two can be promoted from i8/i16.
50189 for (SDValue &Op : Operands) {
50190 if (!IsConstVectorInRange(Op, 1, 1))
50191 continue;
50192 std::swap(Op, Operands[2]);
50193
50194 // Check if Operands[0] and Operands[1] are results of type promotion.
50195 for (int j = 0; j < 2; ++j)
50196 if (Operands[j].getValueType() != VT)
50197 if (!IsZExtLike(Operands[j]))
50198 return SDValue();
50199
50200    // The pattern is detected, emit the AVG instruction(s).
50201 return AVGSplitter({Operands[0], Operands[1]});
50202 }
50203
50204 return SDValue();
50205}
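// [Editorial sketch, not part of the original file] The rounding-average
// identity the matcher above is built on; widening to 32 bits mirrors the
// zext/add/lshr/trunc IR sequence shown in the comment.
static uint8_t avgCeilRef(uint8_t A, uint8_t B) {
  // c = (a + b + 1) / 2, computed without overflow in a wider type; this is
  // what ISD::AVGCEILU (PAVGB/PAVGW) computes per element.
  return static_cast<uint8_t>((static_cast<uint32_t>(A) + B + 1) >> 1);
}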
50206
50207static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50208 TargetLowering::DAGCombinerInfo &DCI,
50209 const X86Subtarget &Subtarget) {
50210 LoadSDNode *Ld = cast<LoadSDNode>(N);
50211 EVT RegVT = Ld->getValueType(0);
50212 EVT MemVT = Ld->getMemoryVT();
50213 SDLoc dl(Ld);
50214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50215
50216 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50217 // into two 16-byte operations. Also split non-temporal aligned loads on
50218 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50219 ISD::LoadExtType Ext = Ld->getExtensionType();
50220 unsigned Fast;
50221 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50222 Ext == ISD::NON_EXTLOAD &&
50223 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50224 Ld->getAlign() >= Align(16)) ||
50225 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50226 *Ld->getMemOperand(), &Fast) &&
50227 !Fast))) {
50228 unsigned NumElems = RegVT.getVectorNumElements();
50229 if (NumElems < 2)
50230 return SDValue();
50231
50232 unsigned HalfOffset = 16;
50233 SDValue Ptr1 = Ld->getBasePtr();
50234 SDValue Ptr2 =
50235 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
50236 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50237 NumElems / 2);
50238 SDValue Load1 =
50239 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50240 Ld->getOriginalAlign(),
50241 Ld->getMemOperand()->getFlags());
50242 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50243 Ld->getPointerInfo().getWithOffset(HalfOffset),
50244 Ld->getOriginalAlign(),
50245 Ld->getMemOperand()->getFlags());
50246 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50247 Load1.getValue(1), Load2.getValue(1));
50248
50249 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50250 return DCI.CombineTo(N, NewVec, TF, true);
50251 }
50252
50253 // Bool vector load - attempt to cast to an integer, as we have good
50254 // (vXiY *ext(vXi1 bitcast(iX))) handling.
50255 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50256 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50257 unsigned NumElts = RegVT.getVectorNumElements();
50258 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50259 if (TLI.isTypeLegal(IntVT)) {
50260 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50261 Ld->getPointerInfo(),
50262 Ld->getOriginalAlign(),
50263 Ld->getMemOperand()->getFlags());
50264 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50265 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50266 }
50267 }
50268
50269 // If we also broadcast this as a subvector to a wider type, then just extract
50270 // the lowest subvector.
50271 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50272 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50273 SDValue Ptr = Ld->getBasePtr();
50274 SDValue Chain = Ld->getChain();
50275 for (SDNode *User : Ptr->uses()) {
50276 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50277 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
50278 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
50279 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
50280 MemVT.getSizeInBits() &&
50281 !User->hasAnyUseOfValue(1) &&
50282 User->getValueSizeInBits(0).getFixedValue() >
50283 RegVT.getFixedSizeInBits()) {
50284 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50285 RegVT.getSizeInBits());
50286 Extract = DAG.getBitcast(RegVT, Extract);
50287 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50288 }
50289 }
50290 }
50291
50292 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50293 unsigned AddrSpace = Ld->getAddressSpace();
50294 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50295 AddrSpace == X86AS::PTR32_UPTR) {
50296 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50297 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50298 SDValue Cast =
50299 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50300 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
50301 Ld->getOriginalAlign(),
50302 Ld->getMemOperand()->getFlags());
50303 }
50304 }
50305
50306 return SDValue();
50307}
50308
50309/// If V is a build vector of boolean constants and exactly one of those
50310/// constants is true, return the operand index of that true element.
50311/// Otherwise, return -1.
50312static int getOneTrueElt(SDValue V) {
50313 // This needs to be a build vector of booleans.
50314 // TODO: Checking for the i1 type matches the IR definition for the mask,
50315 // but the mask check could be loosened to i8 or other types. That might
50316 // also require checking more than 'allOnesValue'; eg, the x86 HW
50317 // instructions only require that the MSB is set for each mask element.
50318 // The ISD::MSTORE comments/definition do not specify how the mask operand
50319 // is formatted.
50320 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50321 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50322 return -1;
50323
50324 int TrueIndex = -1;
50325 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50326 for (unsigned i = 0; i < NumElts; ++i) {
50327 const SDValue &Op = BV->getOperand(i);
50328 if (Op.isUndef())
50329 continue;
50330 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50331 if (!ConstNode)
50332 return -1;
50333 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
50334 // If we already found a one, this is too many.
50335 if (TrueIndex >= 0)
50336 return -1;
50337 TrueIndex = i;
50338 }
50339 }
50340 return TrueIndex;
50341}
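// [Editorial sketch, not part of the original file] The same "exactly one true
// element" scan over a plain bool array, without the undef/constant handling.
static int getOneTrueEltRef(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned I = 0; I != NumElts; ++I) {
    if (!Mask[I])
      continue;
    if (TrueIndex >= 0)
      return -1; // More than one element is set.
    TrueIndex = I;
  }
  return TrueIndex; // -1 if no element is set.
}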
50342
50343/// Given a masked memory load/store operation, return true if it has one mask
50344/// bit set. If it has one mask bit set, then also return the memory address of
50345/// the scalar element to load/store, the vector index to insert/extract that
50346/// scalar element, and the alignment for the scalar memory access.
50347static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50348 SelectionDAG &DAG, SDValue &Addr,
50349 SDValue &Index, Align &Alignment,
50350 unsigned &Offset) {
50351 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50352 if (TrueMaskElt < 0)
50353 return false;
50354
50355 // Get the address of the one scalar element that is specified by the mask
50356 // using the appropriate offset from the base pointer.
50357 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50358 Offset = 0;
50359 Addr = MaskedOp->getBasePtr();
50360 if (TrueMaskElt != 0) {
50361 Offset = TrueMaskElt * EltVT.getStoreSize();
50362 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
50363 SDLoc(MaskedOp));
50364 }
50365
50366 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50367 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50368 EltVT.getStoreSize());
50369 return true;
50370}
50371
50372/// If exactly one element of the mask is set for a non-extending masked load,
50373/// it is a scalar load and vector insert.
50374/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50375/// mask have already been optimized in IR, so we don't bother with those here.
50376static SDValue
50377reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50378 TargetLowering::DAGCombinerInfo &DCI,
50379 const X86Subtarget &Subtarget) {
50380  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50381 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50382 // However, some target hooks may need to be added to know when the transform
50383 // is profitable. Endianness would also have to be considered.
50384
50385 SDValue Addr, VecIndex;
50386 Align Alignment;
50387 unsigned Offset;
50388 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50389 return SDValue();
50390
50391 // Load the one scalar element that is specified by the mask using the
50392 // appropriate offset from the base pointer.
50393 SDLoc DL(ML);
50394 EVT VT = ML->getValueType(0);
50395 EVT EltVT = VT.getVectorElementType();
50396
50397 EVT CastVT = VT;
50398 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50399 EltVT = MVT::f64;
50400 CastVT = VT.changeVectorElementType(EltVT);
50401 }
50402
50403 SDValue Load =
50404 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50405 ML->getPointerInfo().getWithOffset(Offset),
50406 Alignment, ML->getMemOperand()->getFlags());
50407
50408 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50409
50410 // Insert the loaded element into the appropriate place in the vector.
50411 SDValue Insert =
50412 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50413 Insert = DAG.getBitcast(VT, Insert);
50414 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50415}
50416
50417static SDValue
50418combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50419 TargetLowering::DAGCombinerInfo &DCI) {
50420  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50421 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50422 return SDValue();
50423
50424 SDLoc DL(ML);
50425 EVT VT = ML->getValueType(0);
50426
50427 // If we are loading the first and last elements of a vector, it is safe and
50428 // always faster to load the whole vector. Replace the masked load with a
50429 // vector load and select.
50430 unsigned NumElts = VT.getVectorNumElements();
50431 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50432 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50433 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50434 if (LoadFirstElt && LoadLastElt) {
50435 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50436 ML->getMemOperand());
50437 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50438 ML->getPassThru());
50439 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50440 }
50441
50442 // Convert a masked load with a constant mask into a masked load and a select.
50443 // This allows the select operation to use a faster kind of select instruction
50444 // (for example, vblendvps -> vblendps).
50445
50446 // Don't try this if the pass-through operand is already undefined. That would
50447 // cause an infinite loop because that's what we're about to create.
50448 if (ML->getPassThru().isUndef())
50449 return SDValue();
50450
50451 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50452 return SDValue();
50453
50454 // The new masked load has an undef pass-through operand. The select uses the
50455 // original pass-through operand.
50456 SDValue NewML = DAG.getMaskedLoad(
50457 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50458 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50459 ML->getAddressingMode(), ML->getExtensionType());
50460 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50461 ML->getPassThru());
50462
50463 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50464}
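// [Editorial illustration, not part of the original file] For a constant mask
// such as <1,0,0,1> on v4f32, the first rewrite above loads the whole vector
// and blends: masked_load(p, <1,0,0,1>, pass) becomes
// select(<1,0,0,1>, load(p), pass), which can typically be selected as a
// cheaper blend (e.g. vblendps) instead of a masked-move instruction.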
50465
50466static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
50467 TargetLowering::DAGCombinerInfo &DCI,
50468 const X86Subtarget &Subtarget) {
50469 auto *Mld = cast<MaskedLoadSDNode>(N);
50470
50471 // TODO: Expanding load with constant mask may be optimized as well.
50472 if (Mld->isExpandingLoad())
50473 return SDValue();
50474
50475 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
50476 if (SDValue ScalarLoad =
50477 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
50478 return ScalarLoad;
50479
50480 // TODO: Do some AVX512 subsets benefit from this transform?
50481 if (!Subtarget.hasAVX512())
50482 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50483 return Blend;
50484 }
50485
50486 // If the mask value has been legalized to a non-boolean vector, try to
50487 // simplify ops leading up to it. We only demand the MSB of each lane.
50488 SDValue Mask = Mld->getMask();
50489 if (Mask.getScalarValueSizeInBits() != 1) {
50490 EVT VT = Mld->getValueType(0);
50491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50492 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50493 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50494 if (N->getOpcode() != ISD::DELETED_NODE)
50495 DCI.AddToWorklist(N);
50496 return SDValue(N, 0);
50497 }
50498 if (SDValue NewMask =
50499 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50500 return DAG.getMaskedLoad(
50501 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50502 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50503 Mld->getAddressingMode(), Mld->getExtensionType());
50504 }
50505
50506 return SDValue();
50507}
50508
50509/// If exactly one element of the mask is set for a non-truncating masked store,
50510/// it is a vector extract and scalar store.
50511/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50512/// mask have already been optimized in IR, so we don't bother with those here.
50513static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50514 SelectionDAG &DAG,
50515 const X86Subtarget &Subtarget) {
50516 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50517 // However, some target hooks may need to be added to know when the transform
50518 // is profitable. Endianness would also have to be considered.
50519
50520 SDValue Addr, VecIndex;
50521 Align Alignment;
50522 unsigned Offset;
50523 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50524 return SDValue();
50525
50526 // Extract the one scalar element that is actually being stored.
50527 SDLoc DL(MS);
50528 SDValue Value = MS->getValue();
50529 EVT VT = Value.getValueType();
50530 EVT EltVT = VT.getVectorElementType();
50531 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50532 EltVT = MVT::f64;
50533 EVT CastVT = VT.changeVectorElementType(EltVT);
50534 Value = DAG.getBitcast(CastVT, Value);
50535 }
50536 SDValue Extract =
50537 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50538
50539 // Store that element at the appropriate offset from the base pointer.
50540 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50541 MS->getPointerInfo().getWithOffset(Offset),
50542 Alignment, MS->getMemOperand()->getFlags());
50543}
50544
50545static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50546 TargetLowering::DAGCombinerInfo &DCI,
50547 const X86Subtarget &Subtarget) {
50548 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50549 if (Mst->isCompressingStore())
50550 return SDValue();
50551
50552 EVT VT = Mst->getValue().getValueType();
50553 SDLoc dl(Mst);
50554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50555
50556 if (Mst->isTruncatingStore())
50557 return SDValue();
50558
50559 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50560 return ScalarStore;
50561
50562 // If the mask value has been legalized to a non-boolean vector, try to
50563 // simplify ops leading up to it. We only demand the MSB of each lane.
50564 SDValue Mask = Mst->getMask();
50565 if (Mask.getScalarValueSizeInBits() != 1) {
50566 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50567 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50568 if (N->getOpcode() != ISD::DELETED_NODE)
50569 DCI.AddToWorklist(N);
50570 return SDValue(N, 0);
50571 }
50572 if (SDValue NewMask =
50573 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50574 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50575 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50576 Mst->getMemoryVT(), Mst->getMemOperand(),
50577 Mst->getAddressingMode());
50578 }
50579
50580 SDValue Value = Mst->getValue();
50581 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50582 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50583 Mst->getMemoryVT())) {
50584 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50585 Mst->getBasePtr(), Mst->getOffset(), Mask,
50586 Mst->getMemoryVT(), Mst->getMemOperand(),
50587 Mst->getAddressingMode(), true);
50588 }
50589
50590 return SDValue();
50591}
50592
50593static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
50594 TargetLowering::DAGCombinerInfo &DCI,
50595 const X86Subtarget &Subtarget) {
50596 StoreSDNode *St = cast<StoreSDNode>(N);
50597 EVT StVT = St->getMemoryVT();
50598 SDLoc dl(St);
50599 SDValue StoredVal = St->getValue();
50600 EVT VT = StoredVal.getValueType();
50601 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50602
50603 // Convert a store of vXi1 into a store of iX and a bitcast.
50604 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50605 VT.getVectorElementType() == MVT::i1) {
50606
50607 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
50608 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50609
50610 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50611 St->getPointerInfo(), St->getOriginalAlign(),
50612 St->getMemOperand()->getFlags());
50613 }
50614
50615 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50616 // This will avoid a copy to k-register.
50617 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50618 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50619 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50620 SDValue Val = StoredVal.getOperand(0);
50621 // We must store zeros to the unused bits.
50622 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50623 return DAG.getStore(St->getChain(), dl, Val,
50624 St->getBasePtr(), St->getPointerInfo(),
50625 St->getOriginalAlign(),
50626 St->getMemOperand()->getFlags());
50627 }
50628
50629  // Widen v1i1/v2i1/v4i1 stores to v8i1.
50630 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50631 Subtarget.hasAVX512()) {
50632 unsigned NumConcats = 8 / VT.getVectorNumElements();
50633 // We must store zeros to the unused bits.
50634 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50635 Ops[0] = StoredVal;
50636 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50637 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50638 St->getPointerInfo(), St->getOriginalAlign(),
50639 St->getMemOperand()->getFlags());
50640 }
50641
50642 // Turn vXi1 stores of constants into a scalar store.
50643 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50644 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50645 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
50646    // If it's a v64i1 store without 64-bit support, we need two stores.
50647 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50648 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50649 StoredVal->ops().slice(0, 32));
50650 Lo = combinevXi1ConstantToInteger(Lo, DAG);
50651 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50652 StoredVal->ops().slice(32, 32));
50653 Hi = combinevXi1ConstantToInteger(Hi, DAG);
50654
50655 SDValue Ptr0 = St->getBasePtr();
50656 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
50657
50658 SDValue Ch0 =
50659 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50660 St->getOriginalAlign(),
50661 St->getMemOperand()->getFlags());
50662 SDValue Ch1 =
50663 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50664 St->getPointerInfo().getWithOffset(4),
50665 St->getOriginalAlign(),
50666 St->getMemOperand()->getFlags());
50667 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50668 }
50669
50670 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50671 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50672 St->getPointerInfo(), St->getOriginalAlign(),
50673 St->getMemOperand()->getFlags());
50674 }
50675
50676 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50677 // Sandy Bridge, perform two 16-byte stores.
50678 unsigned Fast;
50679 if (VT.is256BitVector() && StVT == VT &&
50680 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50681 *St->getMemOperand(), &Fast) &&
50682 !Fast) {
50683 unsigned NumElems = VT.getVectorNumElements();
50684 if (NumElems < 2)
50685 return SDValue();
50686
50687 return splitVectorStore(St, DAG);
50688 }
50689
50690 // Split under-aligned vector non-temporal stores.
50691 if (St->isNonTemporal() && StVT == VT &&
50692 St->getAlign().value() < VT.getStoreSize()) {
50693 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50694 // vectors or the legalizer can scalarize it to use MOVNTI.
50695 if (VT.is256BitVector() || VT.is512BitVector()) {
50696 unsigned NumElems = VT.getVectorNumElements();
50697 if (NumElems < 2)
50698 return SDValue();
50699 return splitVectorStore(St, DAG);
50700 }
50701
50702 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50703 // to use MOVNTI.
50704 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50705 MVT NTVT = Subtarget.hasSSE4A()
50706 ? MVT::v2f64
50707 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50708 return scalarizeVectorStore(St, NTVT, DAG);
50709 }
50710 }
50711
50712  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50713  // supported but AVX512F is, by extending to v16i32 and truncating.
50714 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50715 St->getValue().getOpcode() == ISD::TRUNCATE &&
50716 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50717 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
50718 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
50719 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
50720 St->getValue().getOperand(0));
50721 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
50722 MVT::v16i8, St->getMemOperand());
50723 }
50724
50725 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
50726 if (!St->isTruncatingStore() &&
50727 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
50728 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
50729 StoredVal.hasOneUse() &&
50730 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
50731 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
50732 return EmitTruncSStore(IsSigned, St->getChain(),
50733 dl, StoredVal.getOperand(0), St->getBasePtr(),
50734 VT, St->getMemOperand(), DAG);
50735 }
50736
50737  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
50738 if (!St->isTruncatingStore()) {
50739 auto IsExtractedElement = [](SDValue V) {
50740 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
50741 V = V.getOperand(0);
50742 unsigned Opc = V.getOpcode();
50743 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
50744 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
50745 V.getOperand(0).hasOneUse())
50746 return V.getOperand(0);
50747 return SDValue();
50748 };
50749 if (SDValue Extract = IsExtractedElement(StoredVal)) {
50750 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
50751 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
50752 SDValue Src = Trunc.getOperand(0);
50753 MVT DstVT = Trunc.getSimpleValueType();
50754 MVT SrcVT = Src.getSimpleValueType();
50755 unsigned NumSrcElts = SrcVT.getVectorNumElements();
50756 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
50757 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
50758 if (NumTruncBits == VT.getSizeInBits() &&
50759 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
50760 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
50761 TruncVT, St->getMemOperand());
50762 }
50763 }
50764 }
50765 }
50766
50767 // Optimize trunc store (of multiple scalars) to shuffle and store.
50768 // First, pack all of the elements in one place. Next, store to memory
50769 // in fewer chunks.
50770 if (St->isTruncatingStore() && VT.isVector()) {
50771    // Check if we can detect an AVG pattern from the truncation. If so,
50772    // replace the trunc store with a normal store of the result of the AVG
50773    // instruction.
50774 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
50775 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
50776 Subtarget, dl))
50777 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
50778 St->getPointerInfo(), St->getOriginalAlign(),
50779 St->getMemOperand()->getFlags());
50780
50781 if (TLI.isTruncStoreLegal(VT, StVT)) {
50782 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
50783 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
50784 dl, Val, St->getBasePtr(),
50785 St->getMemoryVT(), St->getMemOperand(), DAG);
50786 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
50787 DAG, dl))
50788 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
50789 dl, Val, St->getBasePtr(),
50790 St->getMemoryVT(), St->getMemOperand(), DAG);
50791 }
50792
50793 return SDValue();
50794 }
50795
50796 // Cast ptr32 and ptr64 pointers to the default address space before a store.
50797 unsigned AddrSpace = St->getAddressSpace();
50798 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50799 AddrSpace == X86AS::PTR32_UPTR) {
50800 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50801 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
50802 SDValue Cast =
50803 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
50804 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
50805 St->getPointerInfo(), St->getOriginalAlign(),
50806 St->getMemOperand()->getFlags(), St->getAAInfo());
50807 }
50808 }
50809
50810 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
50811 // the FP state in cases where an emms may be missing.
50812 // A preferable solution to the general problem is to figure out the right
50813 // places to insert EMMS. This qualifies as a quick hack.
50814
50815 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
50816 if (VT.getSizeInBits() != 64)
50817 return SDValue();
50818
50819 const Function &F = DAG.getMachineFunction().getFunction();
50820 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
50821 bool F64IsLegal =
50822 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
50823 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
50824 isa<LoadSDNode>(St->getValue()) &&
50825 cast<LoadSDNode>(St->getValue())->isSimple() &&
50826 St->getChain().hasOneUse() && St->isSimple()) {
50827 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
50828
50829 if (!ISD::isNormalLoad(Ld))
50830 return SDValue();
50831
50832 // Avoid the transformation if there are multiple uses of the loaded value.
50833 if (!Ld->hasNUsesOfValue(1, 0))
50834 return SDValue();
50835
50836 SDLoc LdDL(Ld);
50837 SDLoc StDL(N);
50838 // Lower to a single movq load/store pair.
50839 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
50840 Ld->getBasePtr(), Ld->getMemOperand());
50841
50842 // Make sure new load is placed in same chain order.
50843 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
50844 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
50845 St->getMemOperand());
50846 }
50847
50848 // This is similar to the above case, but here we handle a scalar 64-bit
50849 // integer store that is extracted from a vector on a 32-bit target.
50850 // If we have SSE2, then we can treat it like a floating-point double
50851 // to get past legalization. The execution dependencies fixup pass will
50852 // choose the optimal machine instruction for the store if this really is
50853 // an integer or v2f32 rather than an f64.
50854 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
50855 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
50856 SDValue OldExtract = St->getOperand(1);
50857 SDValue ExtOp0 = OldExtract.getOperand(0);
50858 unsigned VecSize = ExtOp0.getValueSizeInBits();
50859 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
50860 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
50861 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
50862 BitCast, OldExtract.getOperand(1));
50863 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
50864 St->getPointerInfo(), St->getOriginalAlign(),
50865 St->getMemOperand()->getFlags());
50866 }
50867
50868 return SDValue();
50869}
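// [Editorial note, not part of the original file] The last two blocks above
// funnel a 32-bit-mode i64 load->store (or extract->store) through f64 so a
// single 64-bit move pair is emitted instead of two 32-bit GPR memory
// operations; as the earlier comment notes, the execution-dependency fixup
// pass later picks the best domain for the final machine instruction.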
50870
50871static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
50872 TargetLowering::DAGCombinerInfo &DCI,
50873 const X86Subtarget &Subtarget) {
50874 auto *St = cast<MemIntrinsicSDNode>(N);
50875
50876 SDValue StoredVal = N->getOperand(1);
50877 MVT VT = StoredVal.getSimpleValueType();
50878 EVT MemVT = St->getMemoryVT();
50879
50880 // Figure out which elements we demand.
50881 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
50882 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
50883
50884 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50885 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
50886 if (N->getOpcode() != ISD::DELETED_NODE)
50887 DCI.AddToWorklist(N);
50888 return SDValue(N, 0);
50889 }
50890
50891 return SDValue();
50892}
50893
50894/// Return 'true' if this vector operation is "horizontal"
50895/// and return the operands for the horizontal operation in LHS and RHS. A
50896/// horizontal operation performs the binary operation on successive elements
50897/// of its first operand, then on successive elements of its second operand,
50898/// returning the resulting values in a vector. For example, if
50899/// A = < float a0, float a1, float a2, float a3 >
50900/// and
50901/// B = < float b0, float b1, float b2, float b3 >
50902/// then the result of doing a horizontal operation on A and B is
50903/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
50904/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
50905/// A horizontal-op B, for some already available A and B, and if so then LHS is
50906/// set to A, RHS to B, and the routine returns 'true'.
50907static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
50908 SelectionDAG &DAG, const X86Subtarget &Subtarget,
50909 bool IsCommutative,
50910 SmallVectorImpl<int> &PostShuffleMask) {
50911 // If either operand is undef, bail out. The binop should be simplified.
50912 if (LHS.isUndef() || RHS.isUndef())
50913 return false;
50914
50915 // Look for the following pattern:
50916 // A = < float a0, float a1, float a2, float a3 >
50917 // B = < float b0, float b1, float b2, float b3 >
50918 // and
50919 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
50920 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
50921 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
50922 // which is A horizontal-op B.
50923
50924 MVT VT = LHS.getSimpleValueType();
50925 assert((VT.is128BitVector() || VT.is256BitVector()) &&
50926 "Unsupported vector type for horizontal add/sub");
50927 unsigned NumElts = VT.getVectorNumElements();
50928
50929 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
50930 SmallVectorImpl<int> &ShuffleMask) {
50931 bool UseSubVector = false;
50932 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50933 Op.getOperand(0).getValueType().is256BitVector() &&
50934 llvm::isNullConstant(Op.getOperand(1))) {
50935 Op = Op.getOperand(0);
50936 UseSubVector = true;
50937 }
50938 SmallVector<SDValue, 2> SrcOps;
50939 SmallVector<int, 16> SrcMask, ScaledMask;
50940 SDValue BC = peekThroughBitcasts(Op);
50941 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
50942 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
50943 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
50944 })) {
50945 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
50946 if (!UseSubVector && SrcOps.size() <= 2 &&
50947 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
50948 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
50949 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
50950 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
50951 }
50952 if (UseSubVector && SrcOps.size() == 1 &&
50953 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
50954 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
50955 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
50956 ShuffleMask.assign(Mask.begin(), Mask.end());
50957 }
50958 }
50959 };
50960
50961 // View LHS in the form
50962 // LHS = VECTOR_SHUFFLE A, B, LMask
50963 // If LHS is not a shuffle, then pretend it is the identity shuffle:
50964 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
50965 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
50966 SDValue A, B;
50967 SmallVector<int, 16> LMask;
50968 GetShuffle(LHS, A, B, LMask);
50969
50970 // Likewise, view RHS in the form
50971 // RHS = VECTOR_SHUFFLE C, D, RMask
50972 SDValue C, D;
50973 SmallVector<int, 16> RMask;
50974 GetShuffle(RHS, C, D, RMask);
50975
50976 // At least one of the operands should be a vector shuffle.
50977 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
50978 if (NumShuffles == 0)
50979 return false;
50980
50981 if (LMask.empty()) {
50982 A = LHS;
50983 for (unsigned i = 0; i != NumElts; ++i)
50984 LMask.push_back(i);
50985 }
50986
50987 if (RMask.empty()) {
50988 C = RHS;
50989 for (unsigned i = 0; i != NumElts; ++i)
50990 RMask.push_back(i);
50991 }
50992
50993 // If we have a unary mask, ensure the other op is set to null.
50994 if (isUndefOrInRange(LMask, 0, NumElts))
50995 B = SDValue();
50996 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
50997 A = SDValue();
50998
50999 if (isUndefOrInRange(RMask, 0, NumElts))
51000 D = SDValue();
51001 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51002 C = SDValue();
51003
51004 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51005 // RHS operands and shuffle mask.
51006 if (A != C) {
51007 std::swap(C, D);
51008 ShuffleVectorSDNode::commuteMask(RMask);
51009 }
51010 // Check that the shuffles are both shuffling the same vectors.
51011 if (!(A == C && B == D))
51012 return false;
51013
51014 PostShuffleMask.clear();
51015 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51016
51017 // LHS and RHS are now:
51018 // LHS = shuffle A, B, LMask
51019 // RHS = shuffle A, B, RMask
51020 // Check that the masks correspond to performing a horizontal operation.
51021 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51022 // so we just repeat the inner loop if this is a 256-bit op.
51023 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51024 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51025 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51026 assert((NumEltsPer128BitChunk % 2 == 0) &&
51027 "Vector type should have an even number of elements in each lane");
51028 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51029 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51030 // Ignore undefined components.
51031 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51032 if (LIdx < 0 || RIdx < 0 ||
51033 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51034 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51035 continue;
51036
51037 // Check that successive odd/even elements are being operated on. If not,
51038 // this is not a horizontal operation.
51039 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51040 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51041 return false;
51042
51043 // Compute the post-shuffle mask index based on where the element
51044 // is stored in the HOP result, and where it needs to be moved to.
51045 int Base = LIdx & ~1u;
51046 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51047 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
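// Worked example (illustrative, v4f32 case): LIdx = 6 selects b2, so Base = 6
// and Index = (6 % 4) / 2 + ((6 % 4) & ~3) = 1; the adjustment below then adds
// NumEltsPer64BitChunk (2) because B is used and Base >= NumElts, giving
// Index = 3, the slot of b2 op b3 in <a0 op a1, a2 op a3, b0 op b1, b2 op b3>.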
51048
51049 // The low half of the 128-bit result must choose from A.
51050 // The high half of the 128-bit result must choose from B,
51051 // unless B is undef. In that case, we are always choosing from A.
51052 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51053 Index += NumEltsPer64BitChunk;
51054 PostShuffleMask[i + j] = Index;
51055 }
51056 }
51057
51058 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51059 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51060
51061 bool IsIdentityPostShuffle =
51062 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51063 if (IsIdentityPostShuffle)
51064 PostShuffleMask.clear();
51065
51066 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51067 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51068 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51069 return false;
51070
51071 // If the source nodes are already used in HorizOps then always accept this.
51072 // Shuffle folding should merge these back together.
51073 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51074 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51075 });
51076 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51077 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51078 });
51079 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51080
51081 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51082 // shuffle the result.
51083 if (!ForceHorizOp &&
51084 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51085 (NumShuffles < 2 || !IsIdentityPostShuffle),
51086 DAG, Subtarget))
51087 return false;
51088
51089 LHS = DAG.getBitcast(VT, NewLHS);
51090 RHS = DAG.getBitcast(VT, NewRHS);
51091 return true;
51092}
51093
51094// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51095static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51096 const X86Subtarget &Subtarget) {
51097 EVT VT = N->getValueType(0);
51098 unsigned Opcode = N->getOpcode();
51099 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51100 SmallVector<int, 8> PostShuffleMask;
51101
51102 switch (Opcode) {
51103 case ISD::FADD:
51104 case ISD::FSUB:
51105 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51106 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51107 SDValue LHS = N->getOperand(0);
51108 SDValue RHS = N->getOperand(1);
51109 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51110 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51111 PostShuffleMask)) {
51112 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51113 if (!PostShuffleMask.empty())
51114 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51115 DAG.getUNDEF(VT), PostShuffleMask);
51116 return HorizBinOp;
51117 }
51118 }
51119 break;
51120 case ISD::ADD:
51121 case ISD::SUB:
51122 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51123 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51124 SDValue LHS = N->getOperand(0);
51125 SDValue RHS = N->getOperand(1);
51126 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51127 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51128 PostShuffleMask)) {
51129 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51130 ArrayRef<SDValue> Ops) {
51131 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51132 };
51133 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51134 {LHS, RHS}, HOpBuilder);
51135 if (!PostShuffleMask.empty())
51136 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51137 DAG.getUNDEF(VT), PostShuffleMask);
51138 return HorizBinOp;
51139 }
51140 }
51141 break;
51142 }
51143
51144 return SDValue();
51145}
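// Illustrative sketch (not from the listed source): with SSE3 and v4f32,
//   LHS = vector_shuffle A, B, <0, 2, 4, 6>
//   RHS = vector_shuffle A, B, <1, 3, 5, 7>
//   fadd LHS, RHS
// is matched by isHorizontalBinOp and rebuilt as (X86ISD::FHADD A, B),
// followed by an optional shuffle when PostShuffleMask is not the identity.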
51146
51147// Try to combine the following nodes
51148// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51149// <i32 -2147483648[float -0.000000e+00]> 0
51150// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51151// <(load 4 from constant-pool)> t0, t29
51152// [t30: v16i32 = bitcast t27]
51153// t6: v16i32 = xor t7, t27[t30]
51154// t11: v16f32 = bitcast t6
51155// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51156// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51157// t22: v16f32 = bitcast t7
51158// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51159// t24: v32f16 = bitcast t23
51160static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51161 const X86Subtarget &Subtarget) {
51162 EVT VT = N->getValueType(0);
51163 SDValue LHS = N->getOperand(0);
51164 SDValue RHS = N->getOperand(1);
51165 int CombineOpcode =
51166 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51167 auto isConjugationConstant = [](const Constant *c) {
51168 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
51169 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51170 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51171 switch (CI->getBitWidth()) {
51172 case 16:
51173 return false;
51174 case 32:
51175 return CI->getValue() == ConjugationInt32;
51176 case 64:
51177 return CI->getValue() == ConjugationInt64;
51178 default:
51179 llvm_unreachable("Unexpected bit width")::llvm::llvm_unreachable_internal("Unexpected bit width", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 51179)
;
51180 }
51181 }
51182 if (const auto *CF = dyn_cast<ConstantFP>(c))
51183 return CF->isNegativeZeroValue();
51184 return false;
51185 };
51186 auto combineConjugation = [&](SDValue &r) {
51187 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51188 SDValue XOR = LHS.getOperand(0);
51189 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51190 SDValue XORRHS = XOR.getOperand(1);
51191 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
51192 XORRHS = XORRHS.getOperand(0);
51193 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
51194 XORRHS.getOperand(1).getNumOperands()) {
51195 ConstantPoolSDNode *CP =
51196 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
51197 if (CP && isConjugationConstant(CP->getConstVal())) {
51198 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51199 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51200 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51201 r = DAG.getBitcast(VT, FCMulC);
51202 return true;
51203 }
51204 }
51205 }
51206 }
51207 return false;
51208 };
51209 SDValue Res;
51210 if (combineConjugation(Res))
51211 return Res;
51212 std::swap(LHS, RHS);
51213 if (combineConjugation(Res))
51214 return Res;
51215 return Res;
51216}
51217
51218// Try to combine the following nodes:
51219// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51220static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51221 const X86Subtarget &Subtarget) {
51222 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51223 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51224 Flags.hasAllowContract();
51225 };
51226
51227 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51228 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51229 Flags.hasNoSignedZeros();
51230 };
51231 auto IsVectorAllNegativeZero = [](const SDNode *N) {
51232 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
51233 return false;
51234 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
51235 "Unexpected vector type!");
51236 if (ConstantPoolSDNode *CP =
51237 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
51238 APInt AI = APInt(32, 0x80008000, true);
51239 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
51240 return CI->getValue() == AI;
51241 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
51242 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
51243 }
51244 return false;
51245 };
51246
51247 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51248 !AllowContract(N->getFlags()))
51249 return SDValue();
51250
51251 EVT VT = N->getValueType(0);
51252 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51253 return SDValue();
51254
51255 SDValue LHS = N->getOperand(0);
51256 SDValue RHS = N->getOperand(1);
51257 bool IsConj;
51258 SDValue FAddOp1, MulOp0, MulOp1;
51259 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51260 &IsVectorAllNegativeZero,
51261 &HasNoSignedZero](SDValue N) -> bool {
51262 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51263 return false;
51264 SDValue Op0 = N.getOperand(0);
51265 unsigned Opcode = Op0.getOpcode();
51266 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51267 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51268 MulOp0 = Op0.getOperand(0);
51269 MulOp1 = Op0.getOperand(1);
51270 IsConj = Opcode == X86ISD::VFCMULC;
51271 return true;
51272 }
51273 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51274 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51275 HasNoSignedZero(Op0->getFlags())) ||
51276 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
51277 MulOp0 = Op0.getOperand(0);
51278 MulOp1 = Op0.getOperand(1);
51279 IsConj = Opcode == X86ISD::VFCMADDC;
51280 return true;
51281 }
51282 }
51283 return false;
51284 };
51285
51286 if (GetCFmulFrom(LHS))
51287 FAddOp1 = RHS;
51288 else if (GetCFmulFrom(RHS))
51289 FAddOp1 = LHS;
51290 else
51291 return SDValue();
51292
51293 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51294 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51295 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51296 // FIXME: How do we handle when fast math flags of FADD are different from
51297 // CFMUL's?
51298 SDValue CFmul =
51299 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51300 return DAG.getBitcast(VT, CFmul);
51301}
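// Illustrative sketch (assumed v32f16/v16f32 types, not from the listed source):
//   t1: v16f32 = X86ISD::VFMULC B, C
//   t2: v32f16 = bitcast t1
//   t3: v32f16 = fadd A, t2
// becomes
//   t4: v16f32 = X86ISD::VFMADDC B, C, (v16f32 bitcast A)
//   t5: v32f16 = bitcast t4
// provided contraction is allowed on both the fadd and the multiply.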
51302
51303/// Do target-specific dag combines on floating-point adds/subs.
51304static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51305 const X86Subtarget &Subtarget) {
51306 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51307 return HOp;
51308
51309 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51310 return COp;
51311
51312 return SDValue();
51313}
51314
51315/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51316/// the codegen.
51317/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51318/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51319/// anything that is guaranteed to be transformed by DAGCombiner.
51320static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51321 const X86Subtarget &Subtarget,
51322 const SDLoc &DL) {
51323 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51324 SDValue Src = N->getOperand(0);
51325 unsigned SrcOpcode = Src.getOpcode();
51326 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51327
51328 EVT VT = N->getValueType(0);
51329 EVT SrcVT = Src.getValueType();
51330
51331 auto IsFreeTruncation = [VT](SDValue Op) {
51332 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51333
51334 // See if this has been extended from a smaller/equal size to
51335 // the truncation size, allowing a truncation to combine with the extend.
51336 unsigned Opcode = Op.getOpcode();
51337 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51338 Opcode == ISD::ZERO_EXTEND) &&
51339 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51340 return true;
51341
51342 // See if this is a single use constant which can be constant folded.
51343 // NOTE: We don't peek through bitcasts here because there is currently
51344 // no support for constant folding truncate+bitcast+vector_of_constants. So
51345 // we'll just end up with a truncate on both operands which will
51346 // get turned back into (truncate (binop)) causing an infinite loop.
51347 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51348 };
51349
51350 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51351 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51352 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51353 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51354 };
51355
51356 // Don't combine if the operation has other uses.
51357 if (!Src.hasOneUse())
51358 return SDValue();
51359
51360 // Only support vector truncation for now.
51361 // TODO: i64 scalar math would benefit as well.
51362 if (!VT.isVector())
51363 return SDValue();
51364
51365 // In most cases it's only worth pre-truncating if we're only facing the cost
51366 // of one truncation.
51367 // i.e. if one of the inputs will constant fold or the input is repeated.
51368 switch (SrcOpcode) {
51369 case ISD::MUL:
51370 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51371 // better to truncate if we have the chance.
51372 if (SrcVT.getScalarType() == MVT::i64 &&
51373 TLI.isOperationLegal(SrcOpcode, VT) &&
51374 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51375 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51376 [[fallthrough]];
51377 case ISD::AND:
51378 case ISD::XOR:
51379 case ISD::OR:
51380 case ISD::ADD:
51381 case ISD::SUB: {
51382 SDValue Op0 = Src.getOperand(0);
51383 SDValue Op1 = Src.getOperand(1);
51384 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51385 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51386 return TruncateArithmetic(Op0, Op1);
51387 break;
51388 }
51389 }
51390
51391 return SDValue();
51392}
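// Illustrative sketch (not from the listed source): on a target where v4i32
// multiplies are legal but v4i64 multiplies are not,
//   (v4i32 trunc (v4i64 mul X, Y))
// is pre-truncated to
//   (v4i32 mul (trunc X), (trunc Y))
// so the expensive wide multiply never needs to be legalized.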
51393
51394/// Truncate using ISD::AND mask and X86ISD::PACKUS.
51395/// e.g. trunc <8 x i32> X to <8 x i16> -->
51396/// MaskX = X & 0xffff (clear high bits to prevent saturation)
51397/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
51398static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
51399 const X86Subtarget &Subtarget,
51400 SelectionDAG &DAG) {
51401 SDValue In = N->getOperand(0);
51402 EVT InVT = In.getValueType();
51403 EVT OutVT = N->getValueType(0);
51404
51405 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
51406 OutVT.getScalarSizeInBits());
51407 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
51408 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
51409}
51410
51411/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
51412static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
51413 const X86Subtarget &Subtarget,
51414 SelectionDAG &DAG) {
51415 SDValue In = N->getOperand(0);
51416 EVT InVT = In.getValueType();
51417 EVT OutVT = N->getValueType(0);
51418 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
51419 DAG.getValueType(OutVT));
51420 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
51421}
51422
51423/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
51424/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
51425/// legalization the truncation will be translated into a BUILD_VECTOR with each
51426/// element that is extracted from a vector and then truncated, and it is
51427/// difficult to do this optimization based on them.
51428static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
51429 const X86Subtarget &Subtarget) {
51430 EVT OutVT = N->getValueType(0);
51431 if (!OutVT.isVector())
51432 return SDValue();
51433
51434 SDValue In = N->getOperand(0);
51435 if (!In.getValueType().isSimple())
51436 return SDValue();
51437
51438 EVT InVT = In.getValueType();
51439 unsigned NumElems = OutVT.getVectorNumElements();
51440
51441 // AVX512 provides fast truncate ops.
51442 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
51443 return SDValue();
51444
51445 EVT OutSVT = OutVT.getVectorElementType();
51446 EVT InSVT = InVT.getVectorElementType();
51447 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
51448 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
51449 NumElems >= 8))
51450 return SDValue();
51451
51452 // SSSE3's pshufb results in fewer instructions in the cases below.
51453 if (Subtarget.hasSSSE3() && NumElems == 8) {
51454 if (InSVT == MVT::i16)
51455 return SDValue();
51456 if (InSVT == MVT::i32 &&
51457 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
51458 return SDValue();
51459 }
51460
51461 SDLoc DL(N);
51462 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
51463 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
51464 // truncate 2 x v4i32 to v8i16.
51465 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
51466 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
51467 if (InSVT == MVT::i32)
51468 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
51469
51470 return SDValue();
51471}
51472
51473/// This function transforms vector truncation of 'extended sign-bits' or
51474/// 'extended zero-bits' values.
51475/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
51476static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
51477 SelectionDAG &DAG,
51478 const X86Subtarget &Subtarget) {
51479 // Requires SSE2.
51480 if (!Subtarget.hasSSE2())
51481 return SDValue();
51482
51483 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
51484 return SDValue();
51485
51486 SDValue In = N->getOperand(0);
51487 if (!In.getValueType().isSimple())
51488 return SDValue();
51489
51490 MVT VT = N->getValueType(0).getSimpleVT();
51491 MVT SVT = VT.getScalarType();
51492
51493 MVT InVT = In.getValueType().getSimpleVT();
51494 MVT InSVT = InVT.getScalarType();
51495
51496 // Check we have a truncation suited for PACKSS/PACKUS.
51497 if (!isPowerOf2_32(VT.getVectorNumElements()))
51498 return SDValue();
51499 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
51500 return SDValue();
51501 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
51502 return SDValue();
51503
51504 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
51505 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
51506 return SDValue();
51507
51508 // AVX512 has fast truncate, but if the input is already going to be split,
51509 // there's no harm in trying pack.
51510 if (Subtarget.hasAVX512() &&
51511 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
51512 InVT.is512BitVector())) {
51513 // PACK should still be worth it for 128-bit vectors if the sources were
51514 // originally concatenated from subvectors.
51515 SmallVector<SDValue> ConcatOps;
51516 if (VT.getSizeInBits() > 128 ||
51517 !collectConcatOps(In.getNode(), ConcatOps, DAG))
51518 return SDValue();
51519 }
51520
51521 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
51522 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
51523
51524 // Use PACKUS if the input has zero-bits that extend all the way to the
51525 // packed/truncated value. e.g. masks, zext_in_reg, etc.
51526 KnownBits Known = DAG.computeKnownBits(In);
51527 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
51528 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
51529 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
51530
51531 // Use PACKSS if the input has sign-bits that extend all the way to the
51532 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
51533 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
51534
51535 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
51536 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
51537 // on and combines/simplifications can't then use it.
51538 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
51539 return SDValue();
51540
51541 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
51542 if (NumSignBits > MinSignBits)
51543 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
51544
51545 // If we have a srl that only generates signbits that we will discard in
51546 // the truncation then we can use PACKSS by converting the srl to a sra.
51547 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
51548 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
51549 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
51550 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
51551 if (*ShAmt == MinSignBits) {
51552 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
51553 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
51554 Subtarget);
51555 }
51556 }
51557
51558 return SDValue();
51559}
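// Illustrative sketch (not from the listed source): truncating a v8i32
// comparison result to v8i16 sees 32 sign bits per element, so the PACKSS
// path above is taken; a value already masked to its low 16 bits instead
// passes the leading-zero test (with SSE4.1) and is packed with PACKUS.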
51560
51561// Try to form a MULHU or MULHS node by looking for
51562// (trunc (srl (mul ext, ext), 16))
51563// TODO: This is X86 specific because we want to be able to handle wide types
51564// before type legalization. But we can only do it if the vector will be
51565// legalized via widening/splitting. Type legalization can't handle promotion
51566// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
51567// combiner.
51568static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
51569 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
51570 // First instruction should be a right shift of a multiply.
51571 if (Src.getOpcode() != ISD::SRL ||
51572 Src.getOperand(0).getOpcode() != ISD::MUL)
51573 return SDValue();
51574
51575 if (!Subtarget.hasSSE2())
51576 return SDValue();
51577
51578 // Only handle vXi16 types that are at least 128-bits unless they will be
51579 // widened.
51580 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
51581 return SDValue();
51582
51583 // Input type should be at least vXi32.
51584 EVT InVT = Src.getValueType();
51585 if (InVT.getVectorElementType().getSizeInBits() < 32)
51586 return SDValue();
51587
51588 // Need a shift by 16.
51589 APInt ShiftAmt;
51590 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
51591 ShiftAmt != 16)
51592 return SDValue();
51593
51594 SDValue LHS = Src.getOperand(0).getOperand(0);
51595 SDValue RHS = Src.getOperand(0).getOperand(1);
51596
51597 // Count leading sign/zero bits on both inputs - if there are enough then
51598 // truncation back to vXi16 will be cheap - either as a pack/shuffle
51599 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
51600 // truncations may actually be free by peeking through to the ext source.
51601 auto IsSext = [&DAG](SDValue V) {
51602 return DAG.ComputeMaxSignificantBits(V) <= 16;
51603 };
51604 auto IsZext = [&DAG](SDValue V) {
51605 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
51606 };
51607
51608 bool IsSigned = IsSext(LHS) && IsSext(RHS);
51609 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
51610 if (!IsSigned && !IsUnsigned)
51611 return SDValue();
51612
51613 // Check if both inputs are extensions, which will be removed by truncation.
51614 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
51615 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
51616 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
51617 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
51618 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
51619 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
51620
51621 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
51622 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
51623 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
51624 // will have to split anyway.
51625 unsigned InSizeInBits = InVT.getSizeInBits();
51626 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
51627 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
51628 (InSizeInBits % 16) == 0) {
51629 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51630 InVT.getSizeInBits() / 16);
51631 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
51632 DAG.getBitcast(BCVT, RHS));
51633 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
51634 }
51635
51636 // Truncate back to source type.
51637 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
51638 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
51639
51640 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
51641 return DAG.getNode(Opc, DL, VT, LHS, RHS);
51642}
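// Illustrative sketch (assumed v8i16/v8i32 types, not from the listed source):
//   (v8i16 trunc (v8i32 srl (mul (zext X:v8i16), (zext Y:v8i16)), 16))
// passes the sign/zero-bit checks above, the inputs are truncated back to
// v8i16, and the expression becomes (v8i16 mulhu X, Y) (or mulhs for
// sign-extended inputs).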
51643
51644// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
51645// from one vector with signed bytes from another vector, adds together
51646// adjacent pairs of 16-bit products, and saturates the result before
51647// truncating to 16-bits.
51648//
51649// Which looks something like this:
51650// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51651// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
51652static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51653 const X86Subtarget &Subtarget,
51654 const SDLoc &DL) {
51655 if (!VT.isVector() || !Subtarget.hasSSSE3())
51656 return SDValue();
51657
51658 unsigned NumElems = VT.getVectorNumElements();
51659 EVT ScalarVT = VT.getVectorElementType();
51660 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51661 return SDValue();
51662
51663 SDValue SSatVal = detectSSatPattern(In, VT);
51664 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51665 return SDValue();
51666
51667 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51668 // of multiplies from even/odd elements.
51669 SDValue N0 = SSatVal.getOperand(0);
51670 SDValue N1 = SSatVal.getOperand(1);
51671
51672 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51673 return SDValue();
51674
51675 SDValue N00 = N0.getOperand(0);
51676 SDValue N01 = N0.getOperand(1);
51677 SDValue N10 = N1.getOperand(0);
51678 SDValue N11 = N1.getOperand(1);
51679
51680 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51681 // Canonicalize zero_extend to LHS.
51682 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51683 std::swap(N00, N01);
51684 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51685 std::swap(N10, N11);
51686
51687 // Ensure we have a zero_extend and a sign_extend.
51688 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51689 N01.getOpcode() != ISD::SIGN_EXTEND ||
51690 N10.getOpcode() != ISD::ZERO_EXTEND ||
51691 N11.getOpcode() != ISD::SIGN_EXTEND)
51692 return SDValue();
51693
51694 // Peek through the extends.
51695 N00 = N00.getOperand(0);
51696 N01 = N01.getOperand(0);
51697 N10 = N10.getOperand(0);
51698 N11 = N11.getOperand(0);
51699
51700 // Ensure the extend is from vXi8.
51701 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51702 N01.getValueType().getVectorElementType() != MVT::i8 ||
51703 N10.getValueType().getVectorElementType() != MVT::i8 ||
51704 N11.getValueType().getVectorElementType() != MVT::i8)
51705 return SDValue();
51706
51707 // All inputs should be build_vectors.
51708 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51709 N01.getOpcode() != ISD::BUILD_VECTOR ||
51710 N10.getOpcode() != ISD::BUILD_VECTOR ||
51711 N11.getOpcode() != ISD::BUILD_VECTOR)
51712 return SDValue();
51713
51714 // N00/N10 are zero extended. N01/N11 are sign extended.
51715
51716 // For each element, we need to ensure we have an odd element from one vector
51717 // multiplied by the odd element of another vector and the even element from
51718 // one of the same vectors being multiplied by the even element from the
51719 // other vector. So we need to make sure for each element i, this operator
51720 // is being performed:
51721 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51722 SDValue ZExtIn, SExtIn;
51723 for (unsigned i = 0; i != NumElems; ++i) {
51724 SDValue N00Elt = N00.getOperand(i);
51725 SDValue N01Elt = N01.getOperand(i);
51726 SDValue N10Elt = N10.getOperand(i);
51727 SDValue N11Elt = N11.getOperand(i);
51728 // TODO: Be more tolerant of undefs.
51729 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51730 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51731 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51732 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51733 return SDValue();
51734 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51735 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51736 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51737 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51738 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51739 return SDValue();
51740 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51741 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51742 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51743 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51744 // Add is commutative so indices can be reordered.
51745 if (IdxN00 > IdxN10) {
51746 std::swap(IdxN00, IdxN10);
51747 std::swap(IdxN01, IdxN11);
51748 }
51749 // N0 indices must be the even element. N1 indices must be the next odd element.
51750 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51751 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51752 return SDValue();
51753 SDValue N00In = N00Elt.getOperand(0);
51754 SDValue N01In = N01Elt.getOperand(0);
51755 SDValue N10In = N10Elt.getOperand(0);
51756 SDValue N11In = N11Elt.getOperand(0);
51757 // The first time we find an input, capture it.
51758 if (!ZExtIn) {
51759 ZExtIn = N00In;
51760 SExtIn = N01In;
51761 }
51762 if (ZExtIn != N00In || SExtIn != N01In ||
51763 ZExtIn != N10In || SExtIn != N11In)
51764 return SDValue();
51765 }
51766
51767 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51768 ArrayRef<SDValue> Ops) {
51769 // Shrink by adding truncate nodes and let DAGCombine fold with the
51770 // sources.
51771 EVT InVT = Ops[0].getValueType();
51772 assert(InVT.getScalarType() == MVT::i8 &&
51773 "Unexpected scalar element type");
51774 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51775 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51776 InVT.getVectorNumElements() / 2);
51777 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51778 };
51779 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51780 PMADDBuilder);
51781}
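// Illustrative sketch (not from the listed source): for a v8i16 result the
// loop above accepts, per element i,
//   ssat(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// with A and B vXi8 build_vector sources, and emits (X86ISD::VPMADDUBSW A, B),
// split into 128-bit pieces by SplitOpsAndApply when needed.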
51782
51783static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51784 const X86Subtarget &Subtarget) {
51785 EVT VT = N->getValueType(0);
51786 SDValue Src = N->getOperand(0);
51787 SDLoc DL(N);
51788
51789 // Attempt to pre-truncate inputs to arithmetic ops instead.
51790 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51791 return V;
51792
51793 // Try to detect AVG pattern first.
51794 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51795 return Avg;
51796
51797 // Try to detect PMADD
51798 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51799 return PMAdd;
51800
51801 // Try to combine truncation with signed/unsigned saturation.
51802 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51803 return Val;
51804
51805 // Try to combine PMULHUW/PMULHW for vXi16.
51806 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51807 return V;
51808
51809 // The bitcast source is a direct mmx result.
51810 // Detect bitcasts between i32 to x86mmx
51811 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51812 SDValue BCSrc = Src.getOperand(0);
51813 if (BCSrc.getValueType() == MVT::x86mmx)
51814 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51815 }
51816
51817 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
51818 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
51819 return V;
51820
51821 return combineVectorTruncation(N, DAG, Subtarget);
51822}
51823
51824static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51825 TargetLowering::DAGCombinerInfo &DCI) {
51826 EVT VT = N->getValueType(0);
51827 SDValue In = N->getOperand(0);
51828 SDLoc DL(N);
51829
51830 if (SDValue SSatVal = detectSSatPattern(In, VT))
51831 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51832 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51833 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51834
51835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51836 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51837 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51838 return SDValue(N, 0);
51839
51840 return SDValue();
51841}
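// Illustrative sketch (assumed vXi8 result, not from the listed source): if
// the wide input is already clamped to the signed i8 range, e.g.
//   (X86ISD::VTRUNC (smin (smax X, -128), 127))
// then detectSSatPattern fires and the node becomes (X86ISD::VTRUNCS X).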
51842
51843/// Returns the negated value if the node \p N flips sign of FP value.
51844///
51845/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51846/// or FSUB(0, x)
51847/// AVX512F does not have FXOR, so FNEG is lowered as
51848/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51849 /// In this case we go through all bitcasts.
51850/// This also recognizes splat of a negated value and returns the splat of that
51851/// value.
51852static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51853 if (N->getOpcode() == ISD::FNEG)
51854 return N->getOperand(0);
51855
51856 // Don't recurse exponentially.
51857 if (Depth > SelectionDAG::MaxRecursionDepth)
51858 return SDValue();
51859
51860 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51861
51862 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51863 EVT VT = Op->getValueType(0);
51864
51865 // Make sure the element size doesn't change.
51866 if (VT.getScalarSizeInBits() != ScalarSize)
51867 return SDValue();
51868
51869 unsigned Opc = Op.getOpcode();
51870 switch (Opc) {
51871 case ISD::VECTOR_SHUFFLE: {
51872 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51873 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51874 if (!Op.getOperand(1).isUndef())
51875 return SDValue();
51876 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51877 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51878 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51879 cast<ShuffleVectorSDNode>(Op)->getMask());
51880 break;
51881 }
51882 case ISD::INSERT_VECTOR_ELT: {
51883 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
51884 // -V, INDEX).
51885 SDValue InsVector = Op.getOperand(0);
51886 SDValue InsVal = Op.getOperand(1);
51887 if (!InsVector.isUndef())
51888 return SDValue();
51889 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
51890 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
51891 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
51892 NegInsVal, Op.getOperand(2));
51893 break;
51894 }
51895 case ISD::FSUB:
51896 case ISD::XOR:
51897 case X86ISD::FXOR: {
51898 SDValue Op1 = Op.getOperand(1);
51899 SDValue Op0 = Op.getOperand(0);
51900
51901 // For XOR and FXOR, we want to check if constant
51902 // bits of Op1 are sign bit masks. For FSUB, we
51903 // have to check if constant bits of Op0 are sign
51904 // bit masks and hence we swap the operands.
51905 if (Opc == ISD::FSUB)
51906 std::swap(Op0, Op1);
51907
51908 APInt UndefElts;
51909 SmallVector<APInt, 16> EltBits;
51910 // Extract constant bits and see if they are all
51911 // sign bit masks. Ignore the undef elements.
51912 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
51913 /* AllowWholeUndefs */ true,
51914 /* AllowPartialUndefs */ false)) {
51915 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
51916 if (!UndefElts[I] && !EltBits[I].isSignMask())
51917 return SDValue();
51918
51919 // Only allow bitcast from correctly-sized constant.
51920 Op0 = peekThroughBitcasts(Op0);
51921 if (Op0.getScalarValueSizeInBits() == ScalarSize)
51922 return Op0;
51923 }
51924 break;
51925 } // case
51926 } // switch
51927
51928 return SDValue();
51929}
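// Illustrative sketch (not from the listed source): both
//   (X86ISD::FXOR X, <splat of 0x80000000>)   and   (fsub -0.0, X)
// are recognized above as negations of X, since the constant operand is a
// sign-bit mask in each element.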
51930
51931static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
51932 bool NegRes) {
51933 if (NegMul) {
51934 switch (Opcode) {
51935 default: llvm_unreachable("Unexpected opcode");
51936 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
51937 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
51938 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
51939 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
51940 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
51941 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
51942 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
51943 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
51944 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
51945 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
51946 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
51947 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
51948 }
51949 }
51950
51951 if (NegAcc) {
51952 switch (Opcode) {
51953 default: llvm_unreachable("Unexpected opcode");
51954 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
51955 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
51956 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51957 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
51958 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
51959 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51960 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
51961 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
51962 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51963 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
51964 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
51965 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51966 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
51967 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
51968 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
51969 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
51970 }
51971 }
51972
51973 if (NegRes) {
51974 switch (Opcode) {
51975 // For accuracy reasons, we never combine fneg and fma under strict FP.
51976 default: llvm_unreachable("Unexpected opcode");
51977 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
51978 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51979 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
51980 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51981 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
51982 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51983 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
51984 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51985 }
51986 }
51987
51988 return Opcode;
51989}
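// Illustrative usage (not from the listed source):
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false, /*NegRes=*/false)
// yields X86ISD::FNMADD, while
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true, /*NegRes=*/false)
// yields X86ISD::FMSUB, matching the switch tables above.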
51990
51991/// Do target-specific dag combines on floating point negations.
51992static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
51993 TargetLowering::DAGCombinerInfo &DCI,
51994 const X86Subtarget &Subtarget) {
51995 EVT OrigVT = N->getValueType(0);
51996 SDValue Arg = isFNEG(DAG, N);
51997 if (!Arg)
51998 return SDValue();
51999
52000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52001 EVT VT = Arg.getValueType();
52002 EVT SVT = VT.getScalarType();
52003 SDLoc DL(N);
52004
52005 // Let legalize expand this if it isn't a legal type yet.
52006 if (!TLI.isTypeLegal(VT))
52007 return SDValue();
52008
52009 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52010 // use of a constant by performing (-0 - A*B) instead.
52011 // FIXME: Check rounding control flags as well once it becomes available.
52012 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52013 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52014 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52015 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52016 Arg.getOperand(1), Zero);
52017 return DAG.getBitcast(OrigVT, NewNode);
52018 }
52019
52020 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52021 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52022 if (SDValue NegArg =
52023 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52024 return DAG.getBitcast(OrigVT, NegArg);
52025
52026 return SDValue();
52027}
52028
52029SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52030 bool LegalOperations,
52031 bool ForCodeSize,
52032 NegatibleCost &Cost,
52033 unsigned Depth) const {
52034 // fneg patterns are removable even if they have multiple uses.
52035 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52036 Cost = NegatibleCost::Cheaper;
52037 return DAG.getBitcast(Op.getValueType(), Arg);
52038 }
52039
52040 EVT VT = Op.getValueType();
52041 EVT SVT = VT.getScalarType();
52042 unsigned Opc = Op.getOpcode();
52043 SDNodeFlags Flags = Op.getNode()->getFlags();
52044 switch (Opc) {
52045 case ISD::FMA:
52046 case X86ISD::FMSUB:
52047 case X86ISD::FNMADD:
52048 case X86ISD::FNMSUB:
52049 case X86ISD::FMADD_RND:
52050 case X86ISD::FMSUB_RND:
52051 case X86ISD::FNMADD_RND:
52052 case X86ISD::FNMSUB_RND: {
52053 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52054 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52055 !isOperationLegal(ISD::FMA, VT))
52056 break;
52057
52058 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52059 // if it may have signed zeros.
52060 if (!Flags.hasNoSignedZeros())
52061 break;
52062
52063 // This is always negatible for free but we might be able to remove some
52064 // extra operand negations as well.
52065 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52066 for (int i = 0; i != 3; ++i)
52067 NewOps[i] = getCheaperNegatedExpression(
52068 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52069
52070 bool NegA = !!NewOps[0];
52071 bool NegB = !!NewOps[1];
52072 bool NegC = !!NewOps[2];
52073 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52074
52075 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52076 : NegatibleCost::Neutral;
52077
52078 // Fill in the non-negated ops with the original values.
52079 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52080 if (!NewOps[i])
52081 NewOps[i] = Op.getOperand(i);
52082 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52083 }
52084 case X86ISD::FRCP:
52085 if (SDValue NegOp0 =
52086 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52087 ForCodeSize, Cost, Depth + 1))
52088 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52089 break;
52090 }
52091
52092 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52093 ForCodeSize, Cost, Depth);
52094}
52095
52096static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52097 const X86Subtarget &Subtarget) {
52098 MVT VT = N->getSimpleValueType(0);
52099 // If we have integer vector types available, use the integer opcodes.
52100 if (!VT.isVector() || !Subtarget.hasSSE2())
52101 return SDValue();
52102
52103 SDLoc dl(N);
52104
52105 unsigned IntBits = VT.getScalarSizeInBits();
52106 MVT IntSVT = MVT::getIntegerVT(IntBits);
52107 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52108
52109 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52110 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52111 unsigned IntOpcode;
52112 switch (N->getOpcode()) {
52113 default: llvm_unreachable("Unexpected FP logic op");
52114 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52115 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52116 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52117 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52118 }
52119 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52120 return DAG.getBitcast(VT, IntOp);
52121}
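// Illustrative sketch (not from the listed source): with SSE2,
//   (X86ISD::FAND v4f32 X, Y)
// is lowered here as
//   (v4f32 bitcast (and v4i32 (bitcast X), (bitcast Y)))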
52122
52123
52124/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52125static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52126 if (N->getOpcode() != ISD::XOR)
52127 return SDValue();
52128
52129 SDValue LHS = N->getOperand(0);
52130 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52131 return SDValue();
52132
52133 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52134 X86::CondCode(LHS->getConstantOperandVal(0)));
52135 SDLoc DL(N);
52136 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52137}
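// Illustrative sketch (not from the listed source):
//   (xor (X86ISD::SETCC COND_E, flags), 1)
// becomes
//   (X86ISD::SETCC COND_NE, flags)
// by taking the opposite branch condition instead of xor'ing the result.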
52138
52139static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52140 TargetLowering::DAGCombinerInfo &DCI,
52141 const X86Subtarget &Subtarget) {
52142 SDValue N0 = N->getOperand(0);
52143 SDValue N1 = N->getOperand(1);
52144 EVT VT = N->getValueType(0);
52145
52146 // If this is SSE1 only convert to FXOR to avoid scalarization.
52147 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52148 return DAG.getBitcast(MVT::v4i32,
52149 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52150 DAG.getBitcast(MVT::v4f32, N0),
52151 DAG.getBitcast(MVT::v4f32, N1)));
52152 }
52153
52154 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52155 return Cmp;
52156
52157 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52158 return R;
52159
52160 if (SDValue R = combineBitOpWithShift(N, DAG))
52161 return R;
52162
52163 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52164 return FPLogic;
52165
52166 if (DCI.isBeforeLegalizeOps())
52167 return SDValue();
52168
52169 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52170 return SetCC;
52171
52172 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52173 return RV;
52174
52175 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52177 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52178 N0.getOperand(0).getValueType().isVector() &&
52179 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52180 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52181 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52182 N0.getOperand(0).getValueType()));
52183 }
52184
52185 // Handle AVX512 mask widening.
52186 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52187 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52188 VT.getVectorElementType() == MVT::i1 &&
52189 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52190 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52191 return DAG.getNode(
52192 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52193 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52194 N0.getOperand(2));
52195 }
52196
52197 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52198 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52199 // TODO: Under what circumstances could this be performed in DAGCombine?
52200 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52201 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52202 SDValue TruncExtSrc = N0.getOperand(0);
52203 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52204 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52205 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52206 SDLoc DL(N);
52207 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52208 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52209 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52210 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52211 }
52212 }
52213
52214 return combineFneg(N, DAG, DCI, Subtarget);
52215}
52216
52217static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
52218 TargetLowering::DAGCombinerInfo &DCI,
52219 const X86Subtarget &Subtarget) {
52220 EVT VT = N->getValueType(0);
52221 unsigned NumBits = VT.getSizeInBits();
52222
52223 // TODO - Constant Folding.
52224
52225 // Simplify the inputs.
52226 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52227 APInt DemandedMask(APInt::getAllOnes(NumBits));
52228 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52229 return SDValue(N, 0);
52230
52231 return SDValue();
52232}
52233
52234static bool isNullFPScalarOrVectorConst(SDValue V) {
52235 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52236}
52237
52238/// If a value is a scalar FP zero or a vector FP zero (potentially including
52239/// undefined elements), return a zero constant that may be used to fold away
52240/// that value. In the case of a vector, the returned constant will not contain
52241/// undefined elements even if the input parameter does. This makes it suitable
52242/// to be used as a replacement operand with operations (eg, bitwise-and) where
52243/// an undef should not propagate.
52244static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52245 const X86Subtarget &Subtarget) {
52246 if (!isNullFPScalarOrVectorConst(V))
52247 return SDValue();
52248
52249 if (V.getValueType().isVector())
52250 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52251
52252 return V;
52253}
52254
52255static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52256 const X86Subtarget &Subtarget) {
52257 SDValue N0 = N->getOperand(0);
52258 SDValue N1 = N->getOperand(1);
52259 EVT VT = N->getValueType(0);
52260 SDLoc DL(N);
52261
52262 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52263 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52264 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52265 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52266 return SDValue();
52267
52268 auto isAllOnesConstantFP = [](SDValue V) {
52269 if (V.getSimpleValueType().isVector())
52270 return ISD::isBuildVectorAllOnes(V.getNode());
52271 auto *C = dyn_cast<ConstantFPSDNode>(V);
52272 return C && C->getConstantFPValue()->isAllOnesValue();
52273 };
52274
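// An FP "constant" with an all-ones bit pattern is just a bitwise NOT mask, so
// fxor X, -1 acts as NOT(X) in the folds below.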
52275 // fand (fxor X, -1), Y --> fandn X, Y
52276 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52277 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52278
52279 // fand X, (fxor Y, -1) --> fandn Y, X
52280 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52281 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
52282
52283 return SDValue();
52284}
52285
52286/// Do target-specific dag combines on X86ISD::FAND nodes.
52287static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52288 const X86Subtarget &Subtarget) {
52289 // FAND(0.0, x) -> 0.0
52290 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52291 return V;
52292
52293 // FAND(x, 0.0) -> 0.0
52294 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52295 return V;
52296
52297 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52298 return V;
52299
52300 return lowerX86FPLogicOp(N, DAG, Subtarget);
52301}
52302
52303/// Do target-specific dag combines on X86ISD::FANDN nodes.
52304static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52305 const X86Subtarget &Subtarget) {
52306 // FANDN(0.0, x) -> x
52307 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52308 return N->getOperand(1);
52309
52310 // FANDN(x, 0.0) -> 0.0
52311 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52312 return V;
52313
52314 return lowerX86FPLogicOp(N, DAG, Subtarget);
52315}
52316
52317/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52318static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52319 TargetLowering::DAGCombinerInfo &DCI,
52320 const X86Subtarget &Subtarget) {
52321 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52322
52323 // F[X]OR(0.0, x) -> x
52324 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52325 return N->getOperand(1);
52326
52327 // F[X]OR(x, 0.0) -> x
52328 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52329 return N->getOperand(0);
52330
52331 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52332 return NewVal;
52333
52334 return lowerX86FPLogicOp(N, DAG, Subtarget);
52335}
52336
52337/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52338static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52339 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52340
52341 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52342 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52343 !DAG.getTarget().Options.NoSignedZerosFPMath)
52344 return SDValue();
52345
52346 // When NaNs and signed zeros can be ignored, convert the FMAX and FMIN nodes
52347 // into FMINC and FMAXC, which are commutative operations.
52348 unsigned NewOp = 0;
52349 switch (N->getOpcode()) {
52350 default: llvm_unreachable("unknown opcode");
52351 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52352 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52353 }
52354
52355 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52356 N->getOperand(0), N->getOperand(1));
52357}
52358
52359static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52360 const X86Subtarget &Subtarget) {
52361 EVT VT = N->getValueType(0);
52362 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
52363 return SDValue();
52364
52365 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52366
52367 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52368 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52369 (Subtarget.hasFP16() && VT == MVT::f16) ||
52370 (VT.isVector() && TLI.isTypeLegal(VT))))
52371 return SDValue();
52372
52373 SDValue Op0 = N->getOperand(0);
52374 SDValue Op1 = N->getOperand(1);
52375 SDLoc DL(N);
52376 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52377
52378 // If we don't have to respect NaN inputs, this is a direct translation to x86
52379 // min/max instructions.
52380 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52381 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52382
52383 // If one of the operands is known non-NaN use the native min/max instructions
52384 // with the non-NaN input as second operand.
52385 if (DAG.isKnownNeverNaN(Op1))
52386 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52387 if (DAG.isKnownNeverNaN(Op0))
52388 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52389
52390 // If we have to respect NaN inputs, this takes at least 3 instructions.
52391 // Favor a library call when operating on a scalar and minimizing code size.
52392 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52393 return SDValue();
52394
52395 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52396 VT);
52397
52398 // There are 4 possibilities involving NaN inputs, and these are the required
52399 // outputs:
52400 // Op1
52401 // Num NaN
52402 // ----------------
52403 // Num | Max | Op0 |
52404 // Op0 ----------------
52405 // NaN | Op1 | NaN |
52406 // ----------------
52407 //
52408 // The SSE FP max/min instructions were not designed for this case, but rather
52409 // to implement:
52410 // Min = Op1 < Op0 ? Op1 : Op0
52411 // Max = Op1 > Op0 ? Op1 : Op0
52412 //
52413 // So they always return Op0 if either input is a NaN. However, we can still
52414 // use those instructions for fmaxnum by selecting away a NaN input.
52415
52416 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
52417 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
52418 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
52419
52420 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
52421 // are NaN, the NaN value of Op1 is the result.
52422 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
52423}
52424
52425static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
52426 TargetLowering::DAGCombinerInfo &DCI) {
52427 EVT VT = N->getValueType(0);
52428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52429
52430 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
52431 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
52432 return SDValue(N, 0);
52433
52434 // Convert a full vector load into vzload when not all bits are needed.
52435 SDValue In = N->getOperand(0);
52436 MVT InVT = In.getSimpleValueType();
52437 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52438 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52439 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52440 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
52441 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52442 MVT MemVT = MVT::getIntegerVT(NumBits);
52443 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52444 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52445 SDLoc dl(N);
52446 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
52447 DAG.getBitcast(InVT, VZLoad));
52448 DCI.CombineTo(N, Convert);
52449 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52450 DCI.recursivelyDeleteUnusedNodes(LN);
52451 return SDValue(N, 0);
52452 }
52453 }
52454
52455 return SDValue();
52456}
52457
52458static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
52459 TargetLowering::DAGCombinerInfo &DCI) {
52460 bool IsStrict = N->isTargetStrictFPOpcode();
52461 EVT VT = N->getValueType(0);
52462
52463 // Convert a full vector load into vzload when not all bits are needed.
52464 SDValue In = N->getOperand(IsStrict ? 1 : 0);
52465 MVT InVT = In.getSimpleValueType();
52466 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52467 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52468 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52469 LoadSDNode *LN = cast<LoadSDNode>(In);
52470 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52471 MVT MemVT = MVT::getFloatingPointVT(NumBits);
52472 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52473 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52474 SDLoc dl(N);
52475 if (IsStrict) {
52476 SDValue Convert =
52477 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
52478 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
52479 DCI.CombineTo(N, Convert, Convert.getValue(1));
52480 } else {
52481 SDValue Convert =
52482 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
52483 DCI.CombineTo(N, Convert);
52484 }
52485 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52486 DCI.recursivelyDeleteUnusedNodes(LN);
52487 return SDValue(N, 0);
52488 }
52489 }
52490
52491 return SDValue();
52492}
52493
52494/// Do target-specific dag combines on X86ISD::ANDNP nodes.
52495static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
52496 TargetLowering::DAGCombinerInfo &DCI,
52497 const X86Subtarget &Subtarget) {
52498 SDValue N0 = N->getOperand(0);
52499 SDValue N1 = N->getOperand(1);
52500 MVT VT = N->getSimpleValueType(0);
52501 int NumElts = VT.getVectorNumElements();
52502 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52503
52504 // ANDNP(undef, x) -> 0
52505 // ANDNP(x, undef) -> 0
52506 if (N0.isUndef() || N1.isUndef())
52507 return DAG.getConstant(0, SDLoc(N), VT);
52508
52509 // ANDNP(0, x) -> x
52510 if (ISD::isBuildVectorAllZeros(N0.getNode()))
52511 return N1;
52512
52513 // ANDNP(x, 0) -> 0
52514 if (ISD::isBuildVectorAllZeros(N1.getNode()))
52515 return DAG.getConstant(0, SDLoc(N), VT);
52516
52517 // Turn ANDNP back to AND if input is inverted.
52518 if (SDValue Not = IsNOT(N0, DAG))
52519 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
52520
52521 // Constant Folding
52522 APInt Undefs0, Undefs1;
52523 SmallVector<APInt> EltBits0, EltBits1;
52524 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
52525 SDLoc DL(N);
52526 APInt ResultUndefs = APInt::getZero(NumElts);
52527
52528 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
52529 SmallVector<APInt> ResultBits;
52530 for (int I = 0; I != NumElts; ++I)
52531 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
52532 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
52533 }
52534
52535 // Constant fold NOT(N0) to allow us to use AND.
52536 // Ensure this is only performed if we can confirm that the bitcasted source
52537 // has a single use, to prevent an infinite loop with canonicalizeBitSelect.
52538 if (N0->hasOneUse()) {
52539 SDValue BC0 = peekThroughOneUseBitcasts(N0);
52540 if (BC0.getOpcode() != ISD::BITCAST) {
52541 for (APInt &Elt : EltBits0)
52542 Elt = ~Elt;
52543 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
52544 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
52545 }
52546 }
52547 }
52548
52549 // Attempt to recursively combine a bitmask ANDNP with shuffles.
52550 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52551 SDValue Op(N, 0);
52552 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52553 return Res;
52554
52555 // If either operand is a constant mask, then only the elements that aren't
52556 // zero are actually demanded by the other operand.
52557 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
52558 APInt UndefElts;
52559 SmallVector<APInt> EltBits;
52560 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
52561 APInt DemandedElts = APInt::getAllOnes(NumElts);
52562 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
52563 EltBits)) {
52564 DemandedBits.clearAllBits();
52565 DemandedElts.clearAllBits();
52566 for (int I = 0; I != NumElts; ++I) {
52567 if (UndefElts[I]) {
52568 // We can't assume an undef src element gives an undef dst - the
52569 // other src might be zero.
52570 DemandedBits.setAllBits();
52571 DemandedElts.setBit(I);
52572 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52573 (!Invert && !EltBits[I].isZero())) {
52574 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52575 DemandedElts.setBit(I);
52576 }
52577 }
52578 }
52579 return std::make_pair(DemandedBits, DemandedElts);
52580 };
52581 APInt Bits0, Elts0;
52582 APInt Bits1, Elts1;
52583 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52584 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52585
52586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52587 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52588 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52589 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52590 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52591 if (N->getOpcode() != ISD::DELETED_NODE)
52592 DCI.AddToWorklist(N);
52593 return SDValue(N, 0);
52594 }
52595 }
52596
52597 return SDValue();
52598}
52599
52600static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52601 TargetLowering::DAGCombinerInfo &DCI) {
52602 SDValue N1 = N->getOperand(1);
52603
52604 // BT ignores high bits in the bit index operand.
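// e.g. for a 32-bit index only the low Log2_32(32) = 5 bits (mask 0x1F) are
// demanded, which is the mask computed just below.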
52605 unsigned BitWidth = N1.getValueSizeInBits();
52606 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
52607 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52608 if (N->getOpcode() != ISD::DELETED_NODE)
52609 DCI.AddToWorklist(N);
52610 return SDValue(N, 0);
52611 }
52612
52613 return SDValue();
52614}
52615
52616static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52617 TargetLowering::DAGCombinerInfo &DCI) {
52618 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52619 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52620
52621 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52622 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52623 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52624 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52625 if (N->getOpcode() != ISD::DELETED_NODE)
52626 DCI.AddToWorklist(N);
52627 return SDValue(N, 0);
52628 }
52629
52630 // Convert a full vector load into vzload when not all bits are needed.
52631 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52632 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52633 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52634 SDLoc dl(N);
52635 if (IsStrict) {
52636 SDValue Convert = DAG.getNode(
52637 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52638 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52639 DCI.CombineTo(N, Convert, Convert.getValue(1));
52640 } else {
52641 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52642 DAG.getBitcast(MVT::v8i16, VZLoad));
52643 DCI.CombineTo(N, Convert);
52644 }
52645
52646 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52647 DCI.recursivelyDeleteUnusedNodes(LN);
52648 return SDValue(N, 0);
52649 }
52650 }
52651 }
52652
52653 return SDValue();
52654}
52655
52656// Try to combine sext_in_reg of a cmov of constants by extending the constants.
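// e.g. sext_in_reg(cmov C1, C2, cond), i8 -> cmov(sext_in_reg C1, sext_in_reg C2, cond),
// where both sext_in_reg operations on the constants then fold away.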
52657static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52658 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52659
52660 EVT DstVT = N->getValueType(0);
52661
52662 SDValue N0 = N->getOperand(0);
52663 SDValue N1 = N->getOperand(1);
52664 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52665
52666 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52667 return SDValue();
52668
52669 // Look through single use any_extends / truncs.
52670 SDValue IntermediateBitwidthOp;
52671 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52672 N0.hasOneUse()) {
52673 IntermediateBitwidthOp = N0;
52674 N0 = N0.getOperand(0);
52675 }
52676
52677 // See if we have a single use cmov.
52678 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52679 return SDValue();
52680
52681 SDValue CMovOp0 = N0.getOperand(0);
52682 SDValue CMovOp1 = N0.getOperand(1);
52683
52684 // Make sure both operands are constants.
52685 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52686 !isa<ConstantSDNode>(CMovOp1.getNode()))
52687 return SDValue();
52688
52689 SDLoc DL(N);
52690
52691 // If we looked through an any_extend/trunc above, apply the same op to the constants.
52692 if (IntermediateBitwidthOp) {
52693 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52694 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52695 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52696 }
52697
52698 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52699 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
52700
52701 EVT CMovVT = DstVT;
52702 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52703 if (DstVT == MVT::i16) {
52704 CMovVT = MVT::i32;
52705 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52706 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52707 }
52708
52709 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52710 N0.getOperand(2), N0.getOperand(3));
52711
52712 if (CMovVT != DstVT)
52713 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52714
52715 return CMov;
52716}
52717
52718static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52719 const X86Subtarget &Subtarget) {
52720 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52721
52722 if (SDValue V = combineSextInRegCmov(N, DAG))
52723 return V;
52724
52725 EVT VT = N->getValueType(0);
52726 SDValue N0 = N->getOperand(0);
52727 SDValue N1 = N->getOperand(1);
52728 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52729 SDLoc dl(N);
52730
52731 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
52732 // since there is no sign-extended shift right operation on a vector with
52733 // 64-bit elements.
52734 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
52735 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
52736 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52737 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52738 SDValue N00 = N0.getOperand(0);
52739
52740 // EXTLOAD has a better solution on AVX2: it may be replaced with an
52741 // X86ISD::VSEXT node.
52742 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52743 if (!ISD::isNormalLoad(N00.getNode()))
52744 return SDValue();
52745
52746 // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
52747 // gets in the way.
52748 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
52749 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52750
52751 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52752 SDValue Tmp =
52753 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52754 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52755 }
52756 }
52757 return SDValue();
52758}
52759
52760/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52761/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52762/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52763/// opportunities to combine math ops, use an LEA, or use a complex addressing
52764/// mode. This can eliminate extend, add, and shift instructions.
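/// For example, (i64 sext (add nsw i32 %x, 7)) becomes
/// (add nsw (i64 sext %x), 7), which add/shl users can then fold into an LEA
/// or a complex addressing mode.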
52765static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
52766 const X86Subtarget &Subtarget) {
52767 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
52768 Ext->getOpcode() != ISD::ZERO_EXTEND)
52769 return SDValue();
52770
52771 // TODO: This should be valid for other integer types.
52772 EVT VT = Ext->getValueType(0);
52773 if (VT != MVT::i64)
52774 return SDValue();
52775
52776 SDValue Add = Ext->getOperand(0);
52777 if (Add.getOpcode() != ISD::ADD)
52778 return SDValue();
52779
52780 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
52781 bool NSW = Add->getFlags().hasNoSignedWrap();
52782 bool NUW = Add->getFlags().hasNoUnsignedWrap();
52783
52784 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
52785 // into the 'zext'.
52786 if ((Sext && !NSW) || (!Sext && !NUW))
52787 return SDValue();
52788
52789 // Having a constant operand to the 'add' ensures that we are not increasing
52790 // the instruction count because the constant is extended for free below.
52791 // A constant operand can also become the displacement field of an LEA.
52792 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
52793 if (!AddOp1)
52794 return SDValue();
52795
52796 // Don't make the 'add' bigger if there's no hope of combining it with some
52797 // other 'add' or 'shl' instruction.
52798 // TODO: It may be profitable to generate simpler LEA instructions in place
52799 // of single 'add' instructions, but the cost model for selecting an LEA
52800 // currently has a high threshold.
52801 bool HasLEAPotential = false;
52802 for (auto *User : Ext->uses()) {
52803 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
52804 HasLEAPotential = true;
52805 break;
52806 }
52807 }
52808 if (!HasLEAPotential)
52809 return SDValue();
52810
52811 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
52812 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
52813 SDValue AddOp0 = Add.getOperand(0);
52814 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
52815 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
52816
52817 // The wider add is guaranteed to not wrap because both operands are
52818 // extended (sign- or zero-extended to match the hoisted extend).
52819 SDNodeFlags Flags;
52820 Flags.setNoSignedWrap(NSW);
52821 Flags.setNoUnsignedWrap(NUW);
52822 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
52823}
52824
52825// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
52826// operands and the result of CMOV is not used anywhere else - promote CMOV
52827// itself instead of promoting its result. This could be beneficial, because:
52828// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
52829// (or more) pseudo-CMOVs only when they go one-after-another and
52830// getting rid of result extension code after CMOV will help that.
52831// 2) Promotion of constant CMOV arguments is free, hence the
52832// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
52833// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
52834// promotion is also good in terms of code-size.
52835// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
52836// promotion).
52837static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
52838 SDValue CMovN = Extend->getOperand(0);
52839 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
52840 return SDValue();
52841
52842 EVT TargetVT = Extend->getValueType(0);
52843 unsigned ExtendOpcode = Extend->getOpcode();
52844 SDLoc DL(Extend);
52845
52846 EVT VT = CMovN.getValueType();
52847 SDValue CMovOp0 = CMovN.getOperand(0);
52848 SDValue CMovOp1 = CMovN.getOperand(1);
52849
52850 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52851 !isa<ConstantSDNode>(CMovOp1.getNode()))
52852 return SDValue();
52853
52854 // Only extend to i32 or i64.
52855 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
52856 return SDValue();
52857
52858 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
52859 // are free.
52860 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
52861 return SDValue();
52862
52863 // If this is a zero extend to i64, we should only extend to i32 and use a free
52864 // zero extend to finish.
52865 EVT ExtendVT = TargetVT;
52866 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
52867 ExtendVT = MVT::i32;
52868
52869 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
52870 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
52871
52872 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
52873 CMovN.getOperand(2), CMovN.getOperand(3));
52874
52875 // Finish extending if needed.
52876 if (ExtendVT != TargetVT)
52877 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
52878
52879 return Res;
52880}
52881
52882 // Attempt to combine a (sext/zext (setcc)) to a setcc with an xmm/ymm/zmm
52883// result type.
52884static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
52885 const X86Subtarget &Subtarget) {
52886 SDValue N0 = N->getOperand(0);
52887 EVT VT = N->getValueType(0);
52888 SDLoc dl(N);
52889
52890 // Only do this combine with AVX512 for vector extends.
52891 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
52892 return SDValue();
52893
52894 // Only combine legal element types.
52895 EVT SVT = VT.getVectorElementType();
52896 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
52897 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
52898 return SDValue();
52899
52900 // We don't have a CMPP instruction for vXf16.
52901 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
52902 return SDValue();
52903 // We can only do this if the vector size is 256 bits or less.
52904 unsigned Size = VT.getSizeInBits();
52905 if (Size > 256 && Subtarget.useAVX512Regs())
52906 return SDValue();
52907
52908 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
52909 // those are the only integer compares we have.
52910 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
52911 if (ISD::isUnsignedIntSetCC(CC))
52912 return SDValue();
52913
52914 // Only do this combine if the extension will be fully consumed by the setcc.
52915 EVT N00VT = N0.getOperand(0).getValueType();
52916 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
52917 if (Size != MatchingVecType.getSizeInBits())
52918 return SDValue();
52919
52920 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
52921
52922 if (N->getOpcode() == ISD::ZERO_EXTEND)
52923 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
52924
52925 return Res;
52926}
52927
52928static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
52929 TargetLowering::DAGCombinerInfo &DCI,
52930 const X86Subtarget &Subtarget) {
52931 SDValue N0 = N->getOperand(0);
52932 EVT VT = N->getValueType(0);
52933 SDLoc DL(N);
52934
52935 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
52936 if (!DCI.isBeforeLegalizeOps() &&
52937 N0.getOpcode() == X86ISD::SETCC_CARRY) {
52938 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
52939 N0->getOperand(1));
52940 bool ReplaceOtherUses = !N0.hasOneUse();
52941 DCI.CombineTo(N, Setcc);
52942 // Replace other uses with a truncate of the widened setcc_carry.
52943 if (ReplaceOtherUses) {
52944 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
52945 N0.getValueType(), Setcc);
52946 DCI.CombineTo(N0.getNode(), Trunc);
52947 }
52948
52949 return SDValue(N, 0);
52950 }
52951
52952 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
52953 return NewCMov;
52954
52955 if (!DCI.isBeforeLegalizeOps())
52956 return SDValue();
52957
52958 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
52959 return V;
52960
52961 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
52962 DAG, DCI, Subtarget))
52963 return V;
52964
52965 if (VT.isVector()) {
52966 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
52967 return R;
52968
52969 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
52970 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
52971 }
52972
52973 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
52974 return NewAdd;
52975
52976 return SDValue();
52977}
52978
52979static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
52980 TargetLowering::DAGCombinerInfo &DCI,
52981 const X86Subtarget &Subtarget) {
52982 SDLoc dl(N);
52983 EVT VT = N->getValueType(0);
52984 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
52985
52986 // Let legalize expand this if it isn't a legal type yet.
52987 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52988 if (!TLI.isTypeLegal(VT))
52989 return SDValue();
52990
52991 SDValue A = N->getOperand(IsStrict ? 1 : 0);
52992 SDValue B = N->getOperand(IsStrict ? 2 : 1);
52993 SDValue C = N->getOperand(IsStrict ? 3 : 2);
52994
52995 // If the operation allows fast-math and the target does not support FMA,
52996 // split this into mul+add to avoid libcall(s).
52997 SDNodeFlags Flags = N->getFlags();
52998 if (!IsStrict && Flags.hasAllowReassociation() &&
52999 TLI.isOperationExpand(ISD::FMA, VT)) {
53000 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53001 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53002 }
53003
53004 EVT ScalarVT = VT.getScalarType();
53005 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53006 !Subtarget.hasAnyFMA()) &&
53007 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53008 return SDValue();
53009
53010 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53011 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53012 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53013 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53014 CodeSize)) {
53015 V = NegV;
53016 return true;
53017 }
53018 // Look through extract_vector_elts. If it comes from an FNEG, create a
53019 // new extract from the FNEG input.
53020 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53021 isNullConstant(V.getOperand(1))) {
53022 SDValue Vec = V.getOperand(0);
53023 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53024 Vec, DAG, LegalOperations, CodeSize)) {
53025 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53026 NegV, V.getOperand(1));
53027 return true;
53028 }
53029 }
53030
53031 return false;
53032 };
53033
53034 // Do not convert the passthru input of scalar intrinsics.
53035 // FIXME: We could allow negations of the lower element only.
53036 bool NegA = invertIfNegative(A);
53037 bool NegB = invertIfNegative(B);
53038 bool NegC = invertIfNegative(C);
53039
53040 if (!NegA && !NegB && !NegC)
53041 return SDValue();
53042
53043 unsigned NewOpcode =
53044 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
53045
53046 // Propagate fast-math-flags to new FMA node.
53047 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53048 if (IsStrict) {
53049 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53050 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53051 {N->getOperand(0), A, B, C});
53052 } else {
53053 if (N->getNumOperands() == 4)
53054 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53055 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53056 }
53057}
53058
53059// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53060// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53061static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53062 TargetLowering::DAGCombinerInfo &DCI) {
53063 SDLoc dl(N);
53064 EVT VT = N->getValueType(0);
53065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53066 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53067 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53068
53069 SDValue N2 = N->getOperand(2);
53070
53071 SDValue NegN2 =
53072 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53073 if (!NegN2)
53074 return SDValue();
53075 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53076
53077 if (N->getNumOperands() == 4)
53078 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53079 NegN2, N->getOperand(3));
53080 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53081 NegN2);
53082}
53083
53084static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53085 TargetLowering::DAGCombinerInfo &DCI,
53086 const X86Subtarget &Subtarget) {
53087 SDLoc dl(N);
53088 SDValue N0 = N->getOperand(0);
53089 EVT VT = N->getValueType(0);
53090
53091 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53092 // FIXME: Is this needed? We don't seem to have any tests for it.
53093 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53094 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53095 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53096 N0->getOperand(1));
53097 bool ReplaceOtherUses = !N0.hasOneUse();
53098 DCI.CombineTo(N, Setcc);
53099 // Replace other uses with a truncate of the widened setcc_carry.
53100 if (ReplaceOtherUses) {
53101 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53102 N0.getValueType(), Setcc);
53103 DCI.CombineTo(N0.getNode(), Trunc);
53104 }
53105
53106 return SDValue(N, 0);
53107 }
53108
53109 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53110 return NewCMov;
53111
53112 if (DCI.isBeforeLegalizeOps())
53113 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53114 return V;
53115
53116 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53117 DAG, DCI, Subtarget))
53118 return V;
53119
53120 if (VT.isVector())
53121 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
53122 return R;
53123
53124 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53125 return NewAdd;
53126
53127 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53128 return R;
53129
53130 // TODO: Combine with any target/faux shuffle.
53131 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53132 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
53133 SDValue N00 = N0.getOperand(0);
53134 SDValue N01 = N0.getOperand(1);
53135 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53136 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53137 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53138 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53139 return concatSubVectors(N00, N01, DAG, dl);
53140 }
53141 }
53142
53143 return SDValue();
53144}
53145
53146/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
53147/// recognizable memcmp expansion.
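/// e.g. (or (or (xor a0, b0), (xor a1, b1)), (xor a2, b2)) compared to zero,
/// as produced when memcmp of wide buffers is expanded to integer loads.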
53148static bool isOrXorXorTree(SDValue X, bool Root = true) {
53149 if (X.getOpcode() == ISD::OR)
53150 return isOrXorXorTree(X.getOperand(0), false) &&
53151 isOrXorXorTree(X.getOperand(1), false);
53152 if (Root)
53153 return false;
53154 return X.getOpcode() == ISD::XOR;
53155}
53156
53157/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
53158/// expansion.
53159template <typename F>
53160static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
53161 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
53162 SDValue Op0 = X.getOperand(0);
53163 SDValue Op1 = X.getOperand(1);
53164 if (X.getOpcode() == ISD::OR) {
53165 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
53166 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
53167 if (VecVT != CmpVT)
53168 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
53169 if (HasPT)
53170 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
53171 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
53172 }
53173 if (X.getOpcode() == ISD::XOR) {
53174 SDValue A = SToV(Op0);
53175 SDValue B = SToV(Op1);
53176 if (VecVT != CmpVT)
53177 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
53178 if (HasPT)
53179 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
53180 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
53181 }
53182 llvm_unreachable("Impossible");
53183}
53184
53185/// Try to map a 128-bit or larger integer comparison to vector instructions
53186/// before type legalization splits it up into chunks.
53187static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
53188 const X86Subtarget &Subtarget) {
53189 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
53190 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
53191
53192 // We're looking for an oversized integer equality comparison.
53193 SDValue X = SetCC->getOperand(0);
53194 SDValue Y = SetCC->getOperand(1);
53195 EVT OpVT = X.getValueType();
53196 unsigned OpSize = OpVT.getSizeInBits();
53197 if (!OpVT.isScalarInteger() || OpSize < 128)
53198 return SDValue();
53199
53200 // Ignore a comparison with zero because that gets special treatment in
53201 // EmitTest(). But make an exception for the special case of a pair of
53202 // logically-combined vector-sized operands compared to zero. This pattern may
53203 // be generated by the memcmp expansion pass with oversized integer compares
53204 // (see PR33325).
53205 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
53206 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
53207 return SDValue();
53208
53209 // Don't perform this combine if constructing the vector will be expensive.
53210 auto IsVectorBitCastCheap = [](SDValue X) {
53211 X = peekThroughBitcasts(X);
53212 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
53213 X.getOpcode() == ISD::LOAD;
53214 };
53215 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
53216 !IsOrXorXorTreeCCZero)
53217 return SDValue();
53218
53219 EVT VT = SetCC->getValueType(0);
53220 SDLoc DL(SetCC);
53221
53222 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
53223 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
53224 // Otherwise use PCMPEQ (plus AND) and mask testing.
53225 bool NoImplicitFloatOps =
53226 DAG.getMachineFunction().getFunction().hasFnAttribute(
53227 Attribute::NoImplicitFloat);
53228 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
53229 ((OpSize == 128 && Subtarget.hasSSE2()) ||
53230 (OpSize == 256 && Subtarget.hasAVX()) ||
53231 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
53232 bool HasPT = Subtarget.hasSSE41();
53233
53234 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
53235 // vector registers are essentially free. (Technically, widening registers
53236 // prevents load folding, but the tradeoff is worth it.)
53237 bool PreferKOT = Subtarget.preferMaskRegisters();
53238 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
53239
53240 EVT VecVT = MVT::v16i8;
53241 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
53242 if (OpSize == 256) {
53243 VecVT = MVT::v32i8;
53244 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
53245 }
53246 EVT CastVT = VecVT;
53247 bool NeedsAVX512FCast = false;
53248 if (OpSize == 512 || NeedZExt) {
53249 if (Subtarget.hasBWI()) {
53250 VecVT = MVT::v64i8;
53251 CmpVT = MVT::v64i1;
53252 if (OpSize == 512)
53253 CastVT = VecVT;
53254 } else {
53255 VecVT = MVT::v16i32;
53256 CmpVT = MVT::v16i1;
53257 CastVT = OpSize == 512 ? VecVT :
53258 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
53259 NeedsAVX512FCast = true;
53260 }
53261 }
53262
53263 auto ScalarToVector = [&](SDValue X) -> SDValue {
53264 bool TmpZext = false;
53265 EVT TmpCastVT = CastVT;
53266 if (X.getOpcode() == ISD::ZERO_EXTEND) {
53267 SDValue OrigX = X.getOperand(0);
53268 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
53269 if (OrigSize < OpSize) {
53270 if (OrigSize == 128) {
53271 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
53272 X = OrigX;
53273 TmpZext = true;
53274 } else if (OrigSize == 256) {
53275 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
53276 X = OrigX;
53277 TmpZext = true;
53278 }
53279 }
53280 }
53281 X = DAG.getBitcast(TmpCastVT, X);
53282 if (!NeedZExt && !TmpZext)
53283 return X;
53284 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
53285 DAG.getConstant(0, DL, VecVT), X,
53286 DAG.getVectorIdxConstant(0, DL));
53287 };
53288
53289 SDValue Cmp;
53290 if (IsOrXorXorTreeCCZero) {
53291 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
53292 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
53293 // Use 2 vector equality compares and 'and' the results before doing a
53294 // MOVMSK.
53295 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
53296 } else {
53297 SDValue VecX = ScalarToVector(X);
53298 SDValue VecY = ScalarToVector(Y);
53299 if (VecVT != CmpVT) {
53300 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
53301 } else if (HasPT) {
53302 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
53303 } else {
53304 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
53305 }
53306 }
53307 // AVX512 should emit a setcc that will lower to kortest.
53308 if (VecVT != CmpVT) {
53309 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
53310 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
53311 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
53312 DAG.getConstant(0, DL, KRegVT), CC);
53313 }
53314 if (HasPT) {
53315 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
53316 Cmp);
53317 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
53318 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
53319 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
53320 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
53321 }
53322 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
53323 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
53324 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
53325 assert(Cmp.getValueType() == MVT::v16i8 &&
53326 "Non 128-bit vector on pre-SSE41 target");
53327 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
53328 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
53329 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
53330 }
53331
53332 return SDValue();
53333}
53334
53335static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53336 TargetLowering::DAGCombinerInfo &DCI,
53337 const X86Subtarget &Subtarget) {
53338 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53339 const SDValue LHS = N->getOperand(0);
53340 const SDValue RHS = N->getOperand(1);
53341 EVT VT = N->getValueType(0);
53342 EVT OpVT = LHS.getValueType();
53343 SDLoc DL(N);
53344
53345 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53346 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
53347 return V;
53348
53349 if (VT == MVT::i1 && isNullConstant(RHS)) {
53350 SDValue X86CC;
53351 if (SDValue V =
53352 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
53353 return DAG.getNode(ISD::TRUNCATE, DL, VT,
53354 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
53355 }
53356
53357 if (OpVT.isScalarInteger()) {
53358 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53359 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
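// or(X,Y) == X iff every bit set in Y is already set in X, i.e. and(~X,Y) == 0.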
53360 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53361 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53362 if (N0.getOperand(0) == N1)
53363 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53364 N0.getOperand(1));
53365 if (N0.getOperand(1) == N1)
53366 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53367 N0.getOperand(0));
53368 }
53369 return SDValue();
53370 };
53371 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53372 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53373 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53374 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53375
53376 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53377 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53378 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53379 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53380 if (N0.getOperand(0) == N1)
53381 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53382 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53383 if (N0.getOperand(1) == N1)
53384 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53385 DAG.getNOT(DL, N0.getOperand(0), OpVT));
53386 }
53387 return SDValue();
53388 };
53389 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
53390 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53391 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
53392 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53393
53394 // cmpeq(trunc(x),0) --> cmpeq(x,0)
53395 // cmpne(trunc(x),0) --> cmpne(x,0)
53396 // iff x upper bits are zero.
53397 // TODO: Add support for RHS to be truncate as well?
53398 if (LHS.getOpcode() == ISD::TRUNCATE &&
53399 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
53400 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
53401 EVT SrcVT = LHS.getOperand(0).getValueType();
53402 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
53403 OpVT.getScalarSizeInBits());
53404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53405 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
53406 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
53407 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
53408 DAG.getConstant(0, DL, SrcVT), CC);
53409 }
53410 }
53411 }
53412
53413 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
53414 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
53415 // Using temporaries to avoid messing up operand ordering for later
53416 // transformations if this doesn't work.
53417 SDValue Op0 = LHS;
53418 SDValue Op1 = RHS;
53419 ISD::CondCode TmpCC = CC;
53420 // Put build_vector on the right.
53421 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
53422 std::swap(Op0, Op1);
53423 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
53424 }
53425
53426 bool IsSEXT0 =
53427 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
53428 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
53429 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
53430
53431 if (IsSEXT0 && IsVZero1) {
53432 assert(VT == Op0.getOperand(0).getValueType() &&
53433 "Unexpected operand type");
53434 if (TmpCC == ISD::SETGT)
53435 return DAG.getConstant(0, DL, VT);
53436 if (TmpCC == ISD::SETLE)
53437 return DAG.getConstant(1, DL, VT);
53438 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
53439 return DAG.getNOT(DL, Op0.getOperand(0), VT);
53440
53441 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
53442 "Unexpected condition code!");
53443 return Op0.getOperand(0);
53444 }
53445 }
53446
53447 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53448 // pre-promote its result type since vXi1 vectors don't get promoted
53449 // during type legalization.
53450 // NOTE: The element count check is to ignore operand types that need to
53451 // go through type promotion to a 128-bit vector.
53452 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53453 VT.getVectorElementType() == MVT::i1 &&
53454 (OpVT.getVectorElementType() == MVT::i8 ||
53455 OpVT.getVectorElementType() == MVT::i16)) {
53456 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53457 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53458 }
53459
53460 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
53461 // to avoid scalarization via legalization because v4i32 is not a legal type.
53462 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
53463 LHS.getValueType() == MVT::v4f32)
53464 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
53465
53466 // X pred 0.0 --> X pred -X
53467 // If the negation of X already exists, use it in the comparison. This removes
53468 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
53469 // instructions in patterns with a 'select' node.
53470 if (isNullFPScalarOrVectorConst(RHS)) {
53471 SDVTList FNegVT = DAG.getVTList(OpVT);
53472 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
53473 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
53474 }
53475
53476 return SDValue();
53477}
53478
53479static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
53480 TargetLowering::DAGCombinerInfo &DCI,
53481 const X86Subtarget &Subtarget) {
53482 SDValue Src = N->getOperand(0);
53483 MVT SrcVT = Src.getSimpleValueType();
53484 MVT VT = N->getSimpleValueType(0);
53485 unsigned NumBits = VT.getScalarSizeInBits();
53486 unsigned NumElts = SrcVT.getVectorNumElements();
53487 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
53488 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
53489
53490 // Perform constant folding.
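// e.g. for a vXi8 input, each constant element with its sign bit set
// (0x80..0xFF) contributes a 1 to the corresponding bit of the i32 result.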
53491 APInt UndefElts;
53492 SmallVector<APInt, 32> EltBits;
53493 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
53494 APInt Imm(32, 0);
53495 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
53496 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53497 Imm.setBit(Idx);
53498
53499 return DAG.getConstant(Imm, SDLoc(N), VT);
53500 }
53501
53502 // Look through int->fp bitcasts that don't change the element width.
53503 unsigned EltWidth = SrcVT.getScalarSizeInBits();
53504 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
53505 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
53506 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
53507
53508 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
53509 // with scalar comparisons.
53510 if (SDValue NotSrc = IsNOT(Src, DAG)) {
53511 SDLoc DL(N);
53512 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53513 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
53514 return DAG.getNode(ISD::XOR, DL, VT,
53515 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
53516 DAG.getConstant(NotMask, DL, VT));
53517 }
53518
53519 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
53520 // results with scalar comparisons.
53521 if (Src.getOpcode() == X86ISD::PCMPGT &&
53522 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
53523 SDLoc DL(N);
53524 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53525 return DAG.getNode(ISD::XOR, DL, VT,
53526 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
53527 DAG.getConstant(NotMask, DL, VT));
53528 }
53529
53530 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
53531 // iff pow2splat(c1).
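// e.g. for vXi8 and c1 == 0x04: shifting left by countLeadingZeros(0x04) == 5
// moves bit 2 into the sign bit, so movmsk(not(shl(x,5))) tests bit 2 == 0.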
53532 if (Src.getOpcode() == X86ISD::PCMPEQ &&
53533 Src.getOperand(0).getOpcode() == ISD::AND &&
53534 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
53535 SDValue LHS = Src.getOperand(0).getOperand(0);
53536 SDValue RHS = Src.getOperand(0).getOperand(1);
53537 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
53538 if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
53539 SDLoc DL(N);
53540 MVT ShiftVT = SrcVT;
53541 if (ShiftVT.getScalarType() == MVT::i8) {
53542 // vXi8 shifts - we only care about the signbit so can use PSLLW.
53543 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
53544 LHS = DAG.getBitcast(ShiftVT, LHS);
53545 }
53546 unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
53547 LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
53548 ShiftAmt, DAG);
53549 LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
53550 return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
53551 }
53552 }
53553
53554 // Simplify the inputs.
53555 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53556 APInt DemandedMask(APInt::getAllOnes(NumBits));
53557 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53558 return SDValue(N, 0);
53559
53560 return SDValue();
53561}
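
Note on the movmsk(not(x)) fold above: only the low NumElts bits of the i32 result are meaningful, which is why the XOR uses APInt::getLowBitsSet(NumBits, NumElts) rather than an all-ones constant. A minimal standalone sketch of the underlying bit identity, assuming an illustrative 4 x i32 vector and hypothetical helper names:

    #include <cassert>
    #include <cstdint>

    // For a 4-lane vector, MOVMSK packs the 4 sign bits into the low 4 bits
    // of the result. Complementing every lane complements exactly those bits:
    //   movmsk(~x) == movmsk(x) ^ 0b1111
    uint32_t movmsk4(const int32_t lanes[4]) {
      uint32_t m = 0;
      for (int i = 0; i != 4; ++i)
        if (lanes[i] < 0)                 // lane sign bit set
          m |= 1u << i;
      return m;
    }

    int main() {
      int32_t x[4] = {-1, 7, -42, 0};
      int32_t notx[4];
      for (int i = 0; i != 4; ++i)
        notx[i] = ~x[i];
      uint32_t notMask = (1u << 4) - 1;   // role of getLowBitsSet(NumBits, NumElts)
      assert(movmsk4(notx) == (movmsk4(x) ^ notMask));
      return 0;
    }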
53562
53563static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
53564 TargetLowering::DAGCombinerInfo &DCI,
53565 const X86Subtarget &Subtarget) {
53566 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53567 SDValue BasePtr = MemOp->getBasePtr();
53568 SDValue Index = MemOp->getIndex();
53569 SDValue Scale = MemOp->getScale();
53570 SDValue Mask = MemOp->getMask();
53571
53572 // Attempt to fold an index scale into the scale value directly.
53573 // For smaller indices, implicit sext is performed BEFORE scale, preventing
53574 // this fold under most circumstances.
53575 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
53576 if ((Index.getOpcode() == X86ISD::VSHLI ||
53577 (Index.getOpcode() == ISD::ADD &&
53578 Index.getOperand(0) == Index.getOperand(1))) &&
53579 isa<ConstantSDNode>(Scale) &&
53580 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
53581 unsigned ShiftAmt =
53582 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
53583 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
53584 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
53585 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
53586 SDValue NewIndex = Index.getOperand(0);
53587 SDValue NewScale =
53588 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
53589 if (N->getOpcode() == X86ISD::MGATHER)
53590 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
53591 MemOp->getOperand(1), Mask,
53592 MemOp->getBasePtr(), NewIndex, NewScale,
53593 MemOp->getChain(), Subtarget);
53594 if (N->getOpcode() == X86ISD::MSCATTER)
53595 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
53596 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
53597 NewIndex, NewScale, MemOp->getChain(), Subtarget);
53598 }
53599 }
53600
53601 // With vector masks we only demand the upper bit of the mask.
53602 if (Mask.getScalarValueSizeInBits() != 1) {
53603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53604 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53605 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53606 if (N->getOpcode() != ISD::DELETED_NODE)
53607 DCI.AddToWorklist(N);
53608 return SDValue(N, 0);
53609 }
53610 }
53611
53612 return SDValue();
53613}
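
Note on the index-scale fold above: it only fires when ScaleAmt * (1 << ShiftAmt) is still a power of two no larger than 8, i.e. one of the encodable 1/2/4/8 addressing scales. A small sketch of that legality check (the helper name is illustrative, not from the source):

    #include <cassert>
    #include <cstdint>

    // Mirrors the guard "isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8".
    bool canFoldShiftIntoScale(uint64_t ScaleAmt, unsigned ShiftAmt) {
      uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
      bool IsPow2 = NewScaleAmt != 0 && (NewScaleAmt & (NewScaleAmt - 1)) == 0;
      return IsPow2 && NewScaleAmt <= 8;
    }

    int main() {
      assert(canFoldShiftIntoScale(1, 3));    // scale 1, index << 3 -> scale 8
      assert(canFoldShiftIntoScale(2, 1));    // scale 2, index << 1 -> scale 4
      assert(!canFoldShiftIntoScale(4, 2));   // would need scale 16 -> not encodable
      return 0;
    }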
53614
53615static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53616 SDValue Index, SDValue Base, SDValue Scale,
53617 SelectionDAG &DAG) {
53618 SDLoc DL(GorS);
53619
53620 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53621 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53622 Gather->getMask(), Base, Index, Scale } ;
53623 return DAG.getMaskedGather(Gather->getVTList(),
53624 Gather->getMemoryVT(), DL, Ops,
53625 Gather->getMemOperand(),
53626 Gather->getIndexType(),
53627 Gather->getExtensionType());
53628 }
53629 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53630 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53631 Scatter->getMask(), Base, Index, Scale };
53632 return DAG.getMaskedScatter(Scatter->getVTList(),
53633 Scatter->getMemoryVT(), DL,
53634 Ops, Scatter->getMemOperand(),
53635 Scatter->getIndexType(),
53636 Scatter->isTruncatingStore());
53637}
53638
53639static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53640 TargetLowering::DAGCombinerInfo &DCI) {
53641 SDLoc DL(N);
53642 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53643 SDValue Index = GorS->getIndex();
53644 SDValue Base = GorS->getBasePtr();
53645 SDValue Scale = GorS->getScale();
53646
53647 if (DCI.isBeforeLegalize()) {
53648 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53649
53650 // Shrink constant indices if they are larger than 32-bits.
53651 // Only do this before legalize types since v2i64 could become v2i32.
53652 // FIXME: We could check that the type is legal if we're after legalize
53653 // types, but then we would need to construct test cases where that happens.
53654 // FIXME: We could support more than just constant vectors, but we need to
53655 // be careful with costing. A truncate that can be optimized out would be fine.
53656 // Otherwise we might only want to create a truncate if it avoids a split.
53657 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53658 if (BV->isConstant() && IndexWidth > 32 &&
53659 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53660 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53661 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53662 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53663 }
53664 }
53665
53666 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
53667 // there are sufficient sign bits. Only do this before legalize types to
53668 // avoid creating illegal types in truncate.
53669 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53670 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53671 IndexWidth > 32 &&
53672 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53673 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53674 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53675 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53676 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53677 }
53678 }
53679
53680 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53681 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53682 // Try to move splat constant adders from the index operand to the base
53683 // pointer operand, taking care to multiply by the scale. We can only do
53684 // this when the index element type is the same as the pointer type.
53685 // Otherwise we need to be sure the math doesn't wrap before the scale.
53686 if (Index.getOpcode() == ISD::ADD &&
53687 Index.getValueType().getVectorElementType() == PtrVT &&
53688 isa<ConstantSDNode>(Scale)) {
53689 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
53690 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53691 BitVector UndefElts;
53692 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53693 // FIXME: Allow non-constant?
53694 if (UndefElts.none()) {
53695 // Apply the scale.
53696 APInt Adder = C->getAPIntValue() * ScaleAmt;
53697 // Add it to the existing base.
53698 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53699 DAG.getConstant(Adder, DL, PtrVT));
53700 Index = Index.getOperand(0);
53701 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53702 }
53703 }
53704
53705 // It's also possible base is just a constant. In that case, just
53706 // replace it with 0 and move the displacement into the index.
53707 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53708 isOneConstant(Scale)) {
53709 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53710 // Combine the constant build_vector and the constant base.
53711 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53712 Index.getOperand(1), Splat);
53713 // Add to the LHS of the original Index add.
53714 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53715 Index.getOperand(0), Splat);
53716 Base = DAG.getConstant(0, DL, Base.getValueType());
53717 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53718 }
53719 }
53720 }
53721
53722 if (DCI.isBeforeLegalizeOps()) {
53723 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53724
53725 // Make sure the index is either i32 or i64
53726 if (IndexWidth != 32 && IndexWidth != 64) {
53727 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
53728 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
53729 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
53730 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53731 }
53732 }
53733
53734 // With vector masks we only demand the upper bit of the mask.
53735 SDValue Mask = GorS->getMask();
53736 if (Mask.getScalarValueSizeInBits() != 1) {
53737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53738 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53739 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53740 if (N->getOpcode() != ISD::DELETED_NODE)
53741 DCI.AddToWorklist(N);
53742 return SDValue(N, 0);
53743 }
53744 }
53745
53746 return SDValue();
53747}
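
Note on the splat-adder fold above: it is plain address arithmetic. Since the effective address is Base + Index * Scale, adding a constant C to every index element is the same as adding C * Scale to the base pointer once. A hedged sketch with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Base = 0x1000, Scale = 4, C = 5;
      int64_t Index = 7;
      // Original addressing:   Base + (Index + C) * Scale
      // Rewritten addressing: (Base + C * Scale) + Index * Scale
      uint64_t Orig = Base + (uint64_t)(Index + (int64_t)C) * Scale;
      uint64_t Folded = (Base + C * Scale) + (uint64_t)Index * Scale;
      assert(Orig == Folded);
      return 0;
    }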
53748
53749// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
53750static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
53751 const X86Subtarget &Subtarget) {
53752 SDLoc DL(N);
53753 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
53754 SDValue EFLAGS = N->getOperand(1);
53755
53756 // Try to simplify the EFLAGS and condition code operands.
53757 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
53758 return getSETCC(CC, Flags, DL, DAG);
53759
53760 return SDValue();
53761}
53762
53763/// Optimize branch condition evaluation.
53764static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
53765 const X86Subtarget &Subtarget) {
53766 SDLoc DL(N);
53767 SDValue EFLAGS = N->getOperand(3);
53768 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
53769
53770 // Try to simplify the EFLAGS and condition code operands.
53771 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
53772 // RAUW them under us.
53773 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
53774 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
53775 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
53776 N->getOperand(1), Cond, Flags);
53777 }
53778
53779 return SDValue();
53780}
53781
53782// TODO: Could we move this to DAGCombine?
53783static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
53784 SelectionDAG &DAG) {
53785 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
53786 // to optimize away the operation when it's from a constant.
53787 //
53788 // The general transformation is:
53789 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
53790 // AND(VECTOR_CMP(x,y), constant2)
53791 // constant2 = UNARYOP(constant)
53792
53793 // Early exit if this isn't a vector operation, the operand of the
53794 // unary operation isn't a bitwise AND, or if the sizes of the operations
53795 // aren't the same.
53796 EVT VT = N->getValueType(0);
53797 bool IsStrict = N->isStrictFPOpcode();
53798 unsigned NumEltBits = VT.getScalarSizeInBits();
53799 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53800 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
53801 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
53802 VT.getSizeInBits() != Op0.getValueSizeInBits())
53803 return SDValue();
53804
53805 // Now check that the other operand of the AND is a constant. We could
53806 // make the transformation for non-constant splats as well, but it's unclear
53807 // that would be a benefit as it would not eliminate any operations, just
53808 // perform one more step in scalar code before moving to the vector unit.
53809 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
53810 // Bail out if the vector isn't a constant.
53811 if (!BV->isConstant())
53812 return SDValue();
53813
53814 // Everything checks out. Build up the new and improved node.
53815 SDLoc DL(N);
53816 EVT IntVT = BV->getValueType(0);
53817 // Create a new constant of the appropriate type for the transformed
53818 // DAG.
53819 SDValue SourceConst;
53820 if (IsStrict)
53821 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
53822 {N->getOperand(0), SDValue(BV, 0)});
53823 else
53824 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
53825 // The AND node needs bitcasts to/from an integer vector type around it.
53826 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
53827 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
53828 MaskConst);
53829 SDValue Res = DAG.getBitcast(VT, NewAnd);
53830 if (IsStrict)
53831 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
53832 return Res;
53833 }
53834
53835 return SDValue();
53836}
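
Note on the transform above: each vector-compare lane is all-zeros or all-ones, so AND-ing with a constant selects either 0 or the constant, and the conversion can therefore be applied to the constant up front and masked afterwards. A one-lane scalar sketch, modeling the 0/-1 lane as a plain int32 (illustrative values only):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      const int32_t C = 42;
      for (int32_t lane : {0, -1}) {            // vector-compare lanes are 0 or -1
        // UNARYOP(AND(cmp, C)): mask first, then convert.
        float before = (float)(lane & C);
        // AND(cmp, UNARYOP(C)): convert the constant once, then mask its bits.
        float fc = (float)C;
        uint32_t fcBits, maskedBits;
        std::memcpy(&fcBits, &fc, sizeof(fc));
        maskedBits = (uint32_t)lane & fcBits;   // bitwise AND on the FP bit pattern
        float after;
        std::memcpy(&after, &maskedBits, sizeof(after));
        assert(before == after);
      }
      return 0;
    }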
53837
53838/// If we are converting a value to floating-point, try to replace scalar
53839/// truncate of an extracted vector element with a bitcast. This tries to keep
53840/// the sequence on XMM registers rather than moving between vector and GPRs.
53841static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
53842 // TODO: This is currently only used by combineSIntToFP, but it is generalized
53843 // to allow being called by any similar cast opcode.
53844 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
53845 SDValue Trunc = N->getOperand(0);
53846 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
53847 return SDValue();
53848
53849 SDValue ExtElt = Trunc.getOperand(0);
53850 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53851 !isNullConstant(ExtElt.getOperand(1)))
53852 return SDValue();
53853
53854 EVT TruncVT = Trunc.getValueType();
53855 EVT SrcVT = ExtElt.getValueType();
53856 unsigned DestWidth = TruncVT.getSizeInBits();
53857 unsigned SrcWidth = SrcVT.getSizeInBits();
53858 if (SrcWidth % DestWidth != 0)
53859 return SDValue();
53860
53861 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
53862 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
53863 unsigned VecWidth = SrcVecVT.getSizeInBits();
53864 unsigned NumElts = VecWidth / DestWidth;
53865 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
53866 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
53867 SDLoc DL(N);
53868 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
53869 BitcastVec, ExtElt.getOperand(1));
53870 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
53871}
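
Note on the bitcast rewrite above: on a little-endian target, truncating lane 0 of a wider-element vector reads the same bytes as reinterpreting the vector with narrower elements and extracting lane 0, which is what lets the value stay on XMM registers. A little-endian sketch with illustrative types (v2i64 viewed as v4i32):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // v2i64 source vector; we want (i32)trunc(extractelement(v, 0)).
      uint64_t v2i64[2] = {0x1122334455667788ULL, 0x99aabbccddeeff00ULL};
      uint32_t truncOfExtract = (uint32_t)v2i64[0];

      // Bitcast v2i64 -> v4i32 and extract element 0 instead (little-endian).
      uint32_t v4i32[4];
      std::memcpy(v4i32, v2i64, sizeof(v2i64));
      assert(truncOfExtract == v4i32[0]);
      return 0;
    }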
53872
53873static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
53874 const X86Subtarget &Subtarget) {
53875 bool IsStrict = N->isStrictFPOpcode();
53876 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53877 EVT VT = N->getValueType(0);
53878 EVT InVT = Op0.getValueType();
53879
53880 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
53881 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
53882 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
53883 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53884 unsigned ScalarSize = InVT.getScalarSizeInBits();
53885 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
53886 return SDValue();
53887 SDLoc dl(N);
53888 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
53889 ScalarSize < 16 ? MVT::i16
53890 : ScalarSize < 32 ? MVT::i32
53891 : MVT::i64,
53892 InVT.getVectorNumElements());
53893 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53894 if (IsStrict)
53895 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
53896 {N->getOperand(0), P});
53897 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
53898 }
53899
53900 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
53901 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
53902 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
53903 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53904 VT.getScalarType() != MVT::f16) {
53905 SDLoc dl(N);
53906 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53907 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53908
53909 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
53910 if (IsStrict)
53911 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53912 {N->getOperand(0), P});
53913 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53914 }
53915
53916 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
53917 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
53918 // the optimization here.
53919 if (DAG.SignBitIsZero(Op0)) {
53920 if (IsStrict)
53921 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
53922 {N->getOperand(0), Op0});
53923 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
53924 }
53925
53926 return SDValue();
53927}
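
Note on the final fold above: when DAG.SignBitIsZero proves the sign bit clear, signed and unsigned conversion agree, so the cheaper SINT_TO_FP can be substituted. A minimal scalar check with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Any value with the sign bit clear converts identically both ways.
      for (uint32_t u : {0u, 1u, 123456u, 0x7fffffffu}) {
        double fromUnsigned = (double)u;
        double fromSigned = (double)(int32_t)u;   // safe: sign bit is zero
        assert(fromUnsigned == fromSigned);
      }
      return 0;
    }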
53928
53929static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
53930 TargetLowering::DAGCombinerInfo &DCI,
53931 const X86Subtarget &Subtarget) {
53932 // First try to optimize away the conversion entirely when it's
53933 // conditionally from a constant. Vectors only.
53934 bool IsStrict = N->isStrictFPOpcode();
53935 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
53936 return Res;
53937
53938 // Now move on to more general possibilities.
53939 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53940 EVT VT = N->getValueType(0);
53941 EVT InVT = Op0.getValueType();
53942
53943 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
53944 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
53945 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
53946 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53947 unsigned ScalarSize = InVT.getScalarSizeInBits();
53948 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
53949 return SDValue();
53950 SDLoc dl(N);
53951 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
53952 ScalarSize < 16 ? MVT::i16
53953 : ScalarSize < 32 ? MVT::i32
53954 : MVT::i64,
53955 InVT.getVectorNumElements());
53956 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53957 if (IsStrict)
53958 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53959 {N->getOperand(0), P});
53960 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53961 }
53962
53963 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
53964 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
53965 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
53966 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53967 VT.getScalarType() != MVT::f16) {
53968 SDLoc dl(N);
53969 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53970 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53971 if (IsStrict)
53972 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53973 {N->getOperand(0), P});
53974 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53975 }
53976
53977 // Without AVX512DQ we only support i64 to float scalar conversion. For both
53978 // vectors and scalars, see if we know that the upper bits are all the sign
53979 // bit, in which case we can truncate the input to i32 and convert from that.
53980 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
53981 unsigned BitWidth = InVT.getScalarSizeInBits();
53982 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
53983 if (NumSignBits >= (BitWidth - 31)) {
53984 EVT TruncVT = MVT::i32;
53985 if (InVT.isVector())
53986 TruncVT = InVT.changeVectorElementType(TruncVT);
53987 SDLoc dl(N);
53988 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
53989 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
53990 if (IsStrict)
53991 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53992 {N->getOperand(0), Trunc});
53993 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
53994 }
53995 // If we're after legalize and the type is v2i32 we need to shuffle and
53996 // use CVTSI2P.
53997 assert(InVT == MVT::v2i64 && "Unexpected VT!");
53998 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
53999 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54000 { 0, 2, -1, -1 });
54001 if (IsStrict)
54002 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54003 {N->getOperand(0), Shuf});
54004 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54005 }
54006 }
54007
54008 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54009 // a 32-bit target where SSE doesn't support i64->FP operations.
54010 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54011 Op0.getOpcode() == ISD::LOAD) {
54012 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54013
54014 // This transformation is not supported if the result type is f16 or f128.
54015 if (VT == MVT::f16 || VT == MVT::f128)
54016 return SDValue();
54017
54018 // If we have AVX512DQ we can use packed conversion instructions unless
54019 // the VT is f80.
54020 if (Subtarget.hasDQI() && VT != MVT::f80)
54021 return SDValue();
54022
54023 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54024 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54025 std::pair<SDValue, SDValue> Tmp =
54026 Subtarget.getTargetLowering()->BuildFILD(
54027 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54028 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54029 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54030 return Tmp.first;
54031 }
54032 }
54033
54034 if (IsStrict)
54035 return SDValue();
54036
54037 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54038 return V;
54039
54040 return SDValue();
54041}
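
Note on the truncation path above: if an i64 input has at least BitWidth - 31 = 33 sign bits, its value already fits in an i32, so converting the truncated value gives the same result. A small scalar check (illustrative values):

    #include <cassert>
    #include <cstdint>

    int main() {
      // If an i64 has at least 33 sign bits, it is representable as an i32,
      // so sitofp(i64 x) == sitofp(trunc i64 x to i32).
      for (int64_t x : {int64_t(0), int64_t(-1), int64_t(INT32_MIN), int64_t(INT32_MAX)}) {
        double wide = (double)x;
        double narrow = (double)(int32_t)x;   // truncation loses nothing here
        assert(wide == narrow);
      }
      return 0;
    }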
54042
54043static bool needCarryOrOverflowFlag(SDValue Flags) {
54044 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54045
54046 for (const SDNode *User : Flags->uses()) {
54047 X86::CondCode CC;
54048 switch (User->getOpcode()) {
54049 default:
54050 // Be conservative.
54051 return true;
54052 case X86ISD::SETCC:
54053 case X86ISD::SETCC_CARRY:
54054 CC = (X86::CondCode)User->getConstantOperandVal(0);
54055 break;
54056 case X86ISD::BRCOND:
54057 case X86ISD::CMOV:
54058 CC = (X86::CondCode)User->getConstantOperandVal(2);
54059 break;
54060 }
54061
54062 switch (CC) {
54063 default: break;
54064 case X86::COND_A: case X86::COND_AE:
54065 case X86::COND_B: case X86::COND_BE:
54066 case X86::COND_O: case X86::COND_NO:
54067 case X86::COND_G: case X86::COND_GE:
54068 case X86::COND_L: case X86::COND_LE:
54069 return true;
54070 }
54071 }
54072
54073 return false;
54074}
54075
54076static bool onlyZeroFlagUsed(SDValue Flags) {
54077 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54078
54079 for (const SDNode *User : Flags->uses()) {
54080 unsigned CCOpNo;
54081 switch (User->getOpcode()) {
54082 default:
54083 // Be conservative.
54084 return false;
54085 case X86ISD::SETCC:
54086 case X86ISD::SETCC_CARRY:
54087 CCOpNo = 0;
54088 break;
54089 case X86ISD::BRCOND:
54090 case X86ISD::CMOV:
54091 CCOpNo = 2;
54092 break;
54093 }
54094
54095 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54096 if (CC != X86::COND_E && CC != X86::COND_NE)
54097 return false;
54098 }
54099
54100 return true;
54101}
54102
54103/// If this is an add or subtract where one operand is produced by a cmp+setcc,
54104/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
54105/// with CMP+{ADC, SBB}.
54106/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
54107static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
54108 SDValue X, SDValue Y,
54109 SelectionDAG &DAG,
54110 bool ZeroSecondOpOnly = false) {
54111 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
54112 return SDValue();
54113
54114 // Look through a one-use zext.
54115 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
54116 Y = Y.getOperand(0);
54117
54118 X86::CondCode CC;
54119 SDValue EFLAGS;
54120 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
54121 CC = (X86::CondCode)Y.getConstantOperandVal(0);
54122 EFLAGS = Y.getOperand(1);
54123 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
54124 Y.hasOneUse()) {
54125 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
54126 }
54127
54128 if (!EFLAGS)
54129 return SDValue();
54130
54131 // If X is -1 or 0, then we have an opportunity to avoid constants required in
54132 // the general case below.
54133 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
54134 if (ConstantX && !ZeroSecondOpOnly) {
54135 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
54136 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
54137 // This is a complicated way to get -1 or 0 from the carry flag:
54138 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
54139 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
54140 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54141 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54142 EFLAGS);
54143 }
54144
54145 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
54146 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
54147 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
54148 EFLAGS.getValueType().isInteger() &&
54149 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
54150 // Swap the operands of a SUB, and we have the same pattern as above.
54151 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
54152 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
54153 SDValue NewSub = DAG.getNode(
54154 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
54155 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
54156 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
54157 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54158 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54159 NewEFLAGS);
54160 }
54161 }
54162 }
54163
54164 if (CC == X86::COND_B) {
54165 // X + SETB Z --> adc X, 0
54166 // X - SETB Z --> sbb X, 0
54167 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
54168 DAG.getVTList(VT, MVT::i32), X,
54169 DAG.getConstant(0, DL, VT), EFLAGS);
54170 }
54171
54172 if (ZeroSecondOpOnly)
54173 return SDValue();
54174
54175 if (CC == X86::COND_A) {
54176 // Try to convert COND_A into COND_B in an attempt to facilitate
54177 // materializing "setb reg".
54178 //
54179 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
54180 // cannot take an immediate as its first operand.
54181 //
54182 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
54183 EFLAGS.getValueType().isInteger() &&
54184 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
54185 SDValue NewSub =
54186 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
54187 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
54188 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
54189 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
54190 DAG.getVTList(VT, MVT::i32), X,
54191 DAG.getConstant(0, DL, VT), NewEFLAGS);
54192 }
54193 }
54194
54195 if (CC == X86::COND_AE) {
54196 // X + SETAE --> sbb X, -1
54197 // X - SETAE --> adc X, -1
54198 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
54199 DAG.getVTList(VT, MVT::i32), X,
54200 DAG.getConstant(-1, DL, VT), EFLAGS);
54201 }
54202
54203 if (CC == X86::COND_BE) {
54204 // X + SETBE --> sbb X, -1
54205 // X - SETBE --> adc X, -1
54206 // Try to convert COND_BE into COND_AE in an attempt to facilitate
54207 // materializing "setae reg".
54208 //
54209 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
54210 // cannot take an immediate as its first operand.
54211 //
54212 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
54213 EFLAGS.getValueType().isInteger() &&
54214 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
54215 SDValue NewSub =
54216 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
54217 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
54218 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
54219 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
54220 DAG.getVTList(VT, MVT::i32), X,
54221 DAG.getConstant(-1, DL, VT), NewEFLAGS);
54222 }
54223 }
54224
54225 if (CC != X86::COND_E && CC != X86::COND_NE)
54226 return SDValue();
54227
54228 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
54229 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
54230 !EFLAGS.getOperand(0).getValueType().isInteger())
54231 return SDValue();
54232
54233 SDValue Z = EFLAGS.getOperand(0);
54234 EVT ZVT = Z.getValueType();
54235
54236 // If X is -1 or 0, then we have an opportunity to avoid constants required in
54237 // the general case below.
54238 if (ConstantX) {
54239 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
54240 // fake operands:
54241 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
54242 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
54243 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
54244 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
54245 SDValue Zero = DAG.getConstant(0, DL, ZVT);
54246 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
54247 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
54248 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54249 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54250 SDValue(Neg.getNode(), 1));
54251 }
54252
54253 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
54254 // with fake operands:
54255 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
54256 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
54257 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
54258 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
54259 SDValue One = DAG.getConstant(1, DL, ZVT);
54260 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
54261 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
54262 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54263 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54264 Cmp1.getValue(1));
54265 }
54266 }
54267
54268 // (cmp Z, 1) sets the carry flag if Z is 0.
54269 SDValue One = DAG.getConstant(1, DL, ZVT);
54270 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
54271 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
54272
54273 // Add the flags type for ADC/SBB nodes.
54274 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54275
54276 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
54277 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
54278 if (CC == X86::COND_NE)
54279 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
54280 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
54281
54282 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
54283 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
54284 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
54285 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
54286}
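
Note on the basic rewrites above: X + SETB Z literally adds the carry flag, i.e. adc X, 0, while X + SETAE adds !CF, which sbb X, -1 computes as X - (-1) - CF == X + 1 - CF. A scalar model with the carry flag as a bool (helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    // adc(X, imm, CF) = X + imm + CF;  sbb(X, imm, CF) = X - imm - CF.
    uint32_t adc(uint32_t x, uint32_t imm, bool cf) { return x + imm + (cf ? 1u : 0u); }
    uint32_t sbb(uint32_t x, uint32_t imm, bool cf) { return x - imm - (cf ? 1u : 0u); }

    int main() {
      for (bool cf : {false, true}) {
        uint32_t x = 100;
        // X + SETB  (SETB  == CF)  -->  adc X, 0
        assert(x + (cf ? 1u : 0u) == adc(x, 0, cf));
        // X + SETAE (SETAE == !CF) -->  sbb X, -1   (i.e. X + 1 - CF)
        assert(x + (cf ? 0u : 1u) == sbb(x, (uint32_t)-1, cf));
      }
      return 0;
    }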
54287
54288/// If this is an add or subtract where one operand is produced by a cmp+setcc,
54289/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
54290/// with CMP+{ADC, SBB}.
54291static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
54292 bool IsSub = N->getOpcode() == ISD::SUB;
54293 SDValue X = N->getOperand(0);
54294 SDValue Y = N->getOperand(1);
54295 EVT VT = N->getValueType(0);
54296 SDLoc DL(N);
54297
54298 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
54299 return ADCOrSBB;
54300
54301 // Commute and try again (negate the result for subtracts).
54302 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
54303 if (IsSub)
54304 ADCOrSBB =
54305 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
54306 return ADCOrSBB;
54307 }
54308
54309 return SDValue();
54310}
54311
54312static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
54313 // Only handle test patterns.
54314 if (!isNullConstant(N->getOperand(1)))
54315 return SDValue();
54316
54317 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54318 // and use its flags directly.
54319 // TODO: Maybe we should try promoting compares that only use the zero flag
54320 // first if we can prove the upper bits with computeKnownBits?
54321 SDLoc dl(N);
54322 SDValue Op = N->getOperand(0);
54323 EVT VT = Op.getValueType();
54324
54325 // If we have a constant logical shift that's only used in a comparison
54326 // against zero turn it into an equivalent AND. This allows turning it into
54327 // a TEST instruction later.
54328 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54329 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54330 onlyZeroFlagUsed(SDValue(N, 0))) {
54331 unsigned BitWidth = VT.getSizeInBits();
54332 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54333 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54334 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54335 APInt Mask = Op.getOpcode() == ISD::SRL
54336 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54337 : APInt::getLowBitsSet(BitWidth, MaskBits);
54338 if (Mask.isSignedIntN(32)) {
54339 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54340 DAG.getConstant(Mask, dl, VT));
54341 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54342 DAG.getConstant(0, dl, VT));
54343 }
54344 }
54345 }
54346
54347 // Peek through any zero-extend if we're only testing for a zero result.
54348 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54349 SDValue Src = Op.getOperand(0);
54350 EVT SrcVT = Src.getValueType();
54351 if (SrcVT.getScalarSizeInBits() >= 8 &&
54352 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
54353 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54354 DAG.getConstant(0, dl, SrcVT));
54355 }
54356
54357 // Look for a truncate.
54358 if (Op.getOpcode() != ISD::TRUNCATE)
54359 return SDValue();
54360
54361 SDValue Trunc = Op;
54362 Op = Op.getOperand(0);
54363
54364 // See if we can compare with zero against the truncation source,
54365 // which should help using the Z flag from many ops. Only do this for
54366 // i32 truncated op to prevent partial-reg compares of promoted ops.
54367 EVT OpVT = Op.getValueType();
54368 APInt UpperBits =
54369 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
54370 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54371 onlyZeroFlagUsed(SDValue(N, 0))) {
54372 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54373 DAG.getConstant(0, dl, OpVT));
54374 }
54375
54376 // After this the truncate and arithmetic op must have a single use.
54377 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54378 return SDValue();
54379
54380 unsigned NewOpc;
54381 switch (Op.getOpcode()) {
54382 default: return SDValue();
54383 case ISD::AND:
54384 // Skip and with constant. We have special handling for and with immediate
54385 // during isel to generate test instructions.
54386 if (isa<ConstantSDNode>(Op.getOperand(1)))
54387 return SDValue();
54388 NewOpc = X86ISD::AND;
54389 break;
54390 case ISD::OR: NewOpc = X86ISD::OR; break;
54391 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54392 case ISD::ADD:
54393 // If the carry or overflow flag is used, we can't truncate.
54394 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54395 return SDValue();
54396 NewOpc = X86ISD::ADD;
54397 break;
54398 case ISD::SUB:
54399 // If the carry or overflow flag is used, we can't truncate.
54400 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54401 return SDValue();
54402 NewOpc = X86ISD::SUB;
54403 break;
54404 }
54405
54406 // We found an op we can narrow. Truncate its inputs.
54407 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54408 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54409
54410 // Use a X86 specific opcode to avoid DAG combine messing with it.
54411 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54412 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54413
54414 // For AND, keep a CMP so that we can match the test pattern.
54415 if (NewOpc == X86ISD::AND)
54416 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54417 DAG.getConstant(0, dl, VT));
54418
54419 // Return the flags.
54420 return Op.getValue(1);
54421}
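
Note on the shift fold above: (x >> c) == 0 exactly when the high BitWidth - c bits of x are zero, and (x << c) == 0 exactly when the low BitWidth - c bits are zero, which is why the compare can be replaced by an AND with the corresponding mask. A 32-bit check of both directions (illustrative shift amount and values):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned ShAmt = 20;                 // MaskBits = 32 - ShAmt = 12
      for (uint32_t x : {0u, 1u, 0xABCu, 0xFFF00000u, 0xDEADBEEFu}) {
        // SRL: (x >> ShAmt) == 0  <=>  (x & getHighBitsSet(32, 12)) == 0
        uint32_t HighMask = ~0u << ShAmt;
        assert(((x >> ShAmt) == 0) == ((x & HighMask) == 0));
        // SHL: (x << ShAmt) == 0  <=>  (x & getLowBitsSet(32, 12)) == 0
        uint32_t LowMask = ~0u >> ShAmt;
        assert(((uint32_t)(x << ShAmt) == 0) == ((x & LowMask) == 0));
      }
      return 0;
    }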
54422
54423static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
54424 TargetLowering::DAGCombinerInfo &DCI) {
54425 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54426 "Expected X86ISD::ADD or X86ISD::SUB");
54427
54428 SDLoc DL(N);
54429 SDValue LHS = N->getOperand(0);
54430 SDValue RHS = N->getOperand(1);
54431 MVT VT = LHS.getSimpleValueType();
54432 bool IsSub = X86ISD::SUB == N->getOpcode();
54433 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54434
54435 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54436 if (!N->hasAnyUseOfValue(1)) {
54437 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54438 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54439 }
54440
54441 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54442 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54443 SDValue Ops[] = {N0, N1};
54444 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54445 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54446 SDValue Op(N, 0);
54447 if (Negate)
54448 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
54449 DCI.CombineTo(GenericAddSub, Op);
54450 }
54451 };
54452 MatchGeneric(LHS, RHS, false);
54453 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54454
54455 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54456 // EFLAGS result doesn't change.
54457 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54458 /*ZeroSecondOpOnly*/ true);
54459}
54460
54461static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
54462 SDValue LHS = N->getOperand(0);
54463 SDValue RHS = N->getOperand(1);
54464 SDValue BorrowIn = N->getOperand(2);
54465
54466 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54467 MVT VT = N->getSimpleValueType(0);
54468 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54469 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54470 }
54471
54472 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54473 // iff the flag result is dead.
54474 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54475 !N->hasAnyUseOfValue(1))
54476 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54477 LHS.getOperand(1), BorrowIn);
54478
54479 return SDValue();
54480}
54481
54482// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54483static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
54484 TargetLowering::DAGCombinerInfo &DCI) {
54485 SDValue LHS = N->getOperand(0);
54486 SDValue RHS = N->getOperand(1);
54487 SDValue CarryIn = N->getOperand(2);
54488 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54489 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54490
54491 // Canonicalize constant to RHS.
54492 if (LHSC && !RHSC)
54493 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54494 CarryIn);
54495
54496 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54497 // the result is either zero or one (depending on the input carry bit).
54498 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
54499 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54500 // We don't have a good way to replace an EFLAGS use, so only do this when
54501 // dead right now.
54502 SDValue(N, 1).use_empty()) {
54503 SDLoc DL(N);
54504 EVT VT = N->getValueType(0);
54505 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54506 SDValue Res1 = DAG.getNode(
54507 ISD::AND, DL, VT,
54508 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54509 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54510 DAG.getConstant(1, DL, VT));
54511 return DCI.CombineTo(N, Res1, CarryOut);
54512 }
54513
54514 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
54515 // iff the flag result is dead.
54516 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
54517 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
54518 SDLoc DL(N);
54519 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
54520 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
54521 DAG.getConstant(0, DL, LHS.getValueType()),
54522 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
54523 }
54524
54525 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
54526 MVT VT = N->getSimpleValueType(0);
54527 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54528 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
54529 }
54530
54531 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
54532 // iff the flag result is dead.
54533 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
54534 !N->hasAnyUseOfValue(1))
54535 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54536 LHS.getOperand(1), CarryIn);
54537
54538 return SDValue();
54539}
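
Note on the strength reduction above: adc 0, 0, CF can only produce 0 or 1, i.e. the carry bit itself, so it is replaced by SETCC_CARRY (all-ones when CF is set) masked with 1. A scalar sketch of that equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      // adc(0, 0, CF) == CF ? 1 : 0 == (SETCC_CARRY & 1), where SETCC_CARRY
      // produces all-ones when CF is set and zero otherwise.
      for (bool cf : {false, true}) {
        uint32_t adcResult = 0u + 0u + (cf ? 1u : 0u);
        uint32_t setccCarry = cf ? ~0u : 0u;   // sbb %eax, %eax style all-ones/zero
        assert(adcResult == (setccCarry & 1u));
      }
      return 0;
    }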
54540
54541static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
54542 const SDLoc &DL, EVT VT,
54543 const X86Subtarget &Subtarget) {
54544 // Example of pattern we try to detect:
54545 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
54546 //(add (build_vector (extract_elt t, 0),
54547 // (extract_elt t, 2),
54548 // (extract_elt t, 4),
54549 // (extract_elt t, 6)),
54550 // (build_vector (extract_elt t, 1),
54551 // (extract_elt t, 3),
54552 // (extract_elt t, 5),
54553 // (extract_elt t, 7)))
54554
54555 if (!Subtarget.hasSSE2())
54556 return SDValue();
54557
54558 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
54559 Op1.getOpcode() != ISD::BUILD_VECTOR)
54560 return SDValue();
54561
54562 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54563 VT.getVectorNumElements() < 4 ||
54564 !isPowerOf2_32(VT.getVectorNumElements()))
54565 return SDValue();
54566
54567 // Check if one of Op0,Op1 is of the form:
54568 // (build_vector (extract_elt Mul, 0),
54569 // (extract_elt Mul, 2),
54570 // (extract_elt Mul, 4),
54571 // ...
54572 // the other is of the form:
54573 // (build_vector (extract_elt Mul, 1),
54574 // (extract_elt Mul, 3),
54575 // (extract_elt Mul, 5),
54576 // ...
54577 // and identify Mul.
54578 SDValue Mul;
54579 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
54580 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
54581 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
54582 // TODO: Be more tolerant to undefs.
54583 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54584 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54585 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54586 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54587 return SDValue();
54588 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
54589 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
54590 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
54591 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
54592 if (!Const0L || !Const1L || !Const0H || !Const1H)
54593 return SDValue();
54594 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
54595 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
54596 // Commutativity of mul allows factors of a product to reorder.
54597 if (Idx0L > Idx1L)
54598 std::swap(Idx0L, Idx1L);
54599 if (Idx0H > Idx1H)
54600 std::swap(Idx0H, Idx1H);
54601 // Commutativity of add allows pairs of factors to reorder.
54602 if (Idx0L > Idx0H) {
54603 std::swap(Idx0L, Idx0H);
54604 std::swap(Idx1L, Idx1H);
54605 }
54606 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
54607 Idx1H != 2 * i + 3)
54608 return SDValue();
54609 if (!Mul) {
54610 // First time an extract_elt's source vector is visited. Must be a MUL
54611 // with 2X the number of vector elements of the BUILD_VECTOR.
54612 // Both extracts must be from the same MUL.
54613 Mul = Op0L->getOperand(0);
54614 if (Mul->getOpcode() != ISD::MUL ||
54615 Mul.getValueType().getVectorNumElements() != 2 * e)
54616 return SDValue();
54617 }
54618 // Check that the extract is from the same MUL previously seen.
54619 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
54620 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
54621 return SDValue();
54622 }
54623
54624 // Check if the Mul source can be safely shrunk.
54625 ShrinkMode Mode;
54626 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
54627 Mode == ShrinkMode::MULU16)
54628 return SDValue();
54629
54630 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54631 VT.getVectorNumElements() * 2);
54632 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
54633 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
54634
54635 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54636 ArrayRef<SDValue> Ops) {
54637 EVT InVT = Ops[0].getValueType();
54638 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54639 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54640 InVT.getVectorNumElements() / 2);
54641 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54642 };
54643 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
54644}
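
Note on the pattern above: it is exactly what PMADDWD computes. Each i32 output lane is a[2i]*b[2i] + a[2i+1]*b[2i+1], with the i16 inputs sign-extended before the multiply. A scalar model of one output lane (the helper name and values are illustrative):

    #include <cassert>
    #include <cstdint>

    // One i32 lane of PMADDWD: a[2i]*b[2i] + a[2i+1]*b[2i+1], with i16 inputs
    // sign-extended before the multiply.
    int32_t pmaddwdLane(const int16_t *a, const int16_t *b, unsigned i) {
      return (int32_t)a[2 * i] * b[2 * i] + (int32_t)a[2 * i + 1] * b[2 * i + 1];
    }

    int main() {
      int16_t a[4] = {3, -2, 100, 7};
      int16_t b[4] = {5, 9, -4, 11};
      // Lane 0 pairs elements {0,1}; lane 1 pairs elements {2,3}.
      assert(pmaddwdLane(a, b, 0) == 3 * 5 + (-2) * 9);
      assert(pmaddwdLane(a, b, 1) == 100 * (-4) + 7 * 11);
      return 0;
    }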
54645
54646// Attempt to turn this pattern into PMADDWD.
54647// (add (mul (sext (build_vector)), (sext (build_vector))),
54648// (mul (sext (build_vector)), (sext (build_vector)))
54649static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
54650 const SDLoc &DL, EVT VT,
54651 const X86Subtarget &Subtarget) {
54652 if (!Subtarget.hasSSE2())
54653 return SDValue();
54654
54655 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54656 return SDValue();
54657
54658 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54659 VT.getVectorNumElements() < 4 ||
54660 !isPowerOf2_32(VT.getVectorNumElements()))
54661 return SDValue();
54662
54663 SDValue N00 = N0.getOperand(0);
54664 SDValue N01 = N0.getOperand(1);
54665 SDValue N10 = N1.getOperand(0);
54666 SDValue N11 = N1.getOperand(1);
54667
54668 // All inputs need to be sign extends.
54669 // TODO: Support ZERO_EXTEND from known positive?
54670 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
54671 N01.getOpcode() != ISD::SIGN_EXTEND ||
54672 N10.getOpcode() != ISD::SIGN_EXTEND ||
54673 N11.getOpcode() != ISD::SIGN_EXTEND)
54674 return SDValue();
54675
54676 // Peek through the extends.
54677 N00 = N00.getOperand(0);
54678 N01 = N01.getOperand(0);
54679 N10 = N10.getOperand(0);
54680 N11 = N11.getOperand(0);
54681
54682 // Must be extending from vXi16.
54683 EVT InVT = N00.getValueType();
54684 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
54685 N10.getValueType() != InVT || N11.getValueType() != InVT)
54686 return SDValue();
54687
54688 // All inputs should be build_vectors.
54689 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54690 N01.getOpcode() != ISD::BUILD_VECTOR ||
54691 N10.getOpcode() != ISD::BUILD_VECTOR ||
54692 N11.getOpcode() != ISD::BUILD_VECTOR)
54693 return SDValue();
54694
54695 // For each element, we need to ensure we have an odd element from one vector
54696 // multiplied by the odd element of another vector and the even element from
54697 // one of the same vectors being multiplied by the even element from the
54698 // other vector. So we need to make sure for each element i, this operator
54699 // is being performed:
54700 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54701 SDValue In0, In1;
54702 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
54703 SDValue N00Elt = N00.getOperand(i);
54704 SDValue N01Elt = N01.getOperand(i);
54705 SDValue N10Elt = N10.getOperand(i);
54706 SDValue N11Elt = N11.getOperand(i);
54707 // TODO: Be more tolerant to undefs.
54708 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54709 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54710 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54711 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54712 return SDValue();
54713 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54714 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54715 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54716 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54717 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54718 return SDValue();
54719 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54720 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54721 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54722 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54723 // Add is commutative so indices can be reordered.
54724 if (IdxN00 > IdxN10) {
54725 std::swap(IdxN00, IdxN10);
54726 std::swap(IdxN01, IdxN11);
54727 }
54728 // N0 indices must be the even element. N1 indices must be the next odd element.
54729 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54730 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54731 return SDValue();
54732 SDValue N00In = N00Elt.getOperand(0);
54733 SDValue N01In = N01Elt.getOperand(0);
54734 SDValue N10In = N10Elt.getOperand(0);
54735 SDValue N11In = N11Elt.getOperand(0);
54736
54737 // First time we find an input capture it.
54738 if (!In0) {
54739 In0 = N00In;
54740 In1 = N01In;
54741
54742 // The input vectors must be at least as wide as the output.
54743 // If they are larger than the output, we extract subvector below.
54744 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
54745 In1.getValueSizeInBits() < VT.getSizeInBits())
54746 return SDValue();
54747 }
54748 // Mul is commutative so the input vectors can be in any order.
54749 // Canonicalize to make the compares easier.
54750 if (In0 != N00In)
54751 std::swap(N00In, N01In);
54752 if (In0 != N10In)
54753 std::swap(N10In, N11In);
54754 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54755 return SDValue();
54756 }
54757
54758 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54759 ArrayRef<SDValue> Ops) {
54760 EVT OpVT = Ops[0].getValueType();
54761 assert(OpVT.getScalarType() == MVT::i16 &&
54762 "Unexpected scalar element type");
54763 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54764 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54765 OpVT.getVectorNumElements() / 2);
54766 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54767 };
54768
54769 // If the output is narrower than an input, extract the low part of the input
54770 // vector.
54771 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54772 VT.getVectorNumElements() * 2);
54773 if (OutVT16.bitsLT(In0.getValueType())) {
54774 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54775 DAG.getIntPtrConstant(0, DL));
54776 }
54777 if (OutVT16.bitsLT(In1.getValueType())) {
54778 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54779 DAG.getIntPtrConstant(0, DL));
54780 }
54781 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54782 PMADDBuilder);
54783}
54784
54785// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54786// If upper element in each pair of both VPMADDWD are zero then we can merge
54787// the operand elements and use the implicit add of VPMADDWD.
54788// TODO: Add support for VPMADDUBSW (which isn't commutable).
54789static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54790 const SDLoc &DL, EVT VT) {
54791 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54792 return SDValue();
54793
54794 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54795 if (VT.getSizeInBits() > 128)
54796 return SDValue();
54797
54798 unsigned NumElts = VT.getVectorNumElements();
54799 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54800 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54801 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54802
54803 bool Op0HiZero =
54804 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54805 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54806 bool Op1HiZero =
54807 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54808 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54809
54810 // TODO: Check for zero lower elements once we have actual codegen that
54811 // creates them.
54812 if (!Op0HiZero || !Op1HiZero)
54813 return SDValue();
54814
54815 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54816 SmallVector<int> Mask;
54817 for (int i = 0; i != (int)NumElts; ++i) {
54818 Mask.push_back(2 * i);
54819 Mask.push_back(2 * (i + NumElts));
54820 }
54821
54822 SDValue LHS =
54823 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54824 SDValue RHS =
54825 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54826 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54827}
54828
54829/// CMOV of constants requires materializing constant operands in registers.
54830/// Try to fold those constants into an 'add' instruction to reduce instruction
54831 /// count. We do this with CMOV rather than the generic 'select' because there are
54832/// earlier folds that may be used to turn select-of-constants into logic hacks.
54833static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54834 const X86Subtarget &Subtarget) {
54835 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54836 // better because we eliminate 1-2 instructions. This transform is still
54837 // an improvement without zero operands because we trade 2 move constants and
54838 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54839 // immediate asm operands (fit in 32-bits).
54840 auto isSuitableCmov = [](SDValue V) {
54841 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54842 return false;
54843 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54844 !isa<ConstantSDNode>(V.getOperand(1)))
54845 return false;
54846 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54847 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54848 V.getConstantOperandAPInt(1).isSignedIntN(32));
54849 };
54850
54851 // Match an appropriate CMOV as the first operand of the add.
54852 SDValue Cmov = N->getOperand(0);
54853 SDValue OtherOp = N->getOperand(1);
54854 if (!isSuitableCmov(Cmov))
54855 std::swap(Cmov, OtherOp);
54856 if (!isSuitableCmov(Cmov))
54857 return SDValue();
54858
54859 // Don't remove a load folding opportunity for the add. That would neutralize
54860 // any improvements from removing constant materializations.
54861 if (X86::mayFoldLoad(OtherOp, Subtarget))
54862 return SDValue();
54863
54864 EVT VT = N->getValueType(0);
54865 SDLoc DL(N);
54866 SDValue FalseOp = Cmov.getOperand(0);
54867 SDValue TrueOp = Cmov.getOperand(1);
54868
54869 // We will push the add through the select, but we can potentially do better
54870 // if we know there is another add in the sequence and this is pointer math.
54871 // In that case, we can absorb an add into the trailing memory op and avoid
54872 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54873 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54874 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54875 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
54876 all_of(N->uses(), [&](SDNode *Use) {
54877 auto *MemNode = dyn_cast<MemSDNode>(Use);
54878 return MemNode && MemNode->getBasePtr().getNode() == N;
54879 })) {
54880 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
54881 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
54882 // it is possible that choosing op1 might be better.
54883 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
54884 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
54885 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
54886 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
54887 Cmov.getOperand(2), Cmov.getOperand(3));
54888 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
54889 }
54890
54891 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
54892 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
54893 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
54894 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
54895 Cmov.getOperand(3));
54896}
54897
54898static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
54899 TargetLowering::DAGCombinerInfo &DCI,
54900 const X86Subtarget &Subtarget) {
54901 EVT VT = N->getValueType(0);
54902 SDValue Op0 = N->getOperand(0);
54903 SDValue Op1 = N->getOperand(1);
54904 SDLoc DL(N);
54905
54906 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
54907 return Select;
54908
54909 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
54910 return MAdd;
54911 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
54912 return MAdd;
54913 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
54914 return MAdd;
54915
54916 // Try to synthesize horizontal adds from adds of shuffles.
54917 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
54918 return V;
54919
54920 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
54921 // (sub Y, (sext (vXi1 X))).
54922 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
54923 // generic DAG combine without a legal type check, but adding this there
54924 // caused regressions.
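       // (This works because a zero-extended vXi1 value Z is 0/1 per lane while its
       //  sign-extended form S is 0/-1, i.e. Z == -S, so add(Z, Y) == sub(Y, S).)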
54925 if (VT.isVector()) {
54926 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54927 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
54928 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54929 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
54930 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
54931 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
54932 }
54933
54934 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
54935 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54936 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
54937 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
54938 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
54939 }
54940 }
54941
54942 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
54943 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
54944 X86::isZeroNode(Op0.getOperand(1))) {
54945    assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
54946 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
54947 Op0.getOperand(0), Op0.getOperand(2));
54948 }
54949
54950 return combineAddOrSubToADCOrSBB(N, DAG);
54951}
54952
54953// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
54954// condition comes from the subtract node that produced -X. This matches the
54955// cmov expansion for absolute value. By swapping the operands we convert abs
54956// to nabs.
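     // Illustrative shape of the pattern:
     //   NegX = sub 0, X              // also defines the flags tested by the cmov
     //   Abs  = cmov X, NegX, S/NS    // abs(X) (operand order may vary)
     //   Res  = sub Y, Abs
     // Swapping the cmov operands turns abs(X) into -abs(X), so the outer sub
     // becomes an add:
     //   Res  = add Y, (cmov NegX, X, S/NS)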
54957static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
54958 SDValue N0 = N->getOperand(0);
54959 SDValue N1 = N->getOperand(1);
54960
54961 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
54962 return SDValue();
54963
54964 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
54965 if (CC != X86::COND_S && CC != X86::COND_NS)
54966 return SDValue();
54967
54968 // Condition should come from a negate operation.
54969 SDValue Cond = N1.getOperand(3);
54970 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
54971 return SDValue();
54972  assert(Cond.getResNo() == 1 && "Unexpected result number");
54973
54974 // Get the X and -X from the negate.
54975 SDValue NegX = Cond.getValue(0);
54976 SDValue X = Cond.getOperand(1);
54977
54978 SDValue FalseOp = N1.getOperand(0);
54979 SDValue TrueOp = N1.getOperand(1);
54980
54981 // Cmov operands should be X and NegX. Order doesn't matter.
54982 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
54983 return SDValue();
54984
54985 // Build a new CMOV with the operands swapped.
54986 SDLoc DL(N);
54987 MVT VT = N->getSimpleValueType(0);
54988 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
54989 N1.getOperand(2), Cond);
54990 // Convert sub to add.
54991 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
54992}
54993
54994static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
54995 TargetLowering::DAGCombinerInfo &DCI,
54996 const X86Subtarget &Subtarget) {
54997 SDValue Op0 = N->getOperand(0);
54998 SDValue Op1 = N->getOperand(1);
54999
55000 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55001 auto IsNonOpaqueConstant = [&](SDValue Op) {
55002 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55003 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55004 return !Cst->isOpaque();
55005 return true;
55006 }
55007 return false;
55008 };
55009
55010 // X86 can't encode an immediate LHS of a sub. See if we can push the
55011  // negation into a preceding instruction. If the RHS of the sub is an XOR with
55012 // one use and a constant, invert the immediate, saving one register.
55013 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
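       // (This holds because -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1, so
       //  C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1).)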
55014 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55015 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
55016 SDLoc DL(N);
55017 EVT VT = Op0.getValueType();
55018 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55019 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55020 SDValue NewAdd =
55021 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55022 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55023 }
55024
55025 if (SDValue V = combineSubABS(N, DAG))
55026 return V;
55027
55028 // Try to synthesize horizontal subs from subs of shuffles.
55029 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55030 return V;
55031
55032 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55033 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55034 X86::isZeroNode(Op1.getOperand(1))) {
55035    assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55036 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55037 Op1.getOperand(0), Op1.getOperand(2));
55038 }
55039
55040 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55041 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
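       // (SBB(Y,Z,W) computes Y - Z - W with borrow W, so
       //  X - (Y - Z - W) == (X + Z + W) - Y, i.e. SUB(ADC(X,Z,W), Y).)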
55042 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55043 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55044    assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55045 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55046 Op1.getOperand(1), Op1.getOperand(2));
55047 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55048 Op1.getOperand(0));
55049 }
55050
55051 return combineAddOrSubToADCOrSBB(N, DAG);
55052}
55053
55054static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55055 const X86Subtarget &Subtarget) {
55056 MVT VT = N->getSimpleValueType(0);
55057 SDLoc DL(N);
55058
55059 if (N->getOperand(0) == N->getOperand(1)) {
55060 if (N->getOpcode() == X86ISD::PCMPEQ)
55061 return DAG.getConstant(-1, DL, VT);
55062 if (N->getOpcode() == X86ISD::PCMPGT)
55063 return DAG.getConstant(0, DL, VT);
55064 }
55065
55066 return SDValue();
55067}
55068
55069/// Helper that combines an array of subvector ops as if they were the operands
55070/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55071/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55072static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55073 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55074 TargetLowering::DAGCombinerInfo &DCI,
55075 const X86Subtarget &Subtarget) {
55076  assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55077 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55078
55079 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55080 return DAG.getUNDEF(VT);
55081
55082 if (llvm::all_of(Ops, [](SDValue Op) {
55083 return ISD::isBuildVectorAllZeros(Op.getNode());
55084 }))
55085 return getZeroVector(VT, Subtarget, DAG, DL);
55086
55087 SDValue Op0 = Ops[0];
55088 bool IsSplat = llvm::all_equal(Ops);
55089
55090 // Repeated subvectors.
55091 if (IsSplat &&
55092 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55093 // If this broadcast is inserted into both halves, use a larger broadcast.
55094 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55095 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55096
55097    // If this simple subvector load or scalar/subvector broadcast_load is inserted
55098 // into both halves, use a larger broadcast_load. Update other uses to use
55099 // an extracted subvector.
55100 if (ISD::isNormalLoad(Op0.getNode()) ||
55101 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55102 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55103 auto *Mem = cast<MemSDNode>(Op0);
55104 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55105 ? X86ISD::VBROADCAST_LOAD
55106 : X86ISD::SUBV_BROADCAST_LOAD;
55107 if (SDValue BcastLd =
55108 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55109 SDValue BcastSrc =
55110 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
55111 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
55112 return BcastLd;
55113 }
55114 }
55115
55116 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55117 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55118 (Subtarget.hasAVX2() ||
55119 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55120 VT.getScalarType(), Subtarget)))
55121 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55122 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55123 Op0.getOperand(0),
55124 DAG.getIntPtrConstant(0, DL)));
55125
55126 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55127 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55128 (Subtarget.hasAVX2() ||
55129 (EltSizeInBits >= 32 &&
55130 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55131 Op0.getOperand(0).getValueType() == VT.getScalarType())
55132 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55133
55134 // concat_vectors(extract_subvector(broadcast(x)),
55135 // extract_subvector(broadcast(x))) -> broadcast(x)
55136 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55137 Op0.getOperand(0).getValueType() == VT) {
55138 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
55139 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
55140 return Op0.getOperand(0);
55141 }
55142 }
55143
55144 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55145  // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
55146 // TODO: This should go in combineX86ShufflesRecursively eventually.
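       // For example (illustrative, v8f32 sources):
       //   concat(extract_subvector(v0, 4), extract_subvector(v1, 4))
       //   -> vperm2x128 v0, v1, 0x31    // imm 0x31 selects the high 128-bit half
       //                                 // of each source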
55147 if (VT.is256BitVector() && Ops.size() == 2) {
55148 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55149 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55150 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55151 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55152 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55153 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55154 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55155 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55156 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55157 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55158 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55159 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55160 DAG.getBitcast(VT, Src0.getOperand(0)),
55161 DAG.getBitcast(VT, Src1.getOperand(0)),
55162 DAG.getTargetConstant(0x31, DL, MVT::i8));
55163 }
55164 }
55165 }
55166
55167 // Repeated opcode.
55168 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55169 // but it currently struggles with different vector widths.
55170 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55171 return Op.getOpcode() == Op0.getOpcode();
55172 })) {
55173 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55174 SmallVector<SDValue> Subs;
55175 for (SDValue SubOp : SubOps)
55176 Subs.push_back(SubOp.getOperand(I));
55177 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55178 };
55179 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55180 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55181 SDValue Sub = SubOps[I].getOperand(Op);
55182 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55183 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
55184 Sub.getOperand(0).getValueType() != VT ||
55185 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
55186 return false;
55187 }
55188 return true;
55189 };
55190
55191 unsigned NumOps = Ops.size();
55192 switch (Op0.getOpcode()) {
55193 case X86ISD::VBROADCAST: {
55194 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55195 return Op.getOperand(0).getValueType().is128BitVector();
55196 })) {
55197 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55198 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55199 ConcatSubOperand(VT, Ops, 0),
55200 ConcatSubOperand(VT, Ops, 0));
55201 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55202 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55203 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55204 : X86ISD::PSHUFD,
55205 DL, VT, ConcatSubOperand(VT, Ops, 0),
55206 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55207 }
55208 break;
55209 }
55210 case X86ISD::MOVDDUP:
55211 case X86ISD::MOVSHDUP:
55212 case X86ISD::MOVSLDUP: {
55213 if (!IsSplat)
55214 return DAG.getNode(Op0.getOpcode(), DL, VT,
55215 ConcatSubOperand(VT, Ops, 0));
55216 break;
55217 }
55218 case X86ISD::SHUFP: {
55219 // Add SHUFPD support if/when necessary.
55220 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55221 llvm::all_of(Ops, [Op0](SDValue Op) {
55222 return Op.getOperand(2) == Op0.getOperand(2);
55223 })) {
55224 return DAG.getNode(Op0.getOpcode(), DL, VT,
55225 ConcatSubOperand(VT, Ops, 0),
55226 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55227 }
55228 break;
55229 }
55230 case X86ISD::PSHUFHW:
55231 case X86ISD::PSHUFLW:
55232 case X86ISD::PSHUFD:
55233 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55234 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55235 return DAG.getNode(Op0.getOpcode(), DL, VT,
55236 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55237 }
55238 [[fallthrough]];
55239 case X86ISD::VPERMILPI:
55240 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
55241 Op0.getOperand(1) == Ops[1].getOperand(1)) {
55242 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
55243 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
55244 Op0.getOperand(1));
55245 return DAG.getBitcast(VT, Res);
55246 }
55247 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55248 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55249 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
55250 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55251 return DAG.getNode(Op0.getOpcode(), DL, VT,
55252 ConcatSubOperand(VT, Ops, 0),
55253 DAG.getTargetConstant(Idx, DL, MVT::i8));
55254 }
55255 break;
55256 case X86ISD::PSHUFB:
55257 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55258 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55259 return DAG.getNode(Op0.getOpcode(), DL, VT,
55260 ConcatSubOperand(VT, Ops, 0),
55261 ConcatSubOperand(VT, Ops, 1));
55262 }
55263 break;
55264 case X86ISD::VPERMV3:
55265 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55266 MVT OpVT = Op0.getSimpleValueType();
55267 int NumSrcElts = OpVT.getVectorNumElements();
55268 SmallVector<int, 64> ConcatMask;
55269 for (unsigned i = 0; i != NumOps; ++i) {
55270 SmallVector<int, 64> SubMask;
55271 SmallVector<SDValue, 2> SubOps;
55272 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
55273 SubMask))
55274 break;
55275 for (int M : SubMask) {
55276 if (0 <= M) {
55277 M += M < NumSrcElts ? 0 : NumSrcElts;
55278 M += i * NumSrcElts;
55279 }
55280 ConcatMask.push_back(M);
55281 }
55282 }
55283 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55284 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55285 Ops[1].getOperand(0), DAG, DL);
55286 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55287 Ops[1].getOperand(2), DAG, DL);
55288 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
55289 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55290 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55291 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55292 }
55293 }
55294 break;
55295 case X86ISD::VSHLI:
55296 case X86ISD::VSRLI:
55297 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55298 // TODO: Move this to LowerShiftByScalarImmediate?
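           // (Shifting a 64-bit lane left by 32 moves its low 32-bit half into the
           //  high half and zeroes the low half; viewed as v8i32 that is the shuffle
           //  {Z,0, Z,2, Z,4, Z,6} against a zero vector Z. SRL by 32 is the mirror
           //  image, {1,Z, 3,Z, 5,Z, 7,Z}.)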
55299 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55300 llvm::all_of(Ops, [](SDValue Op) {
55301 return Op.getConstantOperandAPInt(1) == 32;
55302 })) {
55303 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55304 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55305 if (Op0.getOpcode() == X86ISD::VSHLI) {
55306 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55307 {8, 0, 8, 2, 8, 4, 8, 6});
55308 } else {
55309 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55310 {1, 8, 3, 8, 5, 8, 7, 8});
55311 }
55312 return DAG.getBitcast(VT, Res);
55313 }
55314 [[fallthrough]];
55315 case X86ISD::VSRAI:
55316 case X86ISD::VSHL:
55317 case X86ISD::VSRL:
55318 case X86ISD::VSRA:
55319 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55320 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55321 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55322 llvm::all_of(Ops, [Op0](SDValue Op) {
55323 return Op0.getOperand(1) == Op.getOperand(1);
55324 })) {
55325 return DAG.getNode(Op0.getOpcode(), DL, VT,
55326 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55327 }
55328 break;
55329 case X86ISD::VPERMI:
55330 case X86ISD::VROTLI:
55331 case X86ISD::VROTRI:
55332 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55333 llvm::all_of(Ops, [Op0](SDValue Op) {
55334 return Op0.getOperand(1) == Op.getOperand(1);
55335 })) {
55336 return DAG.getNode(Op0.getOpcode(), DL, VT,
55337 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55338 }
55339 break;
55340 case ISD::AND:
55341 case ISD::OR:
55342 case ISD::XOR:
55343 case X86ISD::ANDNP:
55344 // TODO: Add 256-bit support.
55345 if (!IsSplat && VT.is512BitVector()) {
55346 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55347 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55348 NumOps * SrcVT.getVectorNumElements());
55349 return DAG.getNode(Op0.getOpcode(), DL, VT,
55350 ConcatSubOperand(SrcVT, Ops, 0),
55351 ConcatSubOperand(SrcVT, Ops, 1));
55352 }
55353 break;
55354 case X86ISD::GF2P8AFFINEQB:
55355 if (!IsSplat &&
55356 (VT.is256BitVector() ||
55357 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55358 llvm::all_of(Ops, [Op0](SDValue Op) {
55359 return Op0.getOperand(2) == Op.getOperand(2);
55360 })) {
55361 return DAG.getNode(Op0.getOpcode(), DL, VT,
55362 ConcatSubOperand(VT, Ops, 0),
55363 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55364 }
55365 break;
55366 case X86ISD::HADD:
55367 case X86ISD::HSUB:
55368 case X86ISD::FHADD:
55369 case X86ISD::FHSUB:
55370 case X86ISD::PACKSS:
55371 case X86ISD::PACKUS:
55372 if (!IsSplat && VT.is256BitVector() &&
55373 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55374 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55375 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55376 NumOps * SrcVT.getVectorNumElements());
55377 return DAG.getNode(Op0.getOpcode(), DL, VT,
55378 ConcatSubOperand(SrcVT, Ops, 0),
55379 ConcatSubOperand(SrcVT, Ops, 1));
55380 }
55381 break;
55382 case X86ISD::PALIGNR:
55383 if (!IsSplat &&
55384 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55385 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55386 llvm::all_of(Ops, [Op0](SDValue Op) {
55387 return Op0.getOperand(2) == Op.getOperand(2);
55388 })) {
55389 return DAG.getNode(Op0.getOpcode(), DL, VT,
55390 ConcatSubOperand(VT, Ops, 0),
55391 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55392 }
55393 break;
55394 case ISD::VSELECT:
55395 case X86ISD::BLENDV:
55396 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
55397 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasInt256()) &&
55398 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55399 EVT SelVT = Ops[0].getOperand(0).getValueType();
55400 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
55401 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
55402 return DAG.getNode(Op0.getOpcode(), DL, VT,
55403 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55404 ConcatSubOperand(VT, Ops, 1),
55405 ConcatSubOperand(VT, Ops, 2));
55406 }
55407 break;
55408 }
55409 }
55410
55411 // Fold subvector loads into one.
55412 // If needed, look through bitcasts to get to the load.
55413 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55414 unsigned Fast;
55415 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55416 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
55417 *FirstLd->getMemOperand(), &Fast) &&
55418 Fast) {
55419 if (SDValue Ld =
55420 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55421 return Ld;
55422 }
55423 }
55424
55425 // Attempt to fold target constant loads.
55426 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55427 SmallVector<APInt> EltBits;
55428 APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
55429 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
55430 APInt OpUndefElts;
55431 SmallVector<APInt> OpEltBits;
55432 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55433 OpEltBits, true, false))
55434 break;
55435 EltBits.append(OpEltBits);
55436 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55437 }
55438 if (EltBits.size() == VT.getVectorNumElements())
55439 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
55440 }
55441
55442 return SDValue();
55443}
55444
55445static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
55446 TargetLowering::DAGCombinerInfo &DCI,
55447 const X86Subtarget &Subtarget) {
55448 EVT VT = N->getValueType(0);
55449 EVT SrcVT = N->getOperand(0).getValueType();
55450 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55451
55452 // Don't do anything for i1 vectors.
55453 if (VT.getVectorElementType() == MVT::i1)
55454 return SDValue();
55455
55456 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
55457 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
55458 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
55459 DCI, Subtarget))
55460 return R;
55461 }
55462
55463 return SDValue();
55464}
55465
55466static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55467 TargetLowering::DAGCombinerInfo &DCI,
55468 const X86Subtarget &Subtarget) {
55469 if (DCI.isBeforeLegalizeOps())
55470 return SDValue();
55471
55472 MVT OpVT = N->getSimpleValueType(0);
55473
55474 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
55475
55476 SDLoc dl(N);
55477 SDValue Vec = N->getOperand(0);
55478 SDValue SubVec = N->getOperand(1);
55479
55480 uint64_t IdxVal = N->getConstantOperandVal(2);
55481 MVT SubVecVT = SubVec.getSimpleValueType();
55482
55483 if (Vec.isUndef() && SubVec.isUndef())
55484 return DAG.getUNDEF(OpVT);
55485
55486 // Inserting undefs/zeros into zeros/undefs is a zero vector.
55487 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
55488 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
55489 return getZeroVector(OpVT, Subtarget, DAG, dl);
55490
55491 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
55492 // If we're inserting into a zero vector and then into a larger zero vector,
55493 // just insert into the larger zero vector directly.
55494 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55495 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
55496 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
55497 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55498 getZeroVector(OpVT, Subtarget, DAG, dl),
55499 SubVec.getOperand(1),
55500 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
55501 }
55502
55503    // If we're inserting into a zero vector and our input was extracted from an
55504    // insert into a zero vector of the same type, and the extraction was at
55505    // least as large as the original insertion, just insert the original
55506    // subvector into a zero vector.
55507 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
55508 isNullConstant(SubVec.getOperand(1)) &&
55509 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55510 SDValue Ins = SubVec.getOperand(0);
55511 if (isNullConstant(Ins.getOperand(2)) &&
55512 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55513 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55514 SubVecVT.getFixedSizeInBits())
55515 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55516 getZeroVector(OpVT, Subtarget, DAG, dl),
55517 Ins.getOperand(1), N->getOperand(2));
55518 }
55519 }
55520
55521 // Stop here if this is an i1 vector.
55522 if (IsI1Vector)
55523 return SDValue();
55524
55525 // If this is an insert of an extract, combine to a shuffle. Don't do this
55526 // if the insert or extract can be represented with a subregister operation.
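       // For example (illustrative, v8i32): inserting extract_subvector(X, 4) at
       // index 0 of Vec becomes shuffle(Vec, X, {12,13,14,15, 4,5,6,7}), i.e. the
       // inserted lanes come from X's upper half and the rest keep Vec's identity.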
55527 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55528 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55529 (IdxVal != 0 ||
55530 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55531 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55532 if (ExtIdxVal != 0) {
55533 int VecNumElts = OpVT.getVectorNumElements();
55534 int SubVecNumElts = SubVecVT.getVectorNumElements();
55535 SmallVector<int, 64> Mask(VecNumElts);
55536 // First create an identity shuffle mask.
55537 for (int i = 0; i != VecNumElts; ++i)
55538 Mask[i] = i;
55539 // Now insert the extracted portion.
55540 for (int i = 0; i != SubVecNumElts; ++i)
55541 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55542
55543 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55544 }
55545 }
55546
55547 // Match concat_vector style patterns.
55548 SmallVector<SDValue, 2> SubVectorOps;
55549 if (collectConcatOps(N, SubVectorOps, DAG)) {
55550 if (SDValue Fold =
55551 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55552 return Fold;
55553
55554 // If we're inserting all zeros into the upper half, change this to
55555 // a concat with zero. We will match this to a move
55556 // with implicit upper bit zeroing during isel.
55557 // We do this here because we don't want combineConcatVectorOps to
55558 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55559 if (SubVectorOps.size() == 2 &&
55560 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55561 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55562 getZeroVector(OpVT, Subtarget, DAG, dl),
55563 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
55564 }
55565
55566 // If this is a broadcast insert into an upper undef, use a larger broadcast.
55567 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
55568 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
55569
55570 // If this is a broadcast load inserted into an upper undef, use a larger
55571 // broadcast load.
55572 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
55573 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
55574 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
55575 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
55576 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
55577 SDValue BcastLd =
55578 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
55579 MemIntr->getMemoryVT(),
55580 MemIntr->getMemOperand());
55581 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
55582 return BcastLd;
55583 }
55584
55585 // If we're splatting the lower half subvector of a full vector load into the
55586 // upper half, attempt to create a subvector broadcast.
55587 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
55588 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
55589 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
55590 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
55591 if (VecLd && SubLd &&
55592 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
55593 SubVec.getValueSizeInBits() / 8, 0))
55594 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
55595 SubLd, 0, DAG);
55596 }
55597
55598 return SDValue();
55599}
55600
55601/// If we are extracting a subvector of a vector select and the select condition
55602/// is composed of concatenated vectors, try to narrow the select width. This
55603/// is a common pattern for AVX1 integer code because 256-bit selects may be
55604/// legal, but there is almost no integer math/logic available for 256-bit.
55605/// This function should only be called with legal types (otherwise, the calls
55606/// to get simple value types will assert).
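     /// Illustrative example: for a 256-bit select whose condition is a concat,
     ///   extract_subvector (vselect (concat C0, C1), T, F), 4
     /// becomes a 128-bit select on the matching halves:
     ///   bitcast (vselect C1, extract_subvector(T, 4), extract_subvector(F, 4))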
55607static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
55608 SDValue Sel = Ext->getOperand(0);
55609 SmallVector<SDValue, 4> CatOps;
55610 if (Sel.getOpcode() != ISD::VSELECT ||
55611 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
55612 return SDValue();
55613
55614 // Note: We assume simple value types because this should only be called with
55615 // legal operations/types.
55616 // TODO: This can be extended to handle extraction to 256-bits.
55617 MVT VT = Ext->getSimpleValueType(0);
55618 if (!VT.is128BitVector())
55619 return SDValue();
55620
55621 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
55622 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
55623 return SDValue();
55624
55625 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
55626 MVT SelVT = Sel.getSimpleValueType();
55627  assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
55628         "Unexpected vector type with legal operations");
55629
55630 unsigned SelElts = SelVT.getVectorNumElements();
55631 unsigned CastedElts = WideVT.getVectorNumElements();
55632 unsigned ExtIdx = Ext->getConstantOperandVal(1);
55633 if (SelElts % CastedElts == 0) {
55634 // The select has the same or more (narrower) elements than the extract
55635 // operand. The extraction index gets scaled by that factor.
55636 ExtIdx *= (SelElts / CastedElts);
55637 } else if (CastedElts % SelElts == 0) {
55638 // The select has less (wider) elements than the extract operand. Make sure
55639 // that the extraction index can be divided evenly.
55640 unsigned IndexDivisor = CastedElts / SelElts;
55641 if (ExtIdx % IndexDivisor != 0)
55642 return SDValue();
55643 ExtIdx /= IndexDivisor;
55644 } else {
55645    llvm_unreachable("Element count of simple vector types are not divisible?");
55646 }
55647
55648 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
55649 unsigned NarrowElts = SelElts / NarrowingFactor;
55650 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
55651 SDLoc DL(Ext);
55652 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
55653 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
55654 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
55655 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
55656 return DAG.getBitcast(VT, NarrowSel);
55657}
55658
55659static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55660 TargetLowering::DAGCombinerInfo &DCI,
55661 const X86Subtarget &Subtarget) {
55662 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
55663 // eventually get combined/lowered into ANDNP) with a concatenated operand,
55664 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
55665 // We let generic combining take over from there to simplify the
55666 // insert/extract and 'not'.
55667 // This pattern emerges during AVX1 legalization. We handle it before lowering
55668 // to avoid complications like splitting constant vector loads.
55669
55670 // Capture the original wide type in the likely case that we need to bitcast
55671 // back to this type.
55672 if (!N->getValueType(0).isSimple())
55673 return SDValue();
55674
55675 MVT VT = N->getSimpleValueType(0);
55676 SDValue InVec = N->getOperand(0);
55677 unsigned IdxVal = N->getConstantOperandVal(1);
55678 SDValue InVecBC = peekThroughBitcasts(InVec);
55679 EVT InVecVT = InVec.getValueType();
55680 unsigned SizeInBits = VT.getSizeInBits();
55681 unsigned InSizeInBits = InVecVT.getSizeInBits();
55682 unsigned NumSubElts = VT.getVectorNumElements();
55683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55684
55685 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
55686 TLI.isTypeLegal(InVecVT) &&
55687 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
55688 auto isConcatenatedNot = [](SDValue V) {
55689 V = peekThroughBitcasts(V);
55690 if (!isBitwiseNot(V))
55691 return false;
55692 SDValue NotOp = V->getOperand(0);
55693 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
55694 };
55695 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
55696 isConcatenatedNot(InVecBC.getOperand(1))) {
55697 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
55698 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
55699 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
55700 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
55701 }
55702 }
55703
55704 if (DCI.isBeforeLegalizeOps())
55705 return SDValue();
55706
55707 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
55708 return V;
55709
55710 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
55711 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55712
55713 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
55714 if (VT.getScalarType() == MVT::i1)
55715 return DAG.getConstant(1, SDLoc(N), VT);
55716 return getOnesVector(VT, DAG, SDLoc(N));
55717 }
55718
55719 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
55720 return DAG.getBuildVector(VT, SDLoc(N),
55721 InVec->ops().slice(IdxVal, NumSubElts));
55722
55723  // If we are extracting from an insert into a larger vector, replace with a
55724  // smaller insert as long as we don't access less than the original inserted
55725  // subvector. Don't do this for i1 vectors.
55726 // TODO: Relax the matching indices requirement?
55727 if (VT.getVectorElementType() != MVT::i1 &&
55728 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
55729 IdxVal == InVec.getConstantOperandVal(2) &&
55730 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
55731 SDLoc DL(N);
55732 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
55733 InVec.getOperand(0), N->getOperand(1));
55734 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
55735 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
55736 InVec.getOperand(1),
55737 DAG.getVectorIdxConstant(NewIdxVal, DL));
55738 }
55739
55740  // If we're extracting an upper subvector from a broadcast, we should just
55741  // extract the lowest subvector instead, which should allow
55742  // SimplifyDemandedVectorElts to do more simplifications.
55743 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
55744 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55745 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
55746 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55747
55748 // If we're extracting a broadcasted subvector, just use the lowest subvector.
55749 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55750 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
55751 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55752
55753 // Attempt to extract from the source of a shuffle vector.
55754 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
55755 SmallVector<int, 32> ShuffleMask;
55756 SmallVector<int, 32> ScaledMask;
55757 SmallVector<SDValue, 2> ShuffleInputs;
55758 unsigned NumSubVecs = InSizeInBits / SizeInBits;
55759    // Decode the shuffle mask and scale it so it's shuffling whole subvectors.
55760 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
55761 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
55762 unsigned SubVecIdx = IdxVal / NumSubElts;
55763 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
55764 return DAG.getUNDEF(VT);
55765 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
55766 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55767 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
55768 if (Src.getValueSizeInBits() == InSizeInBits) {
55769 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
55770 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
55771 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
55772 SDLoc(N), SizeInBits);
55773 }
55774 }
55775 }
55776
55777  // If we're extracting the lowest subvector and the source has only one use,
55778  // we may be able to perform this with a smaller vector width.
55779 unsigned InOpcode = InVec.getOpcode();
55780 if (InVec.hasOneUse()) {
55781 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
55782 // v2f64 CVTDQ2PD(v4i32).
55783 if (InOpcode == ISD::SINT_TO_FP &&
55784 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55785 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
55786 }
55787 // v2f64 CVTUDQ2PD(v4i32).
55788 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
55789 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55790 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
55791 }
55792 // v2f64 CVTPS2PD(v4f32).
55793 if (InOpcode == ISD::FP_EXTEND &&
55794 InVec.getOperand(0).getValueType() == MVT::v4f32) {
55795 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
55796 }
55797 }
55798 if (IdxVal == 0 &&
55799 (InOpcode == ISD::ANY_EXTEND ||
55800 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
55801 InOpcode == ISD::ZERO_EXTEND ||
55802 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
55803 InOpcode == ISD::SIGN_EXTEND ||
55804 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55805 (SizeInBits == 128 || SizeInBits == 256) &&
55806 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
55807 SDLoc DL(N);
55808 SDValue Ext = InVec.getOperand(0);
55809 if (Ext.getValueSizeInBits() > SizeInBits)
55810 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
55811 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
55812 return DAG.getNode(ExtOp, DL, VT, Ext);
55813 }
55814 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
55815 InVec.getOperand(0).getValueType().is256BitVector() &&
55816 InVec.getOperand(1).getValueType().is256BitVector() &&
55817 InVec.getOperand(2).getValueType().is256BitVector()) {
55818 SDLoc DL(N);
55819 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
55820 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
55821 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
55822 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
55823 }
55824 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
55825 (VT.is128BitVector() || VT.is256BitVector())) {
55826 SDLoc DL(N);
55827 SDValue InVecSrc = InVec.getOperand(0);
55828 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
55829 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
55830 return DAG.getNode(InOpcode, DL, VT, Ext);
55831 }
55832 if (InOpcode == X86ISD::MOVDDUP &&
55833 (VT.is128BitVector() || VT.is256BitVector())) {
55834 SDLoc DL(N);
55835 SDValue Ext0 =
55836 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55837 return DAG.getNode(InOpcode, DL, VT, Ext0);
55838 }
55839 }
55840
55841 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
55842 // as this is very likely to fold into a shuffle/truncation.
55843 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
55844 InVecVT.getScalarSizeInBits() == 64 &&
55845 InVec.getConstantOperandAPInt(1) == 32) {
55846 SDLoc DL(N);
55847 SDValue Ext =
55848 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55849 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
55850 }
55851
55852 return SDValue();
55853}
55854
55855static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
55856 EVT VT = N->getValueType(0);
55857 SDValue Src = N->getOperand(0);
55858 SDLoc DL(N);
55859
55860  // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
55861 // This occurs frequently in our masked scalar intrinsic code and our
55862 // floating point select lowering with AVX512.
55863 // TODO: SimplifyDemandedBits instead?
55864 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
55865 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
55866 if (C->getAPIntValue().isOne())
55867 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
55868 Src.getOperand(0));
55869
55870 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
55871 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55872 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
55873 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
55874 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
55875 if (C->isZero())
55876 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
55877 Src.getOperand(1));
55878
55879  // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
55880 // TODO: Move to DAGCombine/SimplifyDemandedBits?
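       // e.g. (v2i64 scalar_to_vector (i64 zext i32 X)) can instead be built as
       // (v4i32 scalar_to_vector X), with VZEXT_MOVL clearing the other lanes,
       // and then bitcast back to v2i64.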
55881 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
55882 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
55883 if (Op.getValueType() != MVT::i64)
55884 return SDValue();
55885 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
55886 if (Op.getOpcode() == Opc &&
55887 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
55888 return Op.getOperand(0);
55889 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
55890 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
55891 if (Ld->getExtensionType() == Ext &&
55892 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
55893 return Op;
55894 if (IsZeroExt) {
55895 KnownBits Known = DAG.computeKnownBits(Op);
55896 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
55897 return Op;
55898 }
55899 return SDValue();
55900 };
55901
55902 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
55903 return DAG.getBitcast(
55904 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55905 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
55906
55907 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
55908 return DAG.getBitcast(
55909 VT,
55910 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
55911 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55912 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
55913 }
55914
55915 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
55916 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
55917 Src.getOperand(0).getValueType() == MVT::x86mmx)
55918 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
55919
55920 // See if we're broadcasting the scalar value, in which case just reuse that.
55921 // Ensure the same SDValue from the SDNode use is being used.
55922 if (VT.getScalarType() == Src.getValueType())
55923 for (SDNode *User : Src->uses())
55924 if (User->getOpcode() == X86ISD::VBROADCAST &&
55925 Src == User->getOperand(0)) {
55926 unsigned SizeInBits = VT.getFixedSizeInBits();
55927 unsigned BroadcastSizeInBits =
55928 User->getValueSizeInBits(0).getFixedValue();
55929 if (BroadcastSizeInBits == SizeInBits)
55930 return SDValue(User, 0);
55931 if (BroadcastSizeInBits > SizeInBits)
55932 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
55933 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
55934 // coverage.
55935 }
55936
55937 return SDValue();
55938}
55939
55940// Simplify PMULDQ and PMULUDQ operations.
55941static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
55942 TargetLowering::DAGCombinerInfo &DCI,
55943 const X86Subtarget &Subtarget) {
55944 SDValue LHS = N->getOperand(0);
55945 SDValue RHS = N->getOperand(1);
55946
55947 // Canonicalize constant to RHS.
55948 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
55949 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
55950 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
55951
55952 // Multiply by zero.
55953 // Don't return RHS as it may contain UNDEFs.
55954 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
55955 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
55956
55957 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
55958 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55959 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
55960 return SDValue(N, 0);
55961
55962 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
55963 // convert it to any_extend_invec, due to the LegalOperations check, do the
55964 // conversion directly to a vector shuffle manually. This exposes combine
55965 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
55966 // combineX86ShufflesRecursively on SSE4.1 targets.
55967 // FIXME: This is basically a hack around several other issues related to
55968 // ANY_EXTEND_VECTOR_INREG.
55969 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
55970 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55971 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55972 LHS.getOperand(0).getValueType() == MVT::v4i32) {
55973 SDLoc dl(N);
55974 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
55975 LHS.getOperand(0), { 0, -1, 1, -1 });
55976 LHS = DAG.getBitcast(MVT::v2i64, LHS);
55977 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55978 }
55979 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
55980 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55981 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55982 RHS.getOperand(0).getValueType() == MVT::v4i32) {
55983 SDLoc dl(N);
55984 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
55985 RHS.getOperand(0), { 0, -1, 1, -1 });
55986 RHS = DAG.getBitcast(MVT::v2i64, RHS);
55987 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55988 }
55989
55990 return SDValue();
55991}
55992
55993// Simplify VPMADDUBSW/VPMADDWD operations.
55994static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
55995 TargetLowering::DAGCombinerInfo &DCI) {
55996 EVT VT = N->getValueType(0);
55997 SDValue LHS = N->getOperand(0);
55998 SDValue RHS = N->getOperand(1);
55999
56000 // Multiply by zero.
56001 // Don't return LHS/RHS as it may contain UNDEFs.
56002 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56003 ISD::isBuildVectorAllZeros(RHS.getNode()))
56004 return DAG.getConstant(0, SDLoc(N), VT);
56005
56006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56007 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56008 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56009 return SDValue(N, 0);
56010
56011 return SDValue();
56012}
56013
56014static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56015 TargetLowering::DAGCombinerInfo &DCI,
56016 const X86Subtarget &Subtarget) {
56017 EVT VT = N->getValueType(0);
56018 SDValue In = N->getOperand(0);
56019 unsigned Opcode = N->getOpcode();
56020 unsigned InOpcode = In.getOpcode();
56021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56022 SDLoc DL(N);
56023
56024 // Try to merge vector loads and extend_inreg to an extload.
56025 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56026 In.hasOneUse()) {
56027 auto *Ld = cast<LoadSDNode>(In);
56028 if (Ld->isSimple()) {
56029 MVT SVT = In.getSimpleValueType().getVectorElementType();
56030 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56031 ? ISD::SEXTLOAD
56032 : ISD::ZEXTLOAD;
56033 EVT MemVT = VT.changeVectorElementType(SVT);
56034 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56035 SDValue Load = DAG.getExtLoad(
56036 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56037 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56038 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56039 return Load;
56040 }
56041 }
56042 }
56043
56044 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56045 if (Opcode == InOpcode)
56046 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56047
56048 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56049 // -> EXTEND_VECTOR_INREG(X).
56050 // TODO: Handle non-zero subvector indices.
56051 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56052 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56053 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56054 In.getValueSizeInBits())
56055 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56056
56057 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56058 // TODO: Move to DAGCombine?
56059 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56060 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56061 In.getValueSizeInBits() == VT.getSizeInBits()) {
56062 unsigned NumElts = VT.getVectorNumElements();
56063 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56064 EVT EltVT = In.getOperand(0).getValueType();
56065 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56066 for (unsigned I = 0; I != NumElts; ++I)
56067 Elts[I * Scale] = In.getOperand(I);
56068 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56069 }
56070
56071 // Attempt to combine as a shuffle on SSE41+ targets.
56072 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56073 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
56074 Subtarget.hasSSE41()) {
56075 SDValue Op(N, 0);
56076 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56077 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56078 return Res;
56079 }
56080
56081 return SDValue();
56082}
56083
56084static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56085 TargetLowering::DAGCombinerInfo &DCI) {
56086 EVT VT = N->getValueType(0);
56087
56088 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56089 return DAG.getConstant(0, SDLoc(N), VT);
56090
56091 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56092 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56093 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56094 return SDValue(N, 0);
56095
56096 return SDValue();
56097}
56098
56099// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56100// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
56101// extra instructions between the conversions due to going to scalar and back.
56102static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56103 const X86Subtarget &Subtarget) {
56104 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56105 return SDValue();
56106
56107 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56108 return SDValue();
56109
56110 if (N->getValueType(0) != MVT::f32 ||
56111 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56112 return SDValue();
56113
56114 SDLoc dl(N);
56115 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56116 N->getOperand(0).getOperand(0));
56117 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56118 DAG.getTargetConstant(4, dl, MVT::i32));
56119 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56120 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56121 DAG.getIntPtrConstant(0, dl));
56122}
56123
56124static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56125 const X86Subtarget &Subtarget) {
56126 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56127 return SDValue();
56128
56129 if (Subtarget.hasFP16())
56130 return SDValue();
56131
56132 bool IsStrict = N->isStrictFPOpcode();
56133 EVT VT = N->getValueType(0);
56134 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56135 EVT SrcVT = Src.getValueType();
56136
56137 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56138 return SDValue();
56139
56140 if (VT.getVectorElementType() != MVT::f32 &&
56141 VT.getVectorElementType() != MVT::f64)
56142 return SDValue();
56143
56144 unsigned NumElts = VT.getVectorNumElements();
56145 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56146 return SDValue();
56147
56148 SDLoc dl(N);
56149
56150 // Convert the input to vXi16.
56151 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56152 Src = DAG.getBitcast(IntVT, Src);
56153
56154 // Widen to at least 8 input elements.
56155 if (NumElts < 8) {
56156 unsigned NumConcats = 8 / NumElts;
56157 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56158 : DAG.getConstant(0, dl, IntVT);
56159 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56160 Ops[0] = Src;
56161 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56162 }
56163
56164 // Destination is vXf32 with at least 4 elements.
56165 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56166 std::max(4U, NumElts));
56167 SDValue Cvt, Chain;
56168 if (IsStrict) {
56169 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56170 {N->getOperand(0), Src});
56171 Chain = Cvt.getValue(1);
56172 } else {
56173 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56174 }
56175
56176 if (NumElts < 4) {
56177    assert(NumElts == 2 && "Unexpected size");
56178 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56179 DAG.getIntPtrConstant(0, dl));
56180 }
56181
56182 if (IsStrict) {
56183 // Extend to the original VT if necessary.
56184 if (Cvt.getValueType() != VT) {
56185 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56186 {Chain, Cvt});
56187 Chain = Cvt.getValue(1);
56188 }
56189 return DAG.getMergeValues({Cvt, Chain}, dl);
56190 }
56191
56192 // Extend to the original VT if necessary.
56193 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56194}
56195
56196// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56197// from. Limit this to cases where the loads have the same input chain and the
56198// output chains are unused. This avoids any memory ordering issues.
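// Illustrative example: given a (v4f32 VBROADCAST_LOAD<f32> %ptr) and a wider
// (v8f32 VBROADCAST_LOAD<f32> %ptr) with the same input chain and no users of
// either output chain, the narrower node is rewritten as an extract of the low
// 128 bits of the wider one.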
56199static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56200 TargetLowering::DAGCombinerInfo &DCI) {
56201  assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56202          N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56203         "Unknown broadcast load type");
56204
56205 // Only do this if the chain result is unused.
56206 if (N->hasAnyUseOfValue(1))
56207 return SDValue();
56208
56209 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56210
56211 SDValue Ptr = MemIntrin->getBasePtr();
56212 SDValue Chain = MemIntrin->getChain();
56213 EVT VT = N->getSimpleValueType(0);
56214 EVT MemVT = MemIntrin->getMemoryVT();
56215
56216 // Look at other users of our base pointer and try to find a wider broadcast.
56217 // The input chain and the size of the memory VT must match.
56218 for (SDNode *User : Ptr->uses())
56219 if (User != N && User->getOpcode() == N->getOpcode() &&
56220 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56221 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56222 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56223 MemVT.getSizeInBits() &&
56224 !User->hasAnyUseOfValue(1) &&
56225 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56226 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56227 VT.getSizeInBits());
56228 Extract = DAG.getBitcast(VT, Extract);
56229 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56230 }
56231
56232 return SDValue();
56233}
56234
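// Illustrative example of the widening below (assuming F16C but no AVX512-FP16):
// a (v2f16 fp_round (v2f32 X)) is padded with zeroes to v4f32, converted by a
// single CVTPS2PH to v8i16, then the low v2i16 is extracted and bitcast back to
// v2f16.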
56235static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56236 const X86Subtarget &Subtarget) {
56237 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56238 return SDValue();
56239
56240 if (Subtarget.hasFP16())
56241 return SDValue();
56242
56243 bool IsStrict = N->isStrictFPOpcode();
56244 EVT VT = N->getValueType(0);
56245 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56246 EVT SrcVT = Src.getValueType();
56247
56248 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56249 SrcVT.getVectorElementType() != MVT::f32)
56250 return SDValue();
56251
56252 unsigned NumElts = VT.getVectorNumElements();
56253 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56254 return SDValue();
56255
56256 SDLoc dl(N);
56257
56258 // Widen to at least 4 input elements.
56259 if (NumElts < 4)
56260 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56261 DAG.getConstantFP(0.0, dl, SrcVT));
56262
56263  // Destination is vXi16 with at least 8 elements.
56264 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56265 std::max(8U, NumElts));
56266 SDValue Cvt, Chain;
56267 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56268 if (IsStrict) {
56269 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56270 {N->getOperand(0), Src, Rnd});
56271 Chain = Cvt.getValue(1);
56272 } else {
56273 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56274 }
56275
56276 // Extract down to real number of elements.
56277 if (NumElts < 8) {
56278 EVT IntVT = VT.changeVectorElementTypeToInteger();
56279 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56280 DAG.getIntPtrConstant(0, dl));
56281 }
56282
56283 Cvt = DAG.getBitcast(VT, Cvt);
56284
56285 if (IsStrict)
56286 return DAG.getMergeValues({Cvt, Chain}, dl);
56287
56288 return Cvt;
56289}
56290
56291static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56292 SDValue Src = N->getOperand(0);
56293
56294 // Turn MOVDQ2Q+simple_load into an mmx load.
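  // Illustrative sketch: an (x86mmx MOVDQ2Q (v2i64 load %p)) whose load has no
  // other users becomes a plain (x86mmx load %p), with the new load's chain
  // replacing the old one.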
56295 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56296 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56297
56298 if (LN->isSimple()) {
56299 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56300 LN->getBasePtr(),
56301 LN->getPointerInfo(),
56302 LN->getOriginalAlign(),
56303 LN->getMemOperand()->getFlags());
56304 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56305 return NewLd;
56306 }
56307 }
56308
56309 return SDValue();
56310}
56311
56312static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56313 TargetLowering::DAGCombinerInfo &DCI) {
56314 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56315 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56316 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56317 return SDValue(N, 0);
56318
56319 return SDValue();
56320}
56321
56322SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56323 DAGCombinerInfo &DCI) const {
56324 SelectionDAG &DAG = DCI.DAG;
56325 switch (N->getOpcode()) {
56326 default: break;
56327 case ISD::SCALAR_TO_VECTOR:
56328 return combineScalarToVector(N, DAG);
56329 case ISD::EXTRACT_VECTOR_ELT:
56330 case X86ISD::PEXTRW:
56331 case X86ISD::PEXTRB:
56332 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
56333 case ISD::CONCAT_VECTORS:
56334 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
56335 case ISD::INSERT_SUBVECTOR:
56336 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
56337 case ISD::EXTRACT_SUBVECTOR:
56338 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
56339 case ISD::VSELECT:
56340 case ISD::SELECT:
56341 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
56342 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
56343 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
56344 case X86ISD::CMP: return combineCMP(N, DAG);
56345 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
56346 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
56347 case X86ISD::ADD:
56348 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
56349 case X86ISD::SBB: return combineSBB(N, DAG);
56350 case X86ISD::ADC: return combineADC(N, DAG, DCI);
56351 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
56352 case ISD::SHL: return combineShiftLeft(N, DAG);
56353 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
56354 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
56355 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
56356 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
56357 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
56358 case X86ISD::BEXTR:
56359 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
56360 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
56361 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
56362 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
56363 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
56364 case X86ISD::VEXTRACT_STORE:
56365 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
56366 case ISD::SINT_TO_FP:
56367 case ISD::STRICT_SINT_TO_FP:
56368 return combineSIntToFP(N, DAG, DCI, Subtarget);
56369 case ISD::UINT_TO_FP:
56370 case ISD::STRICT_UINT_TO_FP:
56371 return combineUIntToFP(N, DAG, Subtarget);
56372 case ISD::FADD:
56373 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
56374 case X86ISD::VFCMULC:
56375 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
56376 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
56377 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
56378 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
56379 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
56380 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
56381 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
56382 case X86ISD::FXOR:
56383 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
56384 case X86ISD::FMIN:
56385 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
56386 case ISD::FMINNUM:
56387 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
56388 case X86ISD::CVTSI2P:
56389 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
56390 case X86ISD::CVTP2SI:
56391 case X86ISD::CVTP2UI:
56392 case X86ISD::STRICT_CVTTP2SI:
56393 case X86ISD::CVTTP2SI:
56394 case X86ISD::STRICT_CVTTP2UI:
56395 case X86ISD::CVTTP2UI:
56396 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
56397 case X86ISD::STRICT_CVTPH2PS:
56398 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
56399 case X86ISD::BT: return combineBT(N, DAG, DCI);
56400 case ISD::ANY_EXTEND:
56401 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
56402 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
56403 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
56404 case ISD::ANY_EXTEND_VECTOR_INREG:
56405 case ISD::SIGN_EXTEND_VECTOR_INREG:
56406 case ISD::ZERO_EXTEND_VECTOR_INREG:
56407 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
56408 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
56409 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
56410 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
56411 case X86ISD::PACKSS:
56412 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
56413 case X86ISD::HADD:
56414 case X86ISD::HSUB:
56415 case X86ISD::FHADD:
56416 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
56417 case X86ISD::VSHL:
56418 case X86ISD::VSRA:
56419 case X86ISD::VSRL:
56420 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
56421 case X86ISD::VSHLI:
56422 case X86ISD::VSRAI:
56423 case X86ISD::VSRLI:
56424 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
56425 case ISD::INSERT_VECTOR_ELT:
56426 case X86ISD::PINSRB:
56427 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
56428 case X86ISD::SHUFP: // Handle all target specific shuffles
56429 case X86ISD::INSERTPS:
56430 case X86ISD::EXTRQI:
56431 case X86ISD::INSERTQI:
56432 case X86ISD::VALIGN:
56433 case X86ISD::PALIGNR:
56434 case X86ISD::VSHLDQ:
56435 case X86ISD::VSRLDQ:
56436 case X86ISD::BLENDI:
56437 case X86ISD::UNPCKH:
56438 case X86ISD::UNPCKL:
56439 case X86ISD::MOVHLPS:
56440 case X86ISD::MOVLHPS:
56441 case X86ISD::PSHUFB:
56442 case X86ISD::PSHUFD:
56443 case X86ISD::PSHUFHW:
56444 case X86ISD::PSHUFLW:
56445 case X86ISD::MOVSHDUP:
56446 case X86ISD::MOVSLDUP:
56447 case X86ISD::MOVDDUP:
56448 case X86ISD::MOVSS:
56449 case X86ISD::MOVSD:
56450 case X86ISD::MOVSH:
56451 case X86ISD::VBROADCAST:
56452 case X86ISD::VPPERM:
56453 case X86ISD::VPERMI:
56454 case X86ISD::VPERMV:
56455 case X86ISD::VPERMV3:
56456 case X86ISD::VPERMIL2:
56457 case X86ISD::VPERMILPI:
56458 case X86ISD::VPERMILPV:
56459 case X86ISD::VPERM2X128:
56460 case X86ISD::SHUF128:
56461 case X86ISD::VZEXT_MOVL:
56462  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
56463 case X86ISD::FMADD_RND:
56464 case X86ISD::FMSUB:
56465 case X86ISD::STRICT_FMSUB:
56466 case X86ISD::FMSUB_RND:
56467 case X86ISD::FNMADD:
56468 case X86ISD::STRICT_FNMADD:
56469 case X86ISD::FNMADD_RND:
56470 case X86ISD::FNMSUB:
56471 case X86ISD::STRICT_FNMSUB:
56472 case X86ISD::FNMSUB_RND:
56473 case ISD::FMA:
56474 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56475 case X86ISD::FMADDSUB_RND:
56476 case X86ISD::FMSUBADD_RND:
56477 case X86ISD::FMADDSUB:
56478 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56479 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56480 case X86ISD::MGATHER:
56481 case X86ISD::MSCATTER:
56482 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
56483 case ISD::MGATHER:
56484 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
56485 case X86ISD::PCMPEQ:
56486 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
56487 case X86ISD::PMULDQ:
56488 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
56489 case X86ISD::VPMADDUBSW:
56490 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
56491 case X86ISD::KSHIFTL:
56492 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
56493 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
56494 case ISD::STRICT_FP_EXTEND:
56495 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
56496 case ISD::STRICT_FP_ROUND:
56497 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
56498 case X86ISD::VBROADCAST_LOAD:
56499 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
56500 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
56501 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
56502 }
56503
56504 return SDValue();
56505}
56506
56507bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
56508 if (!isTypeLegal(VT))
56509 return false;
56510
56511 // There are no vXi8 shifts.
56512 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
56513 return false;
56514
56515 // TODO: Almost no 8-bit ops are desirable because they have no actual
56516 // size/speed advantages vs. 32-bit ops, but they do have a major
56517 // potential disadvantage by causing partial register stalls.
56518 //
56519 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
56520 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
56521 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
56522 // check for a constant operand to the multiply.
56523 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
56524 return false;
56525
56526 // i16 instruction encodings are longer and some i16 instructions are slow,
56527 // so those are not desirable.
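  // Illustrative encoding comparison:
  //   addw $1, %ax   -> 66 83 C0 01  (4 bytes, operand-size prefix)
  //   addl $1, %eax  -> 83 C0 01     (3 bytes)
  // so promoting the i16 op to i32 typically saves the 0x66 prefix.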
56528 if (VT == MVT::i16) {
56529 switch (Opc) {
56530 default:
56531 break;
56532 case ISD::LOAD:
56533 case ISD::SIGN_EXTEND:
56534 case ISD::ZERO_EXTEND:
56535 case ISD::ANY_EXTEND:
56536 case ISD::SHL:
56537 case ISD::SRA:
56538 case ISD::SRL:
56539 case ISD::SUB:
56540 case ISD::ADD:
56541 case ISD::MUL:
56542 case ISD::AND:
56543 case ISD::OR:
56544 case ISD::XOR:
56545 return false;
56546 }
56547 }
56548
56549 // Any legal type not explicitly accounted for above here is desirable.
56550 return true;
56551}
56552
56553SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
56554 SDValue Value, SDValue Addr,
56555 SelectionDAG &DAG) const {
56556 const Module *M = DAG.getMachineFunction().getMMI().getModule();
56557 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
56558 if (IsCFProtectionSupported) {
56559    // If control-flow branch protection is enabled, we need to add a
56560    // notrack prefix to the indirect branch. To do that we create an
56561    // NT_BRIND SDNode; during ISel, the pattern converts it to a jmp
56562    // with the NoTrack prefix.
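    // For example (illustrative), the jump-table dispatch is emitted as
    // "notrack jmpq *%rax" instead of "jmpq *%rax", so the target is not
    // required to start with an ENDBR instruction.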
56563 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
56564 }
56565
56566 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
56567}
56568
56569bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
56570 EVT VT = Op.getValueType();
56571 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
56572 isa<ConstantSDNode>(Op.getOperand(1));
56573
56574 // i16 is legal, but undesirable since i16 instruction encodings are longer
56575 // and some i16 instructions are slow.
56576 // 8-bit multiply-by-constant can usually be expanded to something cheaper
56577 // using LEA and/or other ALU ops.
56578 if (VT != MVT::i16 && !Is8BitMulByConstant)
56579 return false;
56580
56581 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
56582 if (!Op.hasOneUse())
56583 return false;
56584 SDNode *User = *Op->use_begin();
56585 if (!ISD::isNormalStore(User))
56586 return false;
56587 auto *Ld = cast<LoadSDNode>(Load);
56588 auto *St = cast<StoreSDNode>(User);
56589 return Ld->getBasePtr() == St->getBasePtr();
56590 };
56591
56592 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
56593 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
56594 return false;
56595 if (!Op.hasOneUse())
56596 return false;
56597 SDNode *User = *Op->use_begin();
56598 if (User->getOpcode() != ISD::ATOMIC_STORE)
56599 return false;
56600 auto *Ld = cast<AtomicSDNode>(Load);
56601 auto *St = cast<AtomicSDNode>(User);
56602 return Ld->getBasePtr() == St->getBasePtr();
56603 };
56604
56605 bool Commute = false;
56606 switch (Op.getOpcode()) {
56607 default: return false;
56608 case ISD::SIGN_EXTEND:
56609 case ISD::ZERO_EXTEND:
56610 case ISD::ANY_EXTEND:
56611 break;
56612 case ISD::SHL:
56613 case ISD::SRA:
56614 case ISD::SRL: {
56615 SDValue N0 = Op.getOperand(0);
56616 // Look out for (store (shl (load), x)).
56617 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
56618 return false;
56619 break;
56620 }
56621 case ISD::ADD:
56622 case ISD::MUL:
56623 case ISD::AND:
56624 case ISD::OR:
56625 case ISD::XOR:
56626 Commute = true;
56627 [[fallthrough]];
56628 case ISD::SUB: {
56629 SDValue N0 = Op.getOperand(0);
56630 SDValue N1 = Op.getOperand(1);
56631 // Avoid disabling potential load folding opportunities.
56632 if (X86::mayFoldLoad(N1, Subtarget) &&
56633 (!Commute || !isa<ConstantSDNode>(N0) ||
56634 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
56635 return false;
56636 if (X86::mayFoldLoad(N0, Subtarget) &&
56637 ((Commute && !isa<ConstantSDNode>(N1)) ||
56638 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
56639 return false;
56640 if (IsFoldableAtomicRMW(N0, Op) ||
56641 (Commute && IsFoldableAtomicRMW(N1, Op)))
56642 return false;
56643 }
56644 }
56645
56646 PVT = MVT::i32;
56647 return true;
56648}
56649
56650//===----------------------------------------------------------------------===//
56651// X86 Inline Assembly Support
56652//===----------------------------------------------------------------------===//
56653
56654// Helper to match a string separated by whitespace.
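// For example (illustrative): matchAsm("  bswap   $0", {"bswap", "$0"}) returns
// true, while matchAsm("bswapl $0", {"bswap", "$0"}) returns false because
// "bswap" only matches a prefix of the "bswapl" token.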
56655static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
56656 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
56657
56658 for (StringRef Piece : Pieces) {
56659 if (!S.startswith(Piece)) // Check if the piece matches.
56660 return false;
56661
56662 S = S.substr(Piece.size());
56663 StringRef::size_type Pos = S.find_first_not_of(" \t");
56664 if (Pos == 0) // We matched a prefix.
56665 return false;
56666
56667 S = S.substr(Pos);
56668 }
56669
56670 return S.empty();
56671}
56672
56673static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
56674
56675 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
56676 if (llvm::is_contained(AsmPieces, "~{cc}") &&
56677 llvm::is_contained(AsmPieces, "~{flags}") &&
56678 llvm::is_contained(AsmPieces, "~{fpsr}")) {
56679
56680 if (AsmPieces.size() == 3)
56681 return true;
56682 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
56683 return true;
56684 }
56685 }
56686 return false;
56687}
56688
56689bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
56690 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
56691
56692 const std::string &AsmStr = IA->getAsmString();
56693
56694 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
56695 if (!Ty || Ty->getBitWidth() % 16 != 0)
56696 return false;
56697
56698 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
56699 SmallVector<StringRef, 4> AsmPieces;
56700 SplitString(AsmStr, AsmPieces, ";\n");
56701
56702 switch (AsmPieces.size()) {
56703 default: return false;
56704 case 1:
56705 // FIXME: this should verify that we are targeting a 486 or better. If not,
56706 // we will turn this bswap into something that will be lowered to logical
56707 // ops instead of emitting the bswap asm. For now, we don't support 486 or
56708 // lower so don't worry about this.
56709 // bswap $0
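    // Illustrative C-level input that reaches this point:
    //   unsigned v; asm("bswap $0" : "=r"(v) : "0"(v));
    // which is replaced by a call to llvm.bswap.i32 on v.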
56710 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
56711 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
56712 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
56713 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
56714 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
56715 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
56716 // No need to check constraints, nothing other than the equivalent of
56717 // "=r,0" would be valid here.
56718 return IntrinsicLowering::LowerToByteSwap(CI);
56719 }
56720
56721 // rorw $$8, ${0:w} --> llvm.bswap.i16
56722 if (CI->getType()->isIntegerTy(16) &&
56723 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56724 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
56725 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
56726 AsmPieces.clear();
56727 StringRef ConstraintsStr = IA->getConstraintString();
56728 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56729 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56730 if (clobbersFlagRegisters(AsmPieces))
56731 return IntrinsicLowering::LowerToByteSwap(CI);
56732 }
56733 break;
56734 case 3:
56735 if (CI->getType()->isIntegerTy(32) &&
56736 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56737 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
56738 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
56739 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
56740 AsmPieces.clear();
56741 StringRef ConstraintsStr = IA->getConstraintString();
56742 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56743 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56744 if (clobbersFlagRegisters(AsmPieces))
56745 return IntrinsicLowering::LowerToByteSwap(CI);
56746 }
56747
56748 if (CI->getType()->isIntegerTy(64)) {
56749 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
56750 if (Constraints.size() >= 2 &&
56751 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
56752 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
56753 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
56754 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
56755 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
56756 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
56757 return IntrinsicLowering::LowerToByteSwap(CI);
56758 }
56759 }
56760 break;
56761 }
56762 return false;
56763}
56764
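// Maps GCC flag-output constraints to X86 condition codes. Illustrative usage:
//   int below; asm("cmp %2, %1" : "=@ccb"(below) : "r"(a), "r"(b));
// the "=@ccb" output constraint arrives here as "{@ccb}" and maps to
// X86::COND_B.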
56765static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
56766 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
56767 .Case("{@cca}", X86::COND_A)
56768 .Case("{@ccae}", X86::COND_AE)
56769 .Case("{@ccb}", X86::COND_B)
56770 .Case("{@ccbe}", X86::COND_BE)
56771 .Case("{@ccc}", X86::COND_B)
56772 .Case("{@cce}", X86::COND_E)
56773 .Case("{@ccz}", X86::COND_E)
56774 .Case("{@ccg}", X86::COND_G)
56775 .Case("{@ccge}", X86::COND_GE)
56776 .Case("{@ccl}", X86::COND_L)
56777 .Case("{@ccle}", X86::COND_LE)
56778 .Case("{@ccna}", X86::COND_BE)
56779 .Case("{@ccnae}", X86::COND_B)
56780 .Case("{@ccnb}", X86::COND_AE)
56781 .Case("{@ccnbe}", X86::COND_A)
56782 .Case("{@ccnc}", X86::COND_AE)
56783 .Case("{@ccne}", X86::COND_NE)
56784 .Case("{@ccnz}", X86::COND_NE)
56785 .Case("{@ccng}", X86::COND_LE)
56786 .Case("{@ccnge}", X86::COND_L)
56787 .Case("{@ccnl}", X86::COND_GE)
56788 .Case("{@ccnle}", X86::COND_G)
56789 .Case("{@ccno}", X86::COND_NO)
56790 .Case("{@ccnp}", X86::COND_NP)
56791 .Case("{@ccns}", X86::COND_NS)
56792 .Case("{@cco}", X86::COND_O)
56793 .Case("{@ccp}", X86::COND_P)
56794 .Case("{@ccs}", X86::COND_S)
56795 .Default(X86::COND_INVALID);
56796 return Cond;
56797}
56798
56799/// Given a constraint letter, return the type of constraint for this target.
56800X86TargetLowering::ConstraintType
56801X86TargetLowering::getConstraintType(StringRef Constraint) const {
56802 if (Constraint.size() == 1) {
56803 switch (Constraint[0]) {
56804 case 'R':
56805 case 'q':
56806 case 'Q':
56807 case 'f':
56808 case 't':
56809 case 'u':
56810 case 'y':
56811 case 'x':
56812 case 'v':
56813 case 'l':
56814 case 'k': // AVX512 masking registers.
56815 return C_RegisterClass;
56816 case 'a':
56817 case 'b':
56818 case 'c':
56819 case 'd':
56820 case 'S':
56821 case 'D':
56822 case 'A':
56823 return C_Register;
56824 case 'I':
56825 case 'J':
56826 case 'K':
56827 case 'N':
56828 case 'G':
56829 case 'L':
56830 case 'M':
56831 return C_Immediate;
56832 case 'C':
56833 case 'e':
56834 case 'Z':
56835 return C_Other;
56836 default:
56837 break;
56838 }
56839 }
56840 else if (Constraint.size() == 2) {
56841 switch (Constraint[0]) {
56842 default:
56843 break;
56844 case 'Y':
56845 switch (Constraint[1]) {
56846 default:
56847 break;
56848 case 'z':
56849 return C_Register;
56850 case 'i':
56851 case 'm':
56852 case 'k':
56853 case 't':
56854 case '2':
56855 return C_RegisterClass;
56856 }
56857 }
56858 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
56859 return C_Other;
56860 return TargetLowering::getConstraintType(Constraint);
56861}
56862
56863/// Examine constraint type and operand type and determine a weight value.
56864/// This object must already have been set up with the operand type
56865/// and the current alternative constraint selected.
56866TargetLowering::ConstraintWeight
56867 X86TargetLowering::getSingleConstraintMatchWeight(
56868 AsmOperandInfo &info, const char *constraint) const {
56869 ConstraintWeight weight = CW_Invalid;
56870 Value *CallOperandVal = info.CallOperandVal;
56871 // If we don't have a value, we can't do a match,
56872 // but allow it at the lowest weight.
56873 if (!CallOperandVal)
56874 return CW_Default;
56875 Type *type = CallOperandVal->getType();
56876 // Look at the constraint type.
56877 switch (*constraint) {
56878 default:
56879 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
56880 [[fallthrough]];
56881 case 'R':
56882 case 'q':
56883 case 'Q':
56884 case 'a':
56885 case 'b':
56886 case 'c':
56887 case 'd':
56888 case 'S':
56889 case 'D':
56890 case 'A':
56891 if (CallOperandVal->getType()->isIntegerTy())
56892 weight = CW_SpecificReg;
56893 break;
56894 case 'f':
56895 case 't':
56896 case 'u':
56897 if (type->isFloatingPointTy())
56898 weight = CW_SpecificReg;
56899 break;
56900 case 'y':
56901 if (type->isX86_MMXTy() && Subtarget.hasMMX())
56902 weight = CW_SpecificReg;
56903 break;
56904 case 'Y':
56905 if (StringRef(constraint).size() != 2)
56906 break;
56907 switch (constraint[1]) {
56908 default:
56909 return CW_Invalid;
56910 // XMM0
56911 case 'z':
56912 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56913 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
56914 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
56915 return CW_SpecificReg;
56916 return CW_Invalid;
56917 // Conditional OpMask regs (AVX512)
56918 case 'k':
56919 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56920 return CW_Register;
56921 return CW_Invalid;
56922 // Any MMX reg
56923 case 'm':
56924 if (type->isX86_MMXTy() && Subtarget.hasMMX())
56925 return weight;
56926 return CW_Invalid;
56927 // Any SSE reg when ISA >= SSE2, same as 'x'
56928 case 'i':
56929 case 't':
56930 case '2':
56931 if (!Subtarget.hasSSE2())
56932 return CW_Invalid;
56933 break;
56934 }
56935 break;
56936 case 'v':
56937 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
56938 weight = CW_Register;
56939 [[fallthrough]];
56940 case 'x':
56941 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56942 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
56943 weight = CW_Register;
56944 break;
56945 case 'k':
56946 // Enable conditional vector operations using %k<#> registers.
56947 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56948 weight = CW_Register;
56949 break;
56950 case 'I':
56951 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
56952 if (C->getZExtValue() <= 31)
56953 weight = CW_Constant;
56954 }
56955 break;
56956 case 'J':
56957 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56958 if (C->getZExtValue() <= 63)
56959 weight = CW_Constant;
56960 }
56961 break;
56962 case 'K':
56963 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56964 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
56965 weight = CW_Constant;
56966 }
56967 break;
56968 case 'L':
56969 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56970 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
56971 weight = CW_Constant;
56972 }
56973 break;
56974 case 'M':
56975 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56976 if (C->getZExtValue() <= 3)
56977 weight = CW_Constant;
56978 }
56979 break;
56980 case 'N':
56981 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56982 if (C->getZExtValue() <= 0xff)
56983 weight = CW_Constant;
56984 }
56985 break;
56986 case 'G':
56987 case 'C':
56988 if (isa<ConstantFP>(CallOperandVal)) {
56989 weight = CW_Constant;
56990 }
56991 break;
56992 case 'e':
56993 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56994 if ((C->getSExtValue() >= -0x80000000LL) &&
56995 (C->getSExtValue() <= 0x7fffffffLL))
56996 weight = CW_Constant;
56997 }
56998 break;
56999 case 'Z':
57000 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57001 if (C->getZExtValue() <= 0xffffffff)
57002 weight = CW_Constant;
57003 }
57004 break;
57005 }
57006 return weight;
57007}
57008
57009/// Try to replace an X constraint, which matches anything, with another that
57010/// has more specific requirements based on the type of the corresponding
57011/// operand.
57012const char *X86TargetLowering::
57013LowerXConstraint(EVT ConstraintVT) const {
57014 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57015 // 'f' like normal targets.
57016 if (ConstraintVT.isFloatingPoint()) {
57017 if (Subtarget.hasSSE1())
57018 return "x";
57019 }
57020
57021 return TargetLowering::LowerXConstraint(ConstraintVT);
57022}
57023
57024// Lower @cc targets via setcc.
57025SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57026 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
57027 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57028 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57029 if (Cond == X86::COND_INVALID)
57030 return SDValue();
57031 // Check that return type is valid.
57032 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57033 OpInfo.ConstraintVT.getSizeInBits() < 8)
57034 report_fatal_error("Flag output operand is of invalid type");
57035
57036 // Get EFLAGS register. Only update chain when copyfrom is glued.
57037 if (Flag.getNode()) {
57038 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
57039 Chain = Flag.getValue(1);
57040 } else
57041 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57042 // Extract CC code.
57043 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
57044 // Extend to 32-bits
57045 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57046
57047 return Result;
57048}
57049
57050/// Lower the specified operand into the Ops vector.
57051/// If it is invalid, don't add anything to Ops.
57052void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57053 std::string &Constraint,
57054 std::vector<SDValue>&Ops,
57055 SelectionDAG &DAG) const {
57056 SDValue Result;
57057
57058 // Only support length 1 constraints for now.
57059 if (Constraint.length() > 1) return;
57060
57061 char ConstraintLetter = Constraint[0];
57062 switch (ConstraintLetter) {
57063 default: break;
57064 case 'I':
57065 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57066 if (C->getZExtValue() <= 31) {
57067 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57068 Op.getValueType());
57069 break;
57070 }
57071 }
57072 return;
57073 case 'J':
57074 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57075 if (C->getZExtValue() <= 63) {
57076 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57077 Op.getValueType());
57078 break;
57079 }
57080 }
57081 return;
57082 case 'K':
57083 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57084 if (isInt<8>(C->getSExtValue())) {
57085 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57086 Op.getValueType());
57087 break;
57088 }
57089 }
57090 return;
57091 case 'L':
57092 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57093 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57094 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57095 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57096 Op.getValueType());
57097 break;
57098 }
57099 }
57100 return;
57101 case 'M':
57102 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57103 if (C->getZExtValue() <= 3) {
57104 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57105 Op.getValueType());
57106 break;
57107 }
57108 }
57109 return;
57110 case 'N':
57111 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57112 if (C->getZExtValue() <= 255) {
57113 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57114 Op.getValueType());
57115 break;
57116 }
57117 }
57118 return;
57119 case 'O':
57120 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57121 if (C->getZExtValue() <= 127) {
57122 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57123 Op.getValueType());
57124 break;
57125 }
57126 }
57127 return;
57128 case 'e': {
57129 // 32-bit signed value
57130 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57131 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57132 C->getSExtValue())) {
57133 // Widen to 64 bits here to get it sign extended.
57134 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57135 break;
57136 }
57137 // FIXME gcc accepts some relocatable values here too, but only in certain
57138 // memory models; it's complicated.
57139 }
57140 return;
57141 }
57142 case 'Z': {
57143 // 32-bit unsigned value
57144 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57145 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57146 C->getZExtValue())) {
57147 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57148 Op.getValueType());
57149 break;
57150 }
57151 }
57152 // FIXME gcc accepts some relocatable values here too, but only in certain
57153 // memory models; it's complicated.
57154 return;
57155 }
57156 case 'i': {
57157 // Literal immediates are always ok.
57158 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57159 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57160 BooleanContent BCont = getBooleanContents(MVT::i64);
57161 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57162 : ISD::SIGN_EXTEND;
57163 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57164 : CST->getSExtValue();
57165 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57166 break;
57167 }
57168
57169    // In any sort of PIC mode, addresses need to be computed at runtime by
57170 // adding in a register or some sort of table lookup. These can't
57171 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57172 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57173 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57174 return;
57175
57176 // If we are in non-pic codegen mode, we allow the address of a global (with
57177 // an optional displacement) to be used with 'i'.
57178 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57179 // If we require an extra load to get this address, as in PIC mode, we
57180 // can't accept it.
57181 if (isGlobalStubReference(
57182 Subtarget.classifyGlobalReference(GA->getGlobal())))
57183 return;
57184 break;
57185 }
57186 }
57187
57188 if (Result.getNode()) {
57189 Ops.push_back(Result);
57190 return;
57191 }
57192 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57193}
57194
57195/// Check if \p RC is a general purpose register class.
57196/// I.e., GR* or one of their variant.
57197static bool isGRClass(const TargetRegisterClass &RC) {
57198 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57199 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57200 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57201 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57202 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57203}
57204
57205/// Check if \p RC is a vector register class.
57206/// I.e., FR* / VR* or one of their variant.
57207static bool isFRClass(const TargetRegisterClass &RC) {
57208 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57209 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57210 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57211 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57212 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57213 RC.hasSuperClassEq(&X86::VR512RegClass);
57214}
57215
57216/// Check if \p RC is a mask register class.
57217/// I.e., VK* or one of their variant.
57218static bool isVKClass(const TargetRegisterClass &RC) {
57219 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57220 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57221 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57222 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57223 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57224 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57225 RC.hasSuperClassEq(&X86::VK64RegClass);
57226}
57227
57228std::pair<unsigned, const TargetRegisterClass *>
57229X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57230 StringRef Constraint,
57231 MVT VT) const {
57232 // First, see if this is a constraint that directly corresponds to an LLVM
57233 // register class.
57234 if (Constraint.size() == 1) {
57235 // GCC Constraint Letters
57236 switch (Constraint[0]) {
57237 default: break;
57238 // 'A' means [ER]AX + [ER]DX.
57239 case 'A':
57240 if (Subtarget.is64Bit())
57241 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57242      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57243             "Expecting 64, 32 or 16 bit subtarget");
57244 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57245
57246 // TODO: Slight differences here in allocation order and leaving
57247 // RIP in the class. Do they matter any more here than they do
57248 // in the normal allocation?
57249 case 'k':
57250 if (Subtarget.hasAVX512()) {
57251 if (VT == MVT::i1)
57252 return std::make_pair(0U, &X86::VK1RegClass);
57253 if (VT == MVT::i8)
57254 return std::make_pair(0U, &X86::VK8RegClass);
57255 if (VT == MVT::i16)
57256 return std::make_pair(0U, &X86::VK16RegClass);
57257 }
57258 if (Subtarget.hasBWI()) {
57259 if (VT == MVT::i32)
57260 return std::make_pair(0U, &X86::VK32RegClass);
57261 if (VT == MVT::i64)
57262 return std::make_pair(0U, &X86::VK64RegClass);
57263 }
57264 break;
57265 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57266 if (Subtarget.is64Bit()) {
57267 if (VT == MVT::i8 || VT == MVT::i1)
57268 return std::make_pair(0U, &X86::GR8RegClass);
57269 if (VT == MVT::i16)
57270 return std::make_pair(0U, &X86::GR16RegClass);
57271 if (VT == MVT::i32 || VT == MVT::f32)
57272 return std::make_pair(0U, &X86::GR32RegClass);
57273 if (VT != MVT::f80 && !VT.isVector())
57274 return std::make_pair(0U, &X86::GR64RegClass);
57275 break;
57276 }
57277 [[fallthrough]];
57278 // 32-bit fallthrough
57279 case 'Q': // Q_REGS
57280 if (VT == MVT::i8 || VT == MVT::i1)
57281 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
57282 if (VT == MVT::i16)
57283 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
57284 if (VT == MVT::i32 || VT == MVT::f32 ||
57285 (!VT.isVector() && !Subtarget.is64Bit()))
57286 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
57287 if (VT != MVT::f80 && !VT.isVector())
57288 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
57289 break;
57290 case 'r': // GENERAL_REGS
57291 case 'l': // INDEX_REGS
57292 if (VT == MVT::i8 || VT == MVT::i1)
57293 return std::make_pair(0U, &X86::GR8RegClass);
57294 if (VT == MVT::i16)
57295 return std::make_pair(0U, &X86::GR16RegClass);
57296 if (VT == MVT::i32 || VT == MVT::f32 ||
57297 (!VT.isVector() && !Subtarget.is64Bit()))
57298 return std::make_pair(0U, &X86::GR32RegClass);
57299 if (VT != MVT::f80 && !VT.isVector())
57300 return std::make_pair(0U, &X86::GR64RegClass);
57301 break;
57302 case 'R': // LEGACY_REGS
57303 if (VT == MVT::i8 || VT == MVT::i1)
57304 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
57305 if (VT == MVT::i16)
57306 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
57307 if (VT == MVT::i32 || VT == MVT::f32 ||
57308 (!VT.isVector() && !Subtarget.is64Bit()))
57309 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
57310 if (VT != MVT::f80 && !VT.isVector())
57311 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
57312 break;
57313 case 'f': // FP Stack registers.
57314 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
57315 // value to the correct fpstack register class.
57316 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
57317 return std::make_pair(0U, &X86::RFP32RegClass);
57318 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
57319 return std::make_pair(0U, &X86::RFP64RegClass);
57320 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
57321 return std::make_pair(0U, &X86::RFP80RegClass);
57322 break;
57323 case 'y': // MMX_REGS if MMX allowed.
57324 if (!Subtarget.hasMMX()) break;
57325 return std::make_pair(0U, &X86::VR64RegClass);
57326 case 'v':
57327 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
57328 if (!Subtarget.hasSSE1()) break;
57329 bool VConstraint = (Constraint[0] == 'v');
57330
57331 switch (VT.SimpleTy) {
57332 default: break;
57333 // Scalar SSE types.
57334 case MVT::f16:
57335 if (VConstraint && Subtarget.hasFP16())
57336 return std::make_pair(0U, &X86::FR16XRegClass);
57337 break;
57338 case MVT::f32:
57339 case MVT::i32:
57340 if (VConstraint && Subtarget.hasVLX())
57341 return std::make_pair(0U, &X86::FR32XRegClass);
57342 return std::make_pair(0U, &X86::FR32RegClass);
57343 case MVT::f64:
57344 case MVT::i64:
57345 if (VConstraint && Subtarget.hasVLX())
57346 return std::make_pair(0U, &X86::FR64XRegClass);
57347 return std::make_pair(0U, &X86::FR64RegClass);
57348 case MVT::i128:
57349 if (Subtarget.is64Bit()) {
57350 if (VConstraint && Subtarget.hasVLX())
57351 return std::make_pair(0U, &X86::VR128XRegClass);
57352 return std::make_pair(0U, &X86::VR128RegClass);
57353 }
57354 break;
57355 // Vector types and fp128.
57356 case MVT::v8f16:
57357 if (!Subtarget.hasFP16())
57358 break;
57359 [[fallthrough]];
57360 case MVT::f128:
57361 case MVT::v16i8:
57362 case MVT::v8i16:
57363 case MVT::v4i32:
57364 case MVT::v2i64:
57365 case MVT::v4f32:
57366 case MVT::v2f64:
57367 if (VConstraint && Subtarget.hasVLX())
57368 return std::make_pair(0U, &X86::VR128XRegClass);
57369 return std::make_pair(0U, &X86::VR128RegClass);
57370 // AVX types.
57371 case MVT::v16f16:
57372 if (!Subtarget.hasFP16())
57373 break;
57374 [[fallthrough]];
57375 case MVT::v32i8:
57376 case MVT::v16i16:
57377 case MVT::v8i32:
57378 case MVT::v4i64:
57379 case MVT::v8f32:
57380 case MVT::v4f64:
57381 if (VConstraint && Subtarget.hasVLX())
57382 return std::make_pair(0U, &X86::VR256XRegClass);
57383 if (Subtarget.hasAVX())
57384 return std::make_pair(0U, &X86::VR256RegClass);
57385 break;
57386 case MVT::v32f16:
57387 if (!Subtarget.hasFP16())
57388 break;
57389 [[fallthrough]];
57390 case MVT::v64i8:
57391 case MVT::v32i16:
57392 case MVT::v8f64:
57393 case MVT::v16f32:
57394 case MVT::v16i32:
57395 case MVT::v8i64:
57396 if (!Subtarget.hasAVX512()) break;
57397 if (VConstraint)
57398 return std::make_pair(0U, &X86::VR512RegClass);
57399 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57400 }
57401 break;
57402 }
57403 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
57404 switch (Constraint[1]) {
57405 default:
57406 break;
57407 case 'i':
57408 case 't':
57409 case '2':
57410 return getRegForInlineAsmConstraint(TRI, "x", VT);
57411 case 'm':
57412 if (!Subtarget.hasMMX()) break;
57413 return std::make_pair(0U, &X86::VR64RegClass);
57414 case 'z':
57415 if (!Subtarget.hasSSE1()) break;
57416 switch (VT.SimpleTy) {
57417 default: break;
57418 // Scalar SSE types.
57419 case MVT::f16:
57420 if (!Subtarget.hasFP16())
57421 break;
57422 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
57423 case MVT::f32:
57424 case MVT::i32:
57425 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
57426 case MVT::f64:
57427 case MVT::i64:
57428 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
57429 case MVT::v8f16:
57430 if (!Subtarget.hasFP16())
57431 break;
57432 [[fallthrough]];
57433 case MVT::f128:
57434 case MVT::v16i8:
57435 case MVT::v8i16:
57436 case MVT::v4i32:
57437 case MVT::v2i64:
57438 case MVT::v4f32:
57439 case MVT::v2f64:
57440 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57441 // AVX types.
57442 case MVT::v16f16:
57443 if (!Subtarget.hasFP16())
57444 break;
57445 [[fallthrough]];
57446 case MVT::v32i8:
57447 case MVT::v16i16:
57448 case MVT::v8i32:
57449 case MVT::v4i64:
57450 case MVT::v8f32:
57451 case MVT::v4f64:
57452 if (Subtarget.hasAVX())
57453 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57454 break;
57455 case MVT::v32f16:
57456 if (!Subtarget.hasFP16())
57457 break;
57458 [[fallthrough]];
57459 case MVT::v64i8:
57460 case MVT::v32i16:
57461 case MVT::v8f64:
57462 case MVT::v16f32:
57463 case MVT::v16i32:
57464 case MVT::v8i64:
57465 if (Subtarget.hasAVX512())
57466 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57467 break;
57468 }
57469 break;
57470 case 'k':
57471      // This register class doesn't allocate k0 for masked vector operations.
57472 if (Subtarget.hasAVX512()) {
57473 if (VT == MVT::i1)
57474 return std::make_pair(0U, &X86::VK1WMRegClass);
57475 if (VT == MVT::i8)
57476 return std::make_pair(0U, &X86::VK8WMRegClass);
57477 if (VT == MVT::i16)
57478 return std::make_pair(0U, &X86::VK16WMRegClass);
57479 }
57480 if (Subtarget.hasBWI()) {
57481 if (VT == MVT::i32)
57482 return std::make_pair(0U, &X86::VK32WMRegClass);
57483 if (VT == MVT::i64)
57484 return std::make_pair(0U, &X86::VK64WMRegClass);
57485 }
57486 break;
57487 }
57488 }
57489
57490 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57491 return std::make_pair(0U, &X86::GR32RegClass);
57492
57493 // Use the default implementation in TargetLowering to convert the register
57494 // constraint into a member of a register class.
57495 std::pair<Register, const TargetRegisterClass*> Res;
57496 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
57497
57498 // Not found as a standard register?
57499 if (!Res.second) {
57500 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
57501 // to/from f80.
57502 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
57503      // Map st(0) .. st(7) to the corresponding FP0 .. FP7 registers.
57504 if (Constraint.size() == 7 && Constraint[0] == '{' &&
57505 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
57506 Constraint[3] == '(' &&
57507 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
57508 Constraint[5] == ')' && Constraint[6] == '}') {
57509 // st(7) is not allocatable and thus not a member of RFP80. Return
57510 // singleton class in cases where we have a reference to it.
57511 if (Constraint[4] == '7')
57512 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
57513 return std::make_pair(X86::FP0 + Constraint[4] - '0',
57514 &X86::RFP80RegClass);
57515 }
57516
57517 // GCC allows "st(0)" to be called just plain "st".
57518 if (StringRef("{st}").equals_insensitive(Constraint))
57519 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
57520 }
57521
57522 // flags -> EFLAGS
57523 if (StringRef("{flags}").equals_insensitive(Constraint))
57524 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
57525
57526 // dirflag -> DF
57527 // Only allow for clobber.
57528 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
57529 VT == MVT::Other)
57530 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
57531
57532 // fpsr -> FPSW
57533 if (StringRef("{fpsr}").equals_insensitive(Constraint))
57534 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
57535
57536 return Res;
57537 }
57538
57539 // Make sure it isn't a register that requires 64-bit mode.
57540 if (!Subtarget.is64Bit() &&
57541 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
57542 TRI->getEncodingValue(Res.first) >= 8) {
57543 // Register requires REX prefix, but we're in 32-bit mode.
57544 return std::make_pair(0, nullptr);
57545 }
57546
57547 // Make sure it isn't a register that requires AVX512.
57548 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
57549 TRI->getEncodingValue(Res.first) & 0x10) {
57550 // Register requires EVEX prefix.
57551 return std::make_pair(0, nullptr);
57552 }
57553
57554 // Otherwise, check to see if this is a register class of the wrong value
57555 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
57556 // turn into {ax},{dx}.
57557 // MVT::Other is used to specify clobber names.
57558 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
57559 return Res; // Correct type already, nothing to do.
57560
57561  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
57562  // return "eax". This should even work for things like getting 64-bit integer
57563  // registers when given an f64 type.
57564 const TargetRegisterClass *Class = Res.second;
57565 // The generic code will match the first register class that contains the
57566 // given register. Thus, based on the ordering of the tablegened file,
57567 // the "plain" GR classes might not come first.
57568 // Therefore, use a helper method.
57569 if (isGRClass(*Class)) {
57570 unsigned Size = VT.getSizeInBits();
57571 if (Size == 1) Size = 8;
57572 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
57573 if (DestReg > 0) {
57574 bool is64Bit = Subtarget.is64Bit();
57575 const TargetRegisterClass *RC =
57576 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
57577 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
57578 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
57579 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
57580 : nullptr;
57581 if (Size == 64 && !is64Bit) {
57582 // Model GCC's behavior here and select a fixed pair of 32-bit
57583 // registers.
57584 switch (DestReg) {
57585 case X86::RAX:
57586 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57587 case X86::RDX:
57588 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
57589 case X86::RCX:
57590 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
57591 case X86::RBX:
57592 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
57593 case X86::RSI:
57594 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
57595 case X86::RDI:
57596 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
57597 case X86::RBP:
57598 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
57599 default:
57600 return std::make_pair(0, nullptr);
57601 }
57602 }
57603 if (RC && RC->contains(DestReg))
57604 return std::make_pair(DestReg, RC);
57605 return Res;
57606 }
57607 // No register found/type mismatch.
57608 return std::make_pair(0, nullptr);
57609 } else if (isFRClass(*Class)) {
57610 // Handle references to XMM physical registers that got mapped into the
57611 // wrong class. This can happen with constraints like {xmm0} where the
57612 // target independent register mapper will just pick the first match it can
57613 // find, ignoring the required type.
57614
57615 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
57616 if (VT == MVT::f16)
57617 Res.second = &X86::FR16XRegClass;
57618 else if (VT == MVT::f32 || VT == MVT::i32)
57619 Res.second = &X86::FR32XRegClass;
57620 else if (VT == MVT::f64 || VT == MVT::i64)
57621 Res.second = &X86::FR64XRegClass;
57622 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
57623 Res.second = &X86::VR128XRegClass;
57624 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
57625 Res.second = &X86::VR256XRegClass;
57626 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
57627 Res.second = &X86::VR512RegClass;
57628 else {
57629 // Type mismatch and not a clobber: Return an error;
57630 Res.first = 0;
57631 Res.second = nullptr;
57632 }
57633 } else if (isVKClass(*Class)) {
57634 if (VT == MVT::i1)
57635 Res.second = &X86::VK1RegClass;
57636 else if (VT == MVT::i8)
57637 Res.second = &X86::VK8RegClass;
57638 else if (VT == MVT::i16)
57639 Res.second = &X86::VK16RegClass;
57640 else if (VT == MVT::i32)
57641 Res.second = &X86::VK32RegClass;
57642 else if (VT == MVT::i64)
57643 Res.second = &X86::VK64RegClass;
57644 else {
57645 // Type mismatch and not a clobber: Return an error;
57646 Res.first = 0;
57647 Res.second = nullptr;
57648 }
57649 }
57650
57651 return Res;
57652}
57653
57654bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
57655 // Integer division on x86 is expensive. However, when aggressively optimizing
57656 // for code size, we prefer to use a div instruction, as it is usually smaller
57657 // than the alternative sequence.
57658 // The exception to this is vector division. Since x86 doesn't have vector
57659 // integer division, leaving the division as-is is a loss even in terms of
57660 // size, because it will have to be scalarized, while the alternative code
57661 // sequence can be performed in vector form.
57662 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
57663 return OptSize && !VT.isVector();
57664}
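// --- Hedged illustration, not part of the file above ------------------------
// Source-level sketch of the policy above: when a function carries the minsize
// attribute (e.g. built at -Oz), scalar division stays a real div instruction
// because it is smaller, while a normally optimized function typically gets a
// multiply-by-magic-constant expansion instead. Both helper names are
// hypothetical.
__attribute__((minsize)) static unsigned div10Small(unsigned X) {
  return X / 10; // expected to keep the (small but slow) div instruction
}
static unsigned div10Fast(unsigned X) {
  return X / 10; // expected to be expanded into a multiply + shift
}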
57665
57666void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
57667 if (!Subtarget.is64Bit())
57668 return;
57669
57670 // Update IsSplitCSR in X86MachineFunctionInfo.
57671 X86MachineFunctionInfo *AFI =
57672 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
57673 AFI->setIsSplitCSR(true);
57674}
57675
57676void X86TargetLowering::insertCopiesSplitCSR(
57677 MachineBasicBlock *Entry,
57678 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
57679 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
57680 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
57681 if (!IStart)
57682 return;
57683
57684 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
57685 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
57686 MachineBasicBlock::iterator MBBI = Entry->begin();
57687 for (const MCPhysReg *I = IStart; *I; ++I) {
57688 const TargetRegisterClass *RC = nullptr;
57689 if (X86::GR64RegClass.contains(*I))
57690 RC = &X86::GR64RegClass;
57691 else
57692 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
57693
57694 Register NewVR = MRI->createVirtualRegister(RC);
57695 // Create copy from CSR to a virtual register.
57696 // FIXME: this currently does not emit CFI pseudo-instructions, it works
57697 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
57698 // nounwind. If we want to generalize this later, we may need to emit
57699 // CFI pseudo-instructions.
57700 assert(
57701 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
57702 "Function should be nounwind in insertCopiesSplitCSR!");
57703 Entry->addLiveIn(*I);
57704 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
57705 .addReg(*I);
57706
57707 // Insert the copy-back instructions right before the terminator.
57708 for (auto *Exit : Exits)
57709 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
57710 TII->get(TargetOpcode::COPY), *I)
57711 .addReg(NewVR);
57712 }
57713}
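// --- Hedged note, not part of the file above ---------------------------------
// The split-CSR hooks above are exercised by functions using the CXX_FAST_TLS
// calling convention, e.g. the access wrapper clang emits for a dynamically
// initialized thread_local on Darwin-style targets. A hypothetical trigger:
//
//   #include <string>
//   static std::string makeDefaultName() { return "worker"; } // hypothetical
//   thread_local std::string TLSName = makeDefaultName();
//
// For such a wrapper, getCalleeSavedRegsViaCopy() returns a non-null list and
// the entry/exit COPYs above carry the callee-saved GR64 registers through
// virtual registers instead of ordinary prologue/epilogue spills.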
57714
57715bool X86TargetLowering::supportSwiftError() const {
57716 return Subtarget.is64Bit();
57717}
57718
57719/// Returns true if stack probing through a function call is requested.
57720bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
57721 return !getStackProbeSymbolName(MF).empty();
57722}
57723
57724/// Returns true if stack probing through inline assembly is requested.
57725bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
57726
57727 // No inline stack probes for Windows; it has its own mechanism.
57728 if (Subtarget.isOSWindows() ||
57729 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57730 return false;
57731
57732 // If the function specifically requests inline stack probes, emit them.
57733 if (MF.getFunction().hasFnAttribute("probe-stack"))
57734 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
57735 "inline-asm";
57736
57737 return false;
57738}
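// --- Hedged example, not part of the file above ------------------------------
// Minimal sketch of how a front end or pass would request the behavior the two
// hooks above test for, using the same IR attribute strings they read. The
// helper name is hypothetical; the attribute names come from the code above.
static void requestInlineStackProbes(Function &F) {
  // On non-Windows targets this makes hasInlineStackProbe() return true, which
  // in turn makes getStackProbeSymbolName() return the empty string.
  F.addFnAttr("probe-stack", "inline-asm");
}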
57739
57740/// Returns the name of the symbol used to emit stack probes or the empty
57741/// string if not applicable.
57742StringRef
57743X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
57744 // Inline stack probes disable the stack probe call.
57745 if (hasInlineStackProbe(MF))
57746 return "";
57747
57748 // If the function specifically requests stack probes, emit them.
57749 if (MF.getFunction().hasFnAttribute("probe-stack"))
57750 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
57751
57752 // Generally, if we aren't on Windows, the platform ABI does not include
57753 // support for stack probes, so don't emit them.
57754 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
57755 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57756 return "";
57757
57758 // We need a stack probe to conform to the Windows ABI. Choose the right
57759 // symbol.
57760 if (Subtarget.is64Bit())
57761 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
57762 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
57763}
57764
57765unsigned
57766X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
57767 // The default stack probe size is 4096 bytes if the function has no
57768 // "stack-probe-size" attribute.
57769 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
57770 4096);
57771}
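// --- Hedged arithmetic sketch, not part of the file above --------------------
// Rough illustration only (not the backend's actual emission logic): with the
// default 4096-byte interval returned above, a frame allocating FrameSize bytes
// needs roughly one probe per started interval so each guard page gets touched
// in order. The helper name is hypothetical.
static uint64_t approxProbeCount(uint64_t FrameSize, uint64_t ProbeSize = 4096) {
  return ProbeSize ? (FrameSize + ProbeSize - 1) / ProbeSize : 0; // ceil division
}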
57772
57773Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
57774 if (ML->isInnermost() &&
57775 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
57776 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
57777 return TargetLowering::getPrefLoopAlignment();
57778}